### Tratando os dados para regressão, clusterização e demais modelagens

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [24]:
diamonds = pd.read_csv('./data/diamonds.csv')
diamonds

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


### Acrescentando um nova feature 'carat range' que agrupa 'carat' em 10 faixas

In [25]:
# Definir os bins e labels e acrescenta a coluna 'carat range'
bins = [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 6.0]  # Adjust these ranges as needed
labels = ['0.0-0.5', '0.6-1.0', '1.1-1.5', '1.6-2.0', '2.1-2.5', '2.6-3.0', '3.1-3.5', '3.6-4.0', '4.1-4.5', 'Acima de 4.6']

# Categoriza os quilates nos bis definidos
diamonds['carat range'] = pd.cut(diamonds['carat'], bins=bins, labels=labels)

diamonds

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,carat range
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,0.0-0.5
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,0.0-0.5
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,0.0-0.5
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,0.0-0.5
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,0.0-0.5
...,...,...,...,...,...,...,...,...,...,...,...,...
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,0.6-1.0
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,0.6-1.0
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,0.6-1.0
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,0.6-1.0


In [26]:
diamonds = diamonds.drop('Unnamed: 0', axis=1)
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,carat range
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,0.0-0.5
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,0.0-0.5
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,0.0-0.5
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,0.0-0.5
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,0.0-0.5
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,0.6-1.0
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,0.6-1.0
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,0.6-1.0
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,0.6-1.0


In [27]:
# Definição das hierarquias das categorias
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_order = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
carat_order = ['0.0-0.5', '0.6-1.0', '1.1-1.5', '1.6-2.0', '2.1-2.5', '2.6-3.0', '3.1-3.5', '3.6-4.0', '4.1-4.5', 'Acima de 4.6']

# ColumnTransformer com OrdinalEncoder e StardardScaler
preprocessor = ColumnTransformer(
    transformers=[
        # OrdinalEncoder nas colunas categóricas com hierarquia
        ('cut', OrdinalEncoder(categories=[cut_order]), ['cut']),
        ('color', OrdinalEncoder(categories=[color_order]), ['color']),
        ('clarity', OrdinalEncoder(categories=[clarity_order]), ['clarity']),
        ('carat range', OrdinalEncoder(categories=[carat_order]), ['carat range']),
        
        # MinMaxScaler nas colunas numéricas
        ('scaler', StandardScaler(), ['carat', 'depth', 'table', 'price', 'x', 'y', 'z'])
    ]
)

# Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Transformar o dataset
diamonds_processed = pipeline.fit_transform(diamonds)

# Resultado volta como um DataFrame para facilitar a visualização
diamonds_processed_df = pd.DataFrame(diamonds_processed, columns=[
    'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z','carat range'
])

# Resultado do processamento
diamonds_processed_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,carat range
0,4.0,5.0,1.0,0.0,-1.198168,-0.174092,-1.099672,-0.904095,-1.587837,-1.536196,-1.571129
1,3.0,5.0,2.0,0.0,-1.240361,-1.360738,1.585529,-0.904095,-1.641325,-1.658774,-1.741175
2,1.0,5.0,4.0,0.0,-1.198168,-3.385019,3.375663,-0.903844,-1.498691,-1.457395,-1.741175
3,3.0,1.0,3.0,0.0,-1.071587,0.454133,0.242928,-0.902090,-1.364971,-1.317305,-1.287720
4,1.0,0.0,1.0,0.0,-1.029394,1.082358,0.242928,-0.901839,-1.240167,-1.212238,-1.117674
...,...,...,...,...,...,...,...,...,...,...,...
53935,4.0,6.0,2.0,1.0,-0.164427,-0.662711,-0.204605,-0.294731,0.016798,0.022304,-0.054888
53936,1.0,6.0,2.0,1.0,-0.164427,0.942753,-1.099672,-0.294731,-0.036690,0.013548,0.100988
53937,2.0,6.0,2.0,1.0,-0.206621,0.733344,1.137995,-0.294731,-0.063434,-0.047741,0.030135
53938,3.0,2.0,1.0,1.0,0.130927,-0.523105,0.242928,-0.294731,0.373383,0.337506,0.285204


In [28]:
# Salvar o DataFrame em um arquivo CSV
caminho_arquivo = "./data/cleaned.csv" 
diamonds_processed_df.to_csv(caminho_arquivo, index=False)
print(f"Arquivo salvo em: {caminho_arquivo}")

Arquivo salvo em: ./data/cleaned.csv
