# OUTROS TESTES REGRESSÃO E CLUSTERIZAÇÃO

| **Característica** | **Descrição** |
|---------------------|---------------|
| `x`                | Comprimento (mm) |
| `y`                | Largura (mm)     |
| `z`                | Profundidade (mm) |
| `carat`            | Peso em quilates do diamante |
| `cut`              | Qualidade do corte do diamante. Qualidade em ordem crescente: Fair (Regular), Good (Bom), Very Good (Muito Bom), Premium, Ideal |
| `color`            | Cor do diamante, sendo D a melhor e J a pior |
| `clarity`          | Quão visíveis são as inclusões no diamante (em ordem da melhor para a pior): FL = impecável, IF, I1, I2, I3 (nível 3 de inclusões) |
| `depth`            | Profundidade %: A altura de um diamante, medida da base (culet) até a mesa, dividida pelo diâmetro médio da cintura |
| `table`            | Tabela %: A largura da mesa do diamante expressa como uma porcentagem do seu diâmetro médio |
| `price`            | O preço do diamante |


### Análises para Modelagem de Clusterização com KMeans

In [41]:
# Importando as bibliotecas para todo o projeto
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt

In [9]:
diamonds = pd.read_csv('./data/diamonds.csv')
diamonds

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [10]:
diamonds_price = diamonds['price']
diamonds_price

0         326
1         326
2         327
3         334
4         335
         ... 
53935    2757
53936    2757
53937    2757
53938    2757
53939    2757
Name: price, Length: 53940, dtype: int64

In [11]:
df = diamonds.drop(['Unnamed: 0', 'price'], axis=1)
df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74


In [12]:

df = pd.concat([df, diamonds_price], axis = 1)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,334
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335


In [23]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,334
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335


### Regressão com Bagging, Boosting e XGBoost

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Criando o pipeline de pré-processamento
def preprocess_data(df, features):
    # Separando categóricas e numéricas
    categorical_cols = [col for col in features if df[col].dtype == 'object']
    numerical_cols = [col for col in features if df[col].dtype != 'object']
    
    # Definindo o pré-processador
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(drop='first'), categorical_cols)
        ]
    )
    
    return preprocessor

# Função principal para rodar os regressores
def run_regressors(df, y_column, features_list):
    results = {}
    
    # Separando o target
    y = df[y_column]
    
    # Definindo os modelos
    estimadores = {
        'Bagging': BaggingRegressor(n_estimators=100, random_state=42),
        'Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
        'XGBoost': XGBRegressor(n_estimators=100, use_label_encoder=False, random_state=42, verbosity=0)
    }
    
    for feature_set in features_list:
        # Pré-processando os dados
        preprocessor = preprocess_data(df, feature_set)
        X = df[feature_set]
        X = preprocessor.fit_transform(X)
        
        # Separando dados de treino e teste
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        for nome_estimador, modelo in estimadores.items():
            # Ajustando o modelo
            modelo.fit(X_train, y_train)
            
            # Fazendo previsões
            y_pred = modelo.predict(X_test)
            
            # Calculando métricas
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            
            # Salvando resultados
            results[(nome_estimador, tuple(feature_set))] = {
                'MSE': mse,
                'R2': r2
            }
    
    return results

# Rodando a análise
features_list = [
    ['depth', 'table'],                     # features_1
    ['carat', 'cut'],                       # features_2
    ['carat', 'cut', 'depth', 'table']      # features_3
]

resultados = run_regressors(df, 'price', features_list)

# Exibindo resultados
for (est, feat), metrics in resultados.items():
    print(f"Regressor: {est}, Features: {feat}")
    print(f"MSE: {metrics['MSE']:.2f}, R2: {metrics['R2']:.2f}")
    print("-" * 40)

Regressor: Bagging, Features: ('depth', 'table')
MSE: 15754394.27, R2: 0.01
----------------------------------------
Regressor: Boosting, Features: ('depth', 'table')
MSE: 15374827.02, R2: 0.03
----------------------------------------
Regressor: XGBoost, Features: ('depth', 'table')
MSE: 15521411.19, R2: 0.02
----------------------------------------
Regressor: Bagging, Features: ('carat', 'cut')
MSE: 1936437.00, R2: 0.88
----------------------------------------
Regressor: Boosting, Features: ('carat', 'cut')
MSE: 1899267.82, R2: 0.88
----------------------------------------
Regressor: XGBoost, Features: ('carat', 'cut')
MSE: 1889671.26, R2: 0.88
----------------------------------------
Regressor: Bagging, Features: ('carat', 'cut', 'depth', 'table')
MSE: 2095629.12, R2: 0.87
----------------------------------------
Regressor: Boosting, Features: ('carat', 'cut', 'depth', 'table')
MSE: 1823069.31, R2: 0.89
----------------------------------------
Regressor: XGBoost, Features: ('carat', 

### Clusterização KMeans e DBScan

In [37]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Função de pré-processamento
def preprocess_data(df, features):
    # Aplicar OneHotEncoding na coluna 'cut' (caso exista)
    if 'cut' in df.columns:
        df = pd.get_dummies(df, columns=['cut'], drop_first=True)  # One-hot encoding para 'cut'
    
    # Selecionar as features após o OneHotEncoding
    features_updated = [col for col in features if col in df.columns]
    
    # Normalização
    X = df[features_updated]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X_scaled, features_updated

# Função para KMeans
def kmeans_clusterization(X_scaled, features_updated, k_values=[3, 4, 5, 6]):
    # Armazenar resultados
    results = []
    
    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X_scaled)
        
        # Calcular inércia (soma dos erros quadrados)
        inertia = kmeans.inertia_
        
        # Calcular Silhouette Score
        sil_score = silhouette_score(X_scaled, kmeans.labels_)
        
        results.append({
            'k': k,
            'inertia': inertia,
            'silhouette_score': sil_score
        })
        
    return results

features_1 = ['depth', 'table']
features_2 = ['carat', 'cut', 'depth', 'table']
features_3 = ['carat', 'cut']

# Pré-processar os dados antes de passar para a clusterização
X_scaled_1, features_updated_1 = preprocess_data(df, features_1)
X_scaled_2, features_updated_2 = preprocess_data(df, features_2)
X_scaled_3, features_updated_3 = preprocess_data(df, features_3)

# Agora podemos passar os dados já preparados para a clusterização
kmeans_results_1 = kmeans_clusterization(X_scaled_1, features_updated_1)
kmeans_results_2 = kmeans_clusterization(X_scaled_2, features_updated_2)
kmeans_results_3 = kmeans_clusterization(X_scaled_3, features_updated_3)

# Exibir resultados
print(kmeans_results_1)
print(kmeans_results_2)
print(kmeans_results_3)


[{'k': 3, 'inertia': 47907.59039825515, 'silhouette_score': np.float64(0.3666912110723605)}, {'k': 4, 'inertia': 39348.87184568457, 'silhouette_score': np.float64(0.35367665522578745)}, {'k': 5, 'inertia': 33064.92155121439, 'silhouette_score': np.float64(0.35055620614198335)}, {'k': 6, 'inertia': 29506.796338050986, 'silhouette_score': np.float64(0.32301663247456486)}]
[{'k': 3, 'inertia': 87992.16120084863, 'silhouette_score': np.float64(0.31348202717724205)}, {'k': 4, 'inertia': 75822.65618019918, 'silhouette_score': np.float64(0.28649516678756815)}, {'k': 5, 'inertia': 67769.88817141292, 'silhouette_score': np.float64(0.2813039106126095)}, {'k': 6, 'inertia': 60201.4319064157, 'silhouette_score': np.float64(0.262361145874213)}]
[{'k': 3, 'inertia': 7260.965431236174, 'silhouette_score': np.float64(0.6493072821818987)}, {'k': 4, 'inertia': 5004.26415455817, 'silhouette_score': np.float64(0.6424038692976167)}, {'k': 5, 'inertia': 2490.8850795777807, 'silhouette_score': np.float64(0.6

In [40]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import pandas as pd
import numpy as np

# Função para DBSCAN
def dbscan_clusterization(X_scaled, features_updated, eps_values=np.linspace(0.1, 1.0, 10), minpts_values=[4, 8]):
    # Armazenar resultados
    results = []
    
    for eps in eps_values:
        for min_pts in minpts_values:
            dbscan = DBSCAN(eps=eps, min_samples=min_pts)
            dbscan.fit(X_scaled)
            
            # Verificar número de clusters encontrados
            n_clusters = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)
            noise = list(dbscan.labels_).count(-1)
            
            # Calcular o Silhouette Score (se possível)
            try:
                sil_score = silhouette_score(X_scaled, dbscan.labels_)
            except ValueError:
                sil_score = None  # Caso o Silhouette Score não possa ser calculado
            
            results.append({
                'eps': eps,
                'min_samples': min_pts,
                'n_clusters': n_clusters,
                'noise': noise,
                'silhouette_score': sil_score
            })
    
    return results

# Pré-processar os dados antes de passar para a clusterização
X_scaled_1, features_updated_1 = preprocess_data(df, features_1)
X_scaled_2, features_updated_2 = preprocess_data(df, features_2)
X_scaled_3, features_updated_3 = preprocess_data(df, features_3)

# Agora podemos passar os dados já preparados para a clusterização
dbscan_results_1 = dbscan_clusterization(X_scaled_1, features_updated_1)
dbscan_results_2 = dbscan_clusterization(X_scaled_2, features_updated_2)
dbscan_results_3 = dbscan_clusterization(X_scaled_3, features_updated_3)

# Exibir resultados
print(dbscan_results_1)
print(dbscan_results_2)
print(dbscan_results_3)


[{'eps': np.float64(0.1), 'min_samples': 4, 'n_clusters': 78, 'noise': 499, 'silhouette_score': np.float64(-0.2060046187918512)}, {'eps': np.float64(0.1), 'min_samples': 8, 'n_clusters': 43, 'noise': 911, 'silhouette_score': np.float64(0.05906627496929224)}, {'eps': np.float64(0.2), 'min_samples': 4, 'n_clusters': 32, 'noise': 256, 'silhouette_score': np.float64(0.16393194633919986)}, {'eps': np.float64(0.2), 'min_samples': 8, 'n_clusters': 20, 'noise': 473, 'silhouette_score': np.float64(0.26355704103734867)}, {'eps': np.float64(0.30000000000000004), 'min_samples': 4, 'n_clusters': 15, 'noise': 161, 'silhouette_score': np.float64(0.42977404971973693)}, {'eps': np.float64(0.30000000000000004), 'min_samples': 8, 'n_clusters': 4, 'noise': 269, 'silhouette_score': np.float64(0.550379708141306)}, {'eps': np.float64(0.4), 'min_samples': 4, 'n_clusters': 10, 'noise': 140, 'silhouette_score': np.float64(0.5238176698692874)}, {'eps': np.float64(0.4), 'min_samples': 8, 'n_clusters': 2, 'noise':