In [180]:
# Análisis
import numpy as np
import pandas as pd
import seaborn as sns
import joblib

# Modelos
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Métricas
from sklearn.metrics import accuracy_score, roc_auc_score

# Pipelines
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector

# Herramientas de preprocesamiento
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [181]:
# Load the dataset through seaborn's dataset loader.
dataset_name = 'diamonds'
df = sns.load_dataset(dataset_name)

In [182]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [183]:
# Treat zeros in the x / y / z columns as missing values.
# The replacement is limited to those columns so that legitimate zeros in any
# other column (for example an integer-encoded label added by a later cell)
# are not blanked out if this cell is re-run.
dimension_cols = ['x', 'y', 'z']
df = df.replace({col: 0 for col in dimension_cols}, np.nan)

# Missing-value count per column.
df.isnull().sum()

carat       0
cut         0
color       0
clarity     0
depth       0
table       0
price       0
x           8
y           7
z          20
dtype: int64

In [184]:
# Report how many rows are exact duplicates, then drop them.
duplicate_count = df.duplicated().sum()
print(duplicate_count)
df = df.drop_duplicates()

146


In [185]:
# Encode the target column `cut` as integers.
# The fitted encoder is kept in `cut_encoder` so predicted codes can be mapped
# back to cut names with `cut_encoder.inverse_transform(...)`.
# NOTE: LabelEncoder sorts its classes, so the codes follow alphabetical order
# (Fair=0, Good=1, Ideal=2, Premium=3, Very Good=4), not the order of the list
# passed to fit().
cut_encoder = LabelEncoder().fit(
    ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
)
df['cut_encoded'] = cut_encoder.transform(df['cut'])

# Feature groups consumed by the preprocessing column transformer.
categorical_cols = ['color', 'clarity']
numerical_cols = ['carat', 'depth', 'price','table', 'x', 'y', 'z']

In [186]:
# Features: every column except the raw and the encoded target.
target_related_cols = ['cut', 'cut_encoded']
X = df.drop(columns=target_related_cols)

# Target: the integer-encoded cut.
y = df['cut_encoded']

In [187]:
# Numeric features: fill missing values with the column mean, then scale.
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    RobustScaler(),
)
# Categorical features: one-hot encode.
categorical_pipeline = make_pipeline(OneHotEncoder())

# Route each group of columns through its own preprocessing pipeline.
column_transformer = make_column_transformer(
    (numerical_pipeline, numerical_cols),
    (categorical_pipeline, categorical_cols),
)

# Fixed seed so the fitted forest, the reported metrics and the persisted
# artefact are reproducible across Restart & Run All.
RANDOM_STATE = 42
pipeline = make_pipeline(
    column_transformer,
    RandomForestClassifier(max_depth=15, random_state=RANDOM_STATE),
)

pipeline.fit(X, y)

# NOTE: both metrics below are computed on the same data the model was fitted
# on, so they are in-sample scores, not an estimate of generalisation.
y_pred = pipeline.predict(X)
y_pred_prob = pipeline.predict_proba(X)
auc = roc_auc_score(y, y_pred_prob, multi_class='ovr')
print('Accuracy en train', accuracy_score(y, y_pred))
print('AUC en train:', auc)

Accuracy en train 0.8317656244190802
AUC en train: 0.9746648519163269


In [188]:
# Persist the fitted pipeline to disk; the call's return value is the cell output.
model_path = 'pipeline_clasificacion.joblib'
joblib.dump(pipeline, model_path)

['pipeline_clasificacion.joblib']