In [1]:
# Análisis
import numpy as np
import pandas as pd
import seaborn as sns
import joblib


# Modelos
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Entrenamiento y test
from sklearn.model_selection import train_test_split

# Métricas
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, make_scorer
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Pipelines
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector

# Herramientas de preprocesamiento
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Load the 'diamonds' example dataset through seaborn's dataset loader.
df = sns.load_dataset(name='diamonds')

In [3]:
# Treat every literal 0 in the frame as a missing value, then show how many
# missing values each column ends up with.
df = df.replace(to_replace=0, value=np.nan)
df.isna().sum()

carat       0
cut         0
color       0
clarity     0
depth       0
table       0
price       0
x           8
y           7
z          20
dtype: int64

In [4]:
# Report how many fully duplicated rows exist, then drop them.
# Reassigning (instead of inplace=True) avoids mutating the frame object in
# place, so the cell's effect is explicit in the name binding.
print(df.duplicated().sum())
df = df.drop_duplicates()

146


In [5]:
# Features: every column except the classification target 'cut'.
X = df.drop(columns=['cut'])
y = df['cut']

# Keep the fitted encoder instead of discarding it: label_encoder.classes_ is
# the only record of which integer stands for which cut label, and
# label_encoder.inverse_transform(...) is needed to decode model predictions.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Column groups consumed by the preprocessing transformer.
categorical_cols = ['color', 'clarity']
numerical_cols = ['carat', 'depth', 'price','table', 'x', 'y', 'z']

In [6]:
# Numeric features: fill missing values with the column mean, then scale with
# RobustScaler.
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    RobustScaler()
    )
# Categorical features: one-hot encode. handle_unknown='ignore' keeps the
# persisted pipeline from raising at predict time when it meets a category
# that was not present during fit.
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

column_transformer = make_column_transformer(
    (numerical_pipeline, numerical_cols),
    (categorical_pipeline, categorical_cols)
)

pipeline = make_pipeline(column_transformer, RandomForestClassifier(random_state=42))

# Held-out evaluation. Scoring only on the rows the model was fitted on says
# nothing about generalisation, so first fit on a stratified 80% split and
# report metrics on the remaining 20%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
pipeline.fit(X_train, y_train)
y_test_pred = pipeline.predict(X_test)
y_test_prob = pipeline.predict_proba(X_test)
print('Accuracy en test', accuracy_score(y_test, y_test_pred))
print('AUC en test:', roc_auc_score(y_test, y_test_prob, multi_class='ovr'))

# Final model: refit the same pipeline on all rows (fit() discards the state
# from the split fit above) and report the training-set metrics for reference.
pipeline.fit(X, y_encoded)
y_pred = pipeline.predict(X)
y_pred_prob = pipeline.predict_proba(X)
auc = roc_auc_score(y_encoded, y_pred_prob, multi_class='ovr')
print('Accuracy en train', accuracy_score(y_encoded, y_pred))
print('AUC en train:', auc)

Accuracy en train 0.9998884633974049
AUC en train: 0.9999999683375093


In [7]:
# Persist the fitted pipeline (preprocessing + classifier) to a joblib file in
# the current working directory.
joblib.dump(value=pipeline, filename='pipeline_clasificacion.joblib')

['pipeline_clasificacion.joblib']