## Librerías

In [18]:
# Tratamiento de datos
import pandas as pd
import numpy as np
# Visualización
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Modelos
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score, roc_curve, roc_auc_score

## Datos

In [2]:
# Download the penguins dataset as CSV; the gist URL includes a fixed revision hash
url = 'https://gist.githubusercontent.com/slopp/ce3b90b9168f2f921784de84fa445651/raw/4ecf3041f0ed4913e7c230758733948bc561f434/penguins.csv'
data = pd.read_csv(url)

## Primera Exploración

In [3]:
# Work on a copy so the raw download in `data` is not modified by later cells
df = data.copy()

In [None]:
# Preview the first rows
df.head()

## MiniEDA

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Show any rows that are exact duplicates of an earlier row
df.loc[df.duplicated()]

In [None]:
# Missing values per column
df.isna().sum()

In [None]:
# Summary statistics for every column, transposed so each column is one row
df.describe(include='all').T

In [None]:
# Histogram of each numeric column
df.hist(bins=50, figsize=(12,8))
plt.show()

In [None]:
# Pairwise scatter plots of the four body measurements, coloured by species
numeral = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
    'species',
]
sns.pairplot(data=df[numeral], hue='species')

In [None]:
# Correlation heatmap. The three categorical columns are mapped to integer codes
# first so they can take part in the correlation matrix; values missing from a
# mapping become NaN and are ignored by .corr().
# NOTE(review): the island key is spelled 'Torgersen' here (the original had
# 'Torgensen'); confirm against df['island'].unique().
df_codificado = df.assign(
    species=df['species'].map({'Adelie': 0, 'Gentoo': 1, 'Chinstrap': 2}),
    island=df['island'].map({'Torgersen': 0, 'Biscoe': 1, 'Dream': 2}),
    sex=df['sex'].map({'male': 0, 'female': 1}),
)

plt.figure(figsize=(10, 10))
# .corr() has to be *called*: passing the bound method gives heatmap no matrix to draw
sns.heatmap(df_codificado.corr())

In [None]:
# Number of rows per species (class balance of the target)
sns.countplot(x='species', data=df)

In [None]:
# Species counts broken down by island
sns.countplot(x='island', hue='species', data=df)

In [None]:
# Share of the dataset taken by each species
df['species'].value_counts().div(len(df))

In [None]:
# Dtypes and non-null counts again, as the reference point for the imputation that follows
df.info()

## Tratamiento de datos

In [4]:
# Column means of the four numeric measurements (used by the next cell to fill their gaps).
# NOTE(review): these means are computed on the full frame, before the train/test split.
cols_medidas = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
mean = df[cols_medidas].mean()

In [5]:
# Fill missing numeric values with the per-column means (matched by column name)
df = df.fillna(mean)

In [6]:
# Forward-fill missing `sex` values from the previous row.
# Assign the result back to the column: calling ffill(inplace=True) on the
# df['sex'] selection is a chained in-place update, which pandas deprecates and
# which does not reach `df` at all under Copy-on-Write.
df['sex'] = df['sex'].ffill()

In [None]:
# Disabled: `rowid` is removed later, after the train/test split.
# df.drop(columns='rowid', inplace=True)

In [None]:
# Disabled: `year` is kept as a feature (the "Escalado" plan lists it as "nada").
# df.drop(columns='year', inplace=True)

In [None]:
# Display the frame after imputation
df

In [None]:
# Re-count missing values per column after imputation
df.isna().sum()

## División de datos

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='species'),
    df['species'],
    test_size=0.2,
    random_state=23,
)

## Escalado

- species --> mapeo
- island --> LabelEncoder
- sex --> OHE
- bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g --> MinMaxScaler
- rowid --> eliminar
- year --> nada

In [8]:
# Remove the row identifier from BOTH splits. Dropping it from X_train only
# leaves X_test with an extra column, so a model fitted on the training
# features could not be applied to the test features.
# Rebinding (instead of inplace=True) avoids mutating the frames returned by
# train_test_split in place.
X_train = X_train.drop(columns='rowid')
X_test = X_test.drop(columns='rowid')

In [9]:
# Encode the species labels as integers in both target vectors
mapeo = dict(Adelie=0, Gentoo=1, Chinstrap=2)
y_train, y_test = (serie.map(mapeo) for serie in (y_train, y_test))

In [10]:
# Integer-encode `island`; the encoder learns its classes from the training split only
# and the same fitted mapping is then applied to both splits.
encoder = LabelEncoder().fit(X_train['island'])
X_train['island'] = encoder.transform(X_train['island'])
X_test['island'] = encoder.transform(X_test['island'])

In [13]:
# One-hot encode `sex`. The encoder is fitted on the training split only and the
# fitted encoder is reused for the test split; categories unseen during fit are
# encoded as all zeros (handle_unknown='ignore').
#
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# later removed, so neither spelling works on every version. Keeping the default
# (sparse) output and densifying with .toarray() works on all of them.
ohe = OneHotEncoder(handle_unknown='ignore')

ohe.fit(X_train[['sex']])

# Training split: build the dummy columns on the original index so concat aligns row by row
transformed_X_train = ohe.transform(X_train[['sex']]).toarray()
transformed_df = pd.DataFrame(transformed_X_train, columns=ohe.get_feature_names_out(['sex']), index=X_train.index)
X_train_sca_ohe = pd.concat([X_train, transformed_df], axis=1).drop(columns='sex')

# Test split: same steps, reusing the encoder fitted above
transformed_X_val = ohe.transform(X_test[['sex']]).toarray()
transformed_df = pd.DataFrame(transformed_X_val, columns=ohe.get_feature_names_out(['sex']), index=X_test.index)
X_test_sca_ohe = pd.concat([X_test, transformed_df], axis=1).drop(columns='sex')

In [14]:
# Rescale the four body measurements; the scaler learns its min/max from the
# training split only and is then applied to both splits.
col_minmax = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

minmax = MinMaxScaler().fit(X_train_sca_ohe[col_minmax])

# Transform both splits with the fitted scaler
X_train_sca_minmax = minmax.transform(X_train_sca_ohe[col_minmax])
X_test_sca_minmax = minmax.transform(X_test_sca_ohe[col_minmax])

# Write the scaled values back over the original columns
X_train_sca_ohe[col_minmax] = X_train_sca_minmax
X_test_sca_ohe[col_minmax] = X_test_sca_minmax

In [17]:
# Inspect the encoded and scaled training features
X_train_sca_ohe

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year,sex_female,sex_male
95,1,0.316364,0.690476,0.610169,0.444444,2008,0.0,1.0
44,1,0.178182,0.452381,0.220339,0.083333,2007,1.0,0.0
119,2,0.327273,0.654762,0.288136,0.173611,2009,0.0,1.0
245,0,0.632727,0.357143,0.881356,0.819444,2009,0.0,1.0
266,0,0.512727,0.119048,0.762712,0.465278,2009,1.0,0.0
...,...,...,...,...,...,...,...,...
237,0,0.680000,0.500000,0.949153,0.805556,2009,0.0,1.0
31,1,0.185455,0.595238,0.101695,0.333333,2007,0.0,1.0
40,1,0.160000,0.583333,0.169492,0.125000,2007,1.0,0.0
230,0,0.476364,0.083333,0.728814,0.569444,2008,1.0,0.0


## BaseLines y Selección de Modelo

In [None]:
# Baseline comparison: cross-validate each candidate on the training split only
# and collect the mean score per metric.
modelos = {
    # `year` is left unscaled, so the default 100 solver iterations may not be enough
    'LogisticRegression': LogisticRegression(max_iter=1000),
    # 'roc_auc_ovr' scores class probabilities, which SVC only exposes with
    # probability=True; the seed keeps its internal probability calibration reproducible
    'SVC': SVC(probability=True, random_state=23)
}

metricas = ['accuracy', 'f1_macro', 'recall_macro', 'precision_macro', 'roc_auc_ovr']

resultados_dict = {}

# Loop variables get their own names: reusing `modelos` / `metricas` would
# overwrite the dict and the list that are being iterated.
for nombre_modelo, modelo in modelos.items():
    cv_resultados = cross_validate(modelo, X_train_sca_ohe, y_train, cv=5, scoring=metricas)

    # cross_validate stores the per-fold scores of each metric under 'test_<metric>'
    resultados_dict[nombre_modelo] = {
        metrica: cv_resultados[f'test_{metrica}'].mean() for metrica in metricas
    }

# One row per model, one column per metric
pd.DataFrame(resultados_dict).T

## Entrenar

## Predicción

## Validar modelo