# Modelo AirQuaility

### Importación de Librerías

In [41]:
import numpy as np
import pandas as pd
from scipy.stats import randint

from joblib import dump
import pickle

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.metrics import classification_report

from sklearn.impute import KNNImputer
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings("ignore")

### Carga de datos

In [2]:
air = pd.read_csv('../data/raw/AirQuality.csv', sep=';')

### Primera visualización

In [3]:
air

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,07578,,
1,10/03/2004,19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,07255,,
2,10/03/2004,20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,07502,,
3,10/03/2004,21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,07867,,
4,10/03/2004,22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,07888,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9466,,,,,,,,,,,,,,,,,
9467,,,,,,,,,,,,,,,,,
9468,,,,,,,,,,,,,,,,,
9469,,,,,,,,,,,,,,,,,


In [4]:
air.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9471 entries, 0 to 9470
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           9357 non-null   object 
 1   Time           9357 non-null   object 
 2   CO(GT)         9357 non-null   object 
 3   PT08.S1(CO)    9357 non-null   float64
 4   NMHC(GT)       9357 non-null   float64
 5   C6H6(GT)       9357 non-null   object 
 6   PT08.S2(NMHC)  9357 non-null   float64
 7   NOx(GT)        9357 non-null   float64
 8   PT08.S3(NOx)   9357 non-null   float64
 9   NO2(GT)        9357 non-null   float64
 10  PT08.S4(NO2)   9357 non-null   float64
 11  PT08.S5(O3)    9357 non-null   float64
 12  T              9357 non-null   object 
 13  RH             9357 non-null   object 
 14  AH             9357 non-null   object 
 15  Unnamed: 15    0 non-null      float64
 16  Unnamed: 16    0 non-null      float64
dtypes: float64(10), object(7)
memory usage: 1.2+ MB


Podemos ver que el dataframe está compuesto por 17 columnas y 9471 filas.  
Quitando las dos primeras, que son fecha y hora, y las dos últimas, que estám completamente vacías, todas las columnas contienen variables numéricas continuas.  
Observamos que habrá que tratar varias de las columnas ya que son tipo object, y deberán ser transformadas a integer o float.

In [5]:
air.describe(include='all')

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
count,9357,9357,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,0.0,0.0
unique,391,24,104.0,,,408.0,,,,,,,437.0,754.0,6684.0,,
top,21/09/2004,18.00.00,-200.0,,,-2000.0,,,,,,,-200.0,-200.0,-200.0,,
freq,24,390,1592.0,,,366.0,,,,,,,366.0,366.0,366.0,,
mean,,,,1048.990061,-159.090093,,894.595276,168.616971,794.990168,58.148873,1391.479641,975.072032,,,,,
std,,,,329.83271,139.789093,,342.333252,257.433866,321.993552,126.940455,467.210125,456.938184,,,,,
min,,,,-200.0,-200.0,,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,,,,,
25%,,,,921.0,-200.0,,711.0,50.0,637.0,53.0,1185.0,700.0,,,,,
50%,,,,1053.0,-200.0,,895.0,141.0,794.0,96.0,1446.0,942.0,,,,,
75%,,,,1221.0,-200.0,,1105.0,284.0,960.0,133.0,1662.0,1255.0,,,,,


# Limpieza

Pasamos a float las columnas str.  
Eliminamos las dos columnas completamente vacías, así como las filas que son todo Nan.

In [6]:
air.drop(columns=['Unnamed: 15', 'Unnamed: 16'], inplace=True)

In [7]:
air['CO(GT)'] = air['CO(GT)'].str.replace(',', '.').astype(float)
air['C6H6(GT)'] = air['C6H6(GT)'].str.replace(',', '.').astype(float)
air['T'] = air['T'].str.replace(',', '.').astype(float)
air['RH'] = air['RH'].str.replace(',', '.').astype(float)
air['AH'] = air['AH'].str.replace(',', '.').astype(float)

In [8]:
air = air.dropna(how='all', axis=0)

Mostramos de nuevo. Ya no hay Nans, pero vemos un valor '-200' que no tiene sentido.

In [9]:
air

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,10/03/2004,18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
1,10/03/2004,19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2,10/03/2004,20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
3,10/03/2004,21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
4,10/03/2004,22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,04/04/2005,10.00.00,3.1,1314.0,-200.0,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568
9353,04/04/2005,11.00.00,2.4,1163.0,-200.0,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119
9354,04/04/2005,12.00.00,2.4,1142.0,-200.0,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406
9355,04/04/2005,13.00.00,2.1,1003.0,-200.0,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139


Vemos si hay más '-200'

In [10]:
air[(air == -200).any(axis=1)]

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
9,11/03/2004,03.00.00,0.6,1010.0,19.0,1.7,561.0,-200.0,1705.0,-200.0,1235.0,501.0,10.3,60.2,0.7517
10,11/03/2004,04.00.00,-200.0,1011.0,14.0,1.3,527.0,21.0,1818.0,34.0,1197.0,445.0,10.1,60.5,0.7465
33,12/03/2004,03.00.00,0.8,889.0,21.0,1.9,574.0,-200.0,1680.0,-200.0,1187.0,512.0,7.0,62.3,0.6261
34,12/03/2004,04.00.00,-200.0,831.0,10.0,1.1,506.0,21.0,1893.0,32.0,1134.0,384.0,6.1,65.9,0.6248
39,12/03/2004,09.00.00,-200.0,1545.0,-200.0,22.1,1353.0,-200.0,767.0,-200.0,2058.0,1588.0,9.2,56.2,0.6561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,04/04/2005,10.00.00,3.1,1314.0,-200.0,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568
9353,04/04/2005,11.00.00,2.4,1163.0,-200.0,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119
9354,04/04/2005,12.00.00,2.4,1142.0,-200.0,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406
9355,04/04/2005,13.00.00,2.1,1003.0,-200.0,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139


In [11]:
from utils import info_nans
info_nans(air)

Unnamed: 0,Filas,NaNs,Porcentaje
Date,9357,0,0.0
Time,9357,0,0.0
CO(GT),9357,0,0.0
PT08.S1(CO),9357,0,0.0
NMHC(GT),9357,0,0.0
C6H6(GT),9357,0,0.0
PT08.S2(NMHC),9357,0,0.0
NOx(GT),9357,0,0.0
PT08.S3(NOx),9357,0,0.0
NO2(GT),9357,0,0.0


Observamos que la columna 'NMHC(GT)' está prácticamente vacía, por lo que será eliminada.

In [12]:
air.drop(columns='NMHC(GT)', inplace=True)

In [13]:
def min_max_df(df):
    max_min_df = pd.DataFrame(index=['min', 'max'], columns=df.columns)
    max_min_df.loc['min'] = df.min()
    max_min_df.loc['max'] = df.max()
    return max_min_df

resultado = min_max_df(air)
resultado.T

Unnamed: 0,min,max
Date,01/01/2005,31/12/2004
Time,00.00.00,23.00.00
CO(GT),-200.0,11.9
PT08.S1(CO),-200.0,2040.0
C6H6(GT),-200.0,63.7
PT08.S2(NMHC),-200.0,2214.0
NOx(GT),-200.0,1479.0
PT08.S3(NOx),-200.0,2683.0
NO2(GT),-200.0,340.0
PT08.S4(NO2),-200.0,2775.0


Se observa que el valor '-200' es una imputación de datos faltantes. Se procede a tratarlo.

In [14]:
air.replace(-200, np.nan, inplace=True)

In [15]:
air[air.isna().any(axis=1)]

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
9,11/03/2004,03.00.00,0.6,1010.0,1.7,561.0,,1705.0,,1235.0,501.0,10.3,60.2,0.7517
10,11/03/2004,04.00.00,,1011.0,1.3,527.0,21.0,1818.0,34.0,1197.0,445.0,10.1,60.5,0.7465
33,12/03/2004,03.00.00,0.8,889.0,1.9,574.0,,1680.0,,1187.0,512.0,7.0,62.3,0.6261
34,12/03/2004,04.00.00,,831.0,1.1,506.0,21.0,1893.0,32.0,1134.0,384.0,6.1,65.9,0.6248
39,12/03/2004,09.00.00,,1545.0,22.1,1353.0,,767.0,,2058.0,1588.0,9.2,56.2,0.6561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9058,23/03/2005,04.00.00,,993.0,2.3,604.0,85.0,848.0,65.0,1160.0,762.0,14.5,66.4,1.0919
9130,26/03/2005,04.00.00,,1122.0,6.0,811.0,181.0,641.0,92.0,1336.0,1122.0,16.2,71.2,1.3013
9202,29/03/2005,04.00.00,,883.0,1.3,530.0,63.0,997.0,46.0,1102.0,617.0,13.7,68.2,1.0611
9274,01/04/2005,04.00.00,,818.0,0.8,473.0,47.0,1257.0,41.0,898.0,323.0,13.7,48.8,0.7606


Podemos observar que hay otros valores negativos. Se analizarán por separado para entender si es un valor correcto, una imputación negativa errónea o un valor faltante.  
    - Se comprueba que 'T' corresponde a la temperatura, por lo que un valor negativo es posible.

Procedemos a tratar las columnas Date y Time para poder usarlas en el modelo.

Se crean columnas con día, mes, año y hora para que pueda gestionarlo el modelo.

In [16]:
air[['DAY', 'MONTH', 'YEAR']] = air['Date'].str.split('/', expand=True)
air.drop(columns=['Date'], inplace=True)

air['HOUR'] = air['Time'].str.split('.', expand=True)[0]
air.drop(columns='Time', inplace=True)

air['DAY'] = air['DAY'].str.lstrip('0')
air['MONTH'] = air['MONTH'].str.lstrip('0')

air[['DAY', 'MONTH', 'YEAR', 'HOUR']] = air[['DAY', 'MONTH', 'YEAR', 'HOUR']].astype(float)

In [17]:
air

Unnamed: 0,CO(GT),PT08.S1(CO),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,DAY,MONTH,YEAR,HOUR
0,2.6,1360.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,10.0,3.0,2004.0,18.0
1,2.0,1292.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,10.0,3.0,2004.0,19.0
2,2.2,1402.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,10.0,3.0,2004.0,20.0
3,2.2,1376.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,10.0,3.0,2004.0,21.0
4,1.6,1272.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,10.0,3.0,2004.0,22.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,3.1,1314.0,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568,4.0,4.0,2005.0,10.0
9353,2.4,1163.0,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119,4.0,4.0,2005.0,11.0
9354,2.4,1142.0,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406,4.0,4.0,2005.0,12.0
9355,2.1,1003.0,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139,4.0,4.0,2005.0,13.0


# Objetivo

Trataremos de clasificar la calidad del aire según el nivel de 'C6H6(GT)', BENZENE, presente en el aire. Para ellos se creará una columnas categórica homónima que servirá de guía para la clasificación futura.

# Transformaciones

Se imputan los Nans faltantes con un KNNImputer.

In [18]:
imputer = KNNImputer(n_neighbors=5)

air_imputed = pd.DataFrame(imputer.fit_transform(air), columns=air.columns)

Creamos la columna 'BENZENE' como target según la concentración de 'C6H6(GT)'

In [19]:
benzene_values = []

for valor in air['C6H6(GT)']:
    if valor < 1:
        benzene_values.append(0)
    elif 1 <= valor < 5:
        benzene_values.append(1)
    elif 5 <= valor < 10:
        benzene_values.append(2)
    elif 10 <= valor < 20:
        benzene_values.append(3)
    elif 20 <= valor < 50:
        benzene_values.append(4)
    else:
        benzene_values.append(5)

air_imputed['BENZENE'] = benzene_values

In [20]:
air_imputed

Unnamed: 0,CO(GT),PT08.S1(CO),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,DAY,MONTH,YEAR,HOUR,BENZENE
0,2.6,1360.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,10.0,3.0,2004.0,18.0,3
1,2.0,1292.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,10.0,3.0,2004.0,19.0,2
2,2.2,1402.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,10.0,3.0,2004.0,20.0,2
3,2.2,1376.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,10.0,3.0,2004.0,21.0,2
4,1.6,1272.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,10.0,3.0,2004.0,22.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,3.1,1314.0,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568,4.0,4.0,2005.0,10.0,3
9353,2.4,1163.0,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119,4.0,4.0,2005.0,11.0,3
9354,2.4,1142.0,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406,4.0,4.0,2005.0,12.0,3
9355,2.1,1003.0,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139,4.0,4.0,2005.0,13.0,2


# Split

In [21]:
X = air_imputed.drop(columns='BENZENE')
y = air_imputed['BENZENE']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

In [22]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7485, 16)
(1872, 16)
(7485,)
(1872,)


In [23]:
pt = PowerTransformer()

X_train_norm = pt.fit_transform(X_train)
X_test_norm = pt.fit_transform(X_test)

# Baseline

In [24]:
modelos = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'kNeighbour': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Red Neuronal': MLPClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
}

metricas = ['accuracy', 'precision_macro', 'recall_macro']

resultados_dict = {}

for nombre_modelo, modelo in modelos.items():
    cv_resultados = cross_validate(modelo, X_train_norm, y_train, cv=5, scoring=metricas)
    
    for metrica in metricas:
        clave = f"{nombre_modelo}_{metrica}"
        resultados_dict[clave] = cv_resultados[f"test_{metrica}"].mean()

resultados_df = pd.DataFrame([resultados_dict])

resultados_df.T


Unnamed: 0,0
Logistic Regression_accuracy,0.929459
Logistic Regression_precision_macro,0.919053
Logistic Regression_recall_macro,0.823381
Decision Tree_accuracy,0.962993
Decision Tree_precision_macro,0.913474
Decision Tree_recall_macro,0.911814
Random Forest_accuracy,0.976219
Random Forest_precision_macro,0.982985
Random Forest_recall_macro,0.904634
SVM_accuracy,0.924783


Los mejores modelo son GradientBoosting, RandomForest y DecissionTree.

# Modelo

Se entrenan los tres modelos

### GradientBoosting

In [25]:
gb = GradientBoostingClassifier()
gb.fit(X_train_norm, y_train)

In [26]:
gbpred = gb.predict(X_test_norm)
gbpred

array([3, 1, 3, ..., 3, 1, 2], dtype=int64)

In [27]:
print(classification_report(y_test, gbpred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        27
           1       0.99      0.99      0.99       489
           2       0.94      0.98      0.96       544
           3       0.96      0.97      0.97       553
           4       1.00      0.94      0.97       196
           5       0.67      0.51      0.58        63

    accuracy                           0.96      1872
   macro avg       0.93      0.90      0.91      1872
weighted avg       0.96      0.96      0.96      1872



### RandomForest

In [28]:
rf = RandomForestClassifier()
rf.fit(X_train_norm, y_train)

In [29]:
rfpred = rf.predict(X_test_norm)
rfpred

array([3, 1, 3, ..., 3, 1, 2], dtype=int64)

In [30]:
print(classification_report(y_test, rfpred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        27
           1       0.96      1.00      0.98       489
           2       0.93      0.97      0.95       544
           3       0.96      0.97      0.96       553
           4       1.00      0.93      0.97       196
           5       0.96      0.41      0.58        63

    accuracy                           0.95      1872
   macro avg       0.97      0.88      0.91      1872
weighted avg       0.96      0.95      0.95      1872



### DecisionTree

In [31]:
dt = DecisionTreeClassifier()
dt.fit(X_train_norm, y_train)

In [32]:
dtpred = dt.predict(X_test_norm)
dtpred

array([3, 1, 3, ..., 3, 5, 2], dtype=int64)

In [33]:
print(classification_report(y_test, dtpred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        27
           1       0.99      0.99      0.99       489
           2       0.93      0.95      0.94       544
           3       0.96      0.93      0.94       553
           4       1.00      0.94      0.97       196
           5       0.35      0.46      0.39        63

    accuracy                           0.94      1872
   macro avg       0.87      0.88      0.87      1872
weighted avg       0.94      0.94      0.94      1872



Se entrenan los tres modelos, se realiza la predicción y se imprime el ClassificationReport para evaluar el rendimiento de cada modelo antes de la optimización.  
Podemos observar que, en pricipio, el que mejor precision obtiene es el RandomForest.

# Optimización

Se aplicará a los tres modelos un RandomizedSearch, técnica de optimización que evalúa un número establecido de combinaciones aleatorias de los hiperparámetros seleccionados. Éstos se detallan más abajo.

### Grandient Boosting

In [34]:
param_grid = {
    'n_estimators': randint(1, 10),
    'max_depth': randint(2, 10),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['auto', 'sqrt', 'log2'],
}

rsgb = RandomizedSearchCV(estimator=gb, param_distributions=param_grid, n_iter=50, cv=3, scoring='accuracy', random_state=42)

rsgb.fit(X_train_norm, y_train)

best_params = rsgb.best_params_
print("Mejores hiperparámetros:", best_params)

best_model_rsgb = rsgb.best_estimator_

Mejores hiperparámetros: {'max_depth': 8, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 7}


### Descripción de los hiperparámetros:
- **n_estimators**: Número de árboles del bosque.
- **max_depth**: Profundidad máxima de cara arbol.
- **min_samples_split**: Número mínimo de muestras para dividir un nodo.
- **min_samples_leaf**: Es el número mínimo de muestras requeridas para estar en un nodo hoja.
- **max_features**: Número máximo de características a considerar al buscar la mejor división. 

### Random Forest

In [35]:
param_grid = {
    'n_estimators': randint(1, 10),
    'max_depth': randint(2, 10),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['auto', 'sqrt', 'log2'],
}

rsrf = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=50, cv=3, scoring='accuracy', random_state=42)

rsrf.fit(X_train_norm, y_train)

best_params = rsrf.best_params_
print("Mejores hiperparámetros:", best_params)

best_model_rsrf = rsrf.best_estimator_

Mejores hiperparámetros: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 9}


Mismos hiperparámetros que el modelo anterior.

### Decision Tree

In [36]:
param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

rsdt = RandomizedSearchCV(estimator=dt, param_distributions=param_grid, n_iter=50, cv=3, scoring='accuracy', random_state=42)

rsdt.fit(X_train_norm, y_train)

best_params = rsdt.best_params_
print("Mejores hiperparámetros:", best_params)

best_model_rsdt = rsdt.best_estimator_

Mejores hiperparámetros: {'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 50}


Mismos hiperparámetros que el modelo anterior excepto n_estimators.

Se vuelven a evaluar los modelo para ver si han mejorado.

In [37]:
rsgb_y_pred = rsgb.predict(X_test_norm)

print(classification_report(y_test, rsgb_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        27
           1       0.96      1.00      0.98       489
           2       0.93      0.97      0.95       544
           3       0.96      0.97      0.96       553
           4       1.00      0.93      0.97       196
           5       0.96      0.37      0.53        63

    accuracy                           0.95      1872
   macro avg       0.97      0.87      0.90      1872
weighted avg       0.95      0.95      0.95      1872



In [38]:
rsrf_y_pred = rsrf.predict(X_test_norm)

print(classification_report(y_test, rsrf_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        27
           1       0.95      1.00      0.98       489
           2       0.91      0.97      0.94       544
           3       0.96      0.96      0.96       553
           4       1.00      0.94      0.97       196
           5       0.73      0.17      0.28        63

    accuracy                           0.95      1872
   macro avg       0.93      0.84      0.85      1872
weighted avg       0.94      0.95      0.94      1872



In [39]:
rsdt_y_pred = rsdt.predict(X_test_norm)

print(classification_report(y_test, rsdt_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        27
           1       0.95      0.98      0.96       489
           2       0.93      0.94      0.94       544
           3       0.95      0.96      0.95       553
           4       1.00      0.94      0.97       196
           5       0.58      0.46      0.51        63

    accuracy                           0.94      1872
   macro avg       0.90      0.88      0.89      1872
weighted avg       0.94      0.94      0.94      1872



Como podemos observar, el GrandientBoosting ha mejorado y arroja los mejroes resultados, por lo que será este modelo el que guardemos.

In [42]:
dump(best_model_rsgb, '../model/GB_entrenado.joblib')

['../model/GB_entrenado.joblib']