<a href="https://colab.research.google.com/github/DCDPUAEM/DCDP_2022/blob/main/02-Machine-Learning/notebooks/11-Regresion-Logistica-SOL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regresion logística a dataset Iris

In [35]:
#imports 
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [36]:
#cargar dataset
iris = load_iris()
xdata = iris.data
ydata = iris.target


In [37]:
#valores faltantes
missing = np.isnan(xdata).sum()
print(f"valores faltantes: {missing}")


valores faltantes: 0


In [38]:
#como no hay valores faltantes no hace falta imputar datos

#variables categóricas
feature_names = iris.feature_names

print(f"Nombres de las features: {feature_names}")

Nombres de las features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [39]:
#obtenemos los conjuntos de prueba y entrenamiento
X_train, X_test, y_train, y_test = train_test_split(xdata, ydata, test_size=0.3, random_state=1)

In [40]:
#assert shapes
print("Forma de X_train:", X_train.shape)
print("Forma de X_test:", X_test.shape)
print("Forma de y_train:", y_train.shape)
print("Forma de y_test:", y_test.shape)


Forma de X_train: (105, 4)
Forma de X_test: (45, 4)
Forma de y_train: (105,)
Forma de y_test: (45,)


In [48]:
# imports para la regresión logística
# Definimos un pipeline con selector de features,
# escalador y regresión logística y croos validation, y grid_search 
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [45]:
#selector = SelectKBest(k=5)
selector = VarianceThreshold(0.25)
scaler = MinMaxScaler()
clf = LogisticRegression()

pl = Pipeline([('selector',selector),
               ('escalador',scaler),
               ('clasificador',clf)])

pl.fit(X_train,y_train)

In [46]:
cvs = cross_val_score(pl,X_train,y_train)

print(f"Accuracy en el conjunto de entrenamiento: {pl.score(X_train,y_train)}")
print(f"CV Accuracy en el conjunto de entrenamiento: {np.mean(cvs)}")
print(f"Accuracy en el conjunto de prueba: {pl.score(X_test,y_test)}")

Accuracy en el conjunto de entrenamiento: 0.9142857142857143
CV Accuracy en el conjunto de entrenamiento: 0.8952380952380953
Accuracy en el conjunto de prueba: 0.9333333333333333


Hacemos grid search:

In [None]:


#"selector__k": [2,3,4,5,6], #número de características que se seleccionarán
param_grid = {
    "selector__threshold": [0,0.1,0.2,0.5], #Umbral para seleccionar características
    "clasificador__C": [0.1,0.5,1,2], #Parámetro de regularización; valores pequeños indica mayor regularización
    "clasificador__solver": ['lbfgs','newton-cg','sag'], #diferentes solvers: 'lbfgs', 'newton-cg' y 'sag'.
    "clasificador__penalty": [None,'l2'] #tipo de penalización
}

search = GridSearchCV(pl, param_grid, n_jobs=-1)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

In [50]:
selector = VarianceThreshold(0)
scaler = MinMaxScaler()
clf = LogisticRegression(C=1,penalty='l2',solver='lbfgs')

best_pl = Pipeline([('selector',selector),
               ('escalador',scaler),
               ('clasificador',clf)])

best_pl.fit(X_train,y_train)

In [51]:
from sklearn.model_selection import cross_val_score

cvs = cross_val_score(best_pl,X_train,y_train)

print(f"Accuracy en el conjunto de entrenamiento: {best_pl.score(X_train,y_train)}")
print(f"CV Accuracy en el conjunto de entrenamiento: {np.mean(cvs)}")
print(f"Accuracy en el conjunto de prueba: {best_pl.score(X_test,y_test)}")

Accuracy en el conjunto de entrenamiento: 0.9238095238095239
CV Accuracy en el conjunto de entrenamiento: 0.9047619047619048
Accuracy en el conjunto de prueba: 0.9111111111111111


Veamos varios clasificadores al mismo tiempo. En cada clasificador realizamos un grid search y obtenemos el accuracy-cv para el entrenamiento y el accuracy en el conjunto de prueba.

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

selector = SelectKBest()
scaler = MinMaxScaler()


lr = LogisticRegression()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
nb = GaussianNB()

params_lr = {
            "selector__k": [2,3,4,5,6],
            "clasificador__C": [0.1,0.5,1,2],
            "clasificador__solver": ['lbfgs','newton-cg','sag'],
            "clasificador__penalty": [None,'l2']}
params_dt = {
            "selector__k": [2,3,4,5,6],
            "clasificador__criterion": ['gini','entropy','log_loss'],
            "clasificador__max_depth": [None,10,20],
            "clasificador__min_samples_split": [2,3,4]}
params_rf = {
            "selector__k": [2,3,4,5,6],
            "clasificador__n_estimators": [25,50,75,100],
            "clasificador__max_depth": [None,5,10]
            }
params_svm = {
            "selector__k": [2,3,4,5,6],
            "clasificador__C": [0.5,1,2],
            "clasificador__kernel": ['linear', 'poly', 'rbf']
            }
params_nb = {
            "selector__k": [2,3,4,5,6]
            }

clasificadores = [lr,dt,rf,svm,nb]
params_grids = [params_lr,params_dt,params_rf,params_svm,params_nb]

training_cv_accs = []
testing_accs = []

for j,(clf,param_grid) in enumerate(zip(clasificadores,params_grids)):
    pl = Pipeline([('selector',selector),
                ('escalador',scaler),
                ('clasificador',clf)])
    search = GridSearchCV(pl, param_grid, n_jobs=-1)
    search.fit(X_train, y_train)
    best_option = search.best_estimator_
    cvs = cross_val_score(best_option,X_train,y_train)
    training_cv_accs.append(np.mean(cvs))
    y_pred = best_option.predict(X_test)
    testing_accs.append(accuracy_score(y_test,y_pred))
    print(f"Clasificador {j+1} explorado")


Veamos los resultados, ordenados por el accuracy de prueba.

In [53]:
results_df = pd.DataFrame(data={
                        'clasificador':['LogReg','DT','RF','SVM','G Naive Bayes'],
                        'Training CV accuracy': training_cv_accs,
                        'Test Accuracy':testing_accs
                        })
results_df.sort_values(by='Test Accuracy',ascending=False)

Unnamed: 0,clasificador,Training CV accuracy,Test Accuracy
2,RF,0.952381,0.977778
1,DT,0.952381,0.955556
3,SVM,0.961905,0.955556
4,G Naive Bayes,0.961905,0.955556
0,LogReg,0.971429,0.933333


Veamos los resultados, ordenados por el accuracy de prueba.

In [54]:
results_df = pd.DataFrame(data={
                        'clasificador':['LogReg','DT','RF','SVM','G Naive Bayes'],
                        'Training CV accuracy': training_cv_accs,
                        'Test Accuracy':testing_accs
                        })
results_df.sort_values(by='Test Accuracy',ascending=False)

Unnamed: 0,clasificador,Training CV accuracy,Test Accuracy
2,RF,0.952381,0.977778
1,DT,0.952381,0.955556
3,SVM,0.961905,0.955556
4,G Naive Bayes,0.961905,0.955556
0,LogReg,0.971429,0.933333
