In [1]:
# Enable IntelliSense-style completion by turning on IPython's greedy completer
%config IPCompleter.greedy = True

## Regresión logística

Es un algoritmo para obtener un clasificador binario. 

La regresión logística es bastante efectiva en situaciones en las que la **probabilidad** de lograr una meta/objetivo (Y) está vinculada a los recursos necesarios (X) de manera no lineal, es decir, cuando una disminución/aumento de cierto recurso más allá de cierto umbral disminuye/aumenta drásticamente la probabilidad de lograr el objetivo.


<img src="logistica/01-lineal-vs-logistica.png" style="width:600px"/>


<img src="logistica/02-regresion-logistica.png" style="width:600px"/>


Los clasificadores binarios basados en regresión logística clasifican las observaciones de acuerdo a un umbral, típicamente 0.5 (50%).

Hay dos técnicas comúnmente empleadas para obtener los coeficientes de regresión:
- __[MLE](https://es.wikipedia.org/wiki/M%C3%A1xima_verosimilitud)__  
- __[Mínimos cuadrados](https://es.wikipedia.org/wiki/M%C3%ADnimos_cuadrados)__ (luego de convertir la relación establecida por la curva "S" a una relación lineal)


__[Scikit Learn - Regresión logística](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)__

In [2]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the preprocessed training set, using the passenger id as the row index.
ruta_train = os.path.join("procesado", "train.csv")
df_entrenamiento = pd.read_csv(ruta_train, index_col='PassengerId')

In [3]:
# Preview the first rows of the training DataFrame.
df_entrenamiento.head()

Unnamed: 0_level_0,Survived,Age,Fare,FamilySize,IsMother,IsMale,Deck_A,Deck_B,Deck_C,Deck_D,...,Title_Sir,Fare_Bin_very_low,Fare_Bin_low,Fare_Bin_high,Fare_Bin_very_high,Embarked_C,Embarked_Q,Embarked_S,AgeState_Adult,AgeState_Child
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,22.0,7.25,2,0,1,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,1,38.0,71.2833,2,0,0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
3,1,26.0,7.925,1,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,1,35.0,53.1,2,0,0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
5,0,35.0,8.05,1,0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [4]:
# Feature matrix: every column from 'Age' onwards, converted to a float array.
X = df_entrenamiento.loc[:, 'Age':].to_numpy().astype('float')
# Target vector: the 'Survived' labels as a 1-D NumPy array.
# Series.ravel() is deprecated since pandas 2.2; to_numpy() is the supported equivalent.
y = df_entrenamiento['Survived'].to_numpy()

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the observations for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Print the (features, target) shapes of the train partition, then of the test partition.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

(712, 32) (712,)
(179, 32) (179,)


In [48]:
from sklearn.linear_model import LogisticRegression

In [7]:
# Create the classifier: logistic regression with the liblinear solver and a fixed seed.
clasificador_reg_log = LogisticRegression(
    solver='liblinear',
    random_state=0,
)

In [8]:
# Train the classifier on the training partition.
clasificador_reg_log.fit(X=X_train, y=y_train)

LogisticRegression(random_state=0, solver='liblinear')

In [9]:
# Mean accuracy of the classifier on the held-out test partition.
accuracy_v1 = clasificador_reg_log.score(X_test, y_test)
print('accuracy del clasificador - version 1 : {0:.2f}'.format(accuracy_v1))

accuracy del clasificador - version 1 : 0.83


### El hiperparámetro 'penalty'
__[L1 Norms versus L2 Norms](https://www.kaggle.com/residentmario/l1-norms-versus-l2-norms)__

__[L1 and L2 Regularization Methods](https://towardsdatascience.com/l1-and-l2-regularization-methods-ce25e7fc831c)__

__[The difference between L1 and L2 regularization](https://explained.ai/regularization/L1vsL2.html)__

In [10]:
#evaluar el desempeño
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [11]:
# Predict once on the test partition and reuse the result for every metric,
# instead of calling predict() again for each of the five reports.
y_pred = clasificador_reg_log.predict(X_test)

# accuracy
print('accuracy del clasificador - version 1 : {0:.2f}'.format(accuracy_score(y_test, y_pred)))
# confusion matrix
print('matriz de confusión del clasificador - version 1: \n {0}'.format(confusion_matrix(y_test, y_pred)))
# precision
print('precision del clasificador - version 1 : {0:.2f}'.format(precision_score(y_test, y_pred)))
# recall
print('recall del clasificador - version 1 : {0:.2f}'.format(recall_score(y_test, y_pred)))
# f1
print('f1 del clasificador - version 1 : {0:.2f}'.format(f1_score(y_test, y_pred)))

accuracy del clasificador - version 1 : 0.83
matriz de confusión del clasificador - version 1: 
 [[95 15]
 [15 54]]
precision del clasificador - version 1 : 0.78
recall del clasificador - version 1 : 0.78
f1 del clasificador - version 1 : 0.78


In [12]:
# Coefficients of the fitted model
clasificador_reg_log.coef_

array([[-0.02842268,  0.00455451, -0.50009089,  0.6178132 , -0.81392331,
         0.12845079, -0.17281789, -0.39317834,  0.52159979,  1.09941224,
         0.40341217, -0.18345052, -0.30036043,  0.96533486,  0.48256744,
        -0.34483448,  0.28089598,  1.21761328,  0.56363966, -1.44586305,
         1.07245548, -0.11273708, -0.47293646,  0.16255648,  0.24716933,
         0.28009428,  0.41324773,  0.49183528,  0.46198829,  0.14924424,
         0.37283516,  0.73023265]])

In [13]:
# Names of the columns from 'Age' onwards (the same slice used to build X).
df_entrenamiento.loc[:,'Age':].columns

Index(['Age', 'Fare', 'FamilySize', 'IsMother', 'IsMale', 'Deck_A', 'Deck_B',
       'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_Z', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Title_Lady', 'Title_Master', 'Title_Miss',
       'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Sir',
       'Fare_Bin_very_low', 'Fare_Bin_low', 'Fare_Bin_high',
       'Fare_Bin_very_high', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'AgeState_Adult', 'AgeState_Child'],
      dtype='object')

In [14]:
# Pair each feature name with its coefficient from the fitted model.
feature_names = df_entrenamiento.loc[:, 'Age':].columns
coefficients = clasificador_reg_log.coef_[0]
list(zip(feature_names, coefficients))

[('Age', -0.028422683015196647),
 ('Fare', 0.004554507823394219),
 ('FamilySize', -0.5000908891268564),
 ('IsMother', 0.6178131963766433),
 ('IsMale', -0.8139233076723839),
 ('Deck_A', 0.12845079036009457),
 ('Deck_B', -0.17281789049758128),
 ('Deck_C', -0.39317834247455324),
 ('Deck_D', 0.521599791503767),
 ('Deck_E', 1.0994122448735741),
 ('Deck_F', 0.4034121665394925),
 ('Deck_G', -0.18345051736824694),
 ('Deck_Z', -0.300360429185816),
 ('Pclass_1', 0.9653348584493128),
 ('Pclass_2', 0.4825674358632859),
 ('Pclass_3', -0.3448344805617978),
 ('Title_Lady', 0.2808959813204045),
 ('Title_Master', 1.2176132826041954),
 ('Title_Miss', 0.5636396591879067),
 ('Title_Mr', -1.4458630469449079),
 ('Title_Mrs', 1.0724554809148932),
 ('Title_Officer', -0.11273707920989971),
 ('Title_Sir', -0.4729364641217851),
 ('Fare_Bin_very_low', 0.16255648291690958),
 ('Fare_Bin_low', 0.24716932703029296),
 ('Fare_Bin_high', 0.2800942772661382),
 ('Fare_Bin_very_high', 0.4132477265374808),
 ('Embarked_C', 0

## Ejercicio clasificación
Crear un clasificador basado en el algoritmo de regresión logística para predecir si el valor de la vivienda supera la media.

entrada: housing.csv

Procedimiento:
- Cargar los datos a un DataFrame y explorar brevemente
- Eliminar las observaciones que tengan algún dato faltante
- Eliminar las observaciones con el valor atípico (max) para la variable 'expected_house_value'
- Aplicar one hot encoding a la variable 'ocean_proximity'
- Crear una nueva variable booleana 'above_median' (1 si 'expected_house_value' supera la media, 0 en caso contrario)
- Quitar 'expected_house_value'
- Aplicar los pasos train-test-split para poder entrenar y evaluar un clasificador basado en regresión logística

¿Cuáles son los valores de accuracy, matriz de confusión, precision, recall y f1 del clasificador?

In [53]:
# Read the housing dataset from the working directory and preview its first rows.
# (os.path.join with a single component returns it unchanged, so the bare name is used.)
df_housing = pd.read_csv("housing.csv")
df_housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,expected_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [54]:
# Summary statistics; the count row of the recorded output exposes missing total_bedrooms values.
df_housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,expected_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [55]:
# (rows, columns) of the frame before any cleaning.
df_housing.shape

(20640, 10)

In [56]:
# Drop every observation that has at least one missing value.
# Rebinding the result avoids mutating the frame in place (inplace=True brings no
# benefit and makes notebook state harder to reason about).
df_housing = df_housing.dropna()
df_housing.shape

(20433, 10)

In [57]:
# Largest 'expected_house_value'; the exercise statement treats this value as the outlier to remove.
max_expected_house_value = df_housing["expected_house_value"].max()
max_expected_house_value

500001.0

In [58]:
# Boolean mask: True for the rows whose value differs from the maximum.
df_housing["expected_house_value"] != max_expected_house_value

0        True
1        True
2        True
3        True
4        True
         ... 
20635    True
20636    True
20637    True
20638    True
20639    True
Name: expected_house_value, Length: 20433, dtype: bool

In [59]:
# Shape the frame would have once the rows at the maximum value are excluded.
df_housing[df_housing["expected_house_value"] != max_expected_house_value].shape

(19475, 10)

In [60]:
# Keep only the rows whose 'expected_house_value' differs from the maximum, then preview.
not_max_mask = df_housing["expected_house_value"] != max_expected_house_value
df_housing = df_housing.loc[not_max_mask]
df_housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,expected_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [34]:
# (rows, columns) after removing the rows at the maximum value.
df_housing.shape

(19475, 10)

In [61]:
# One-hot encode the categorical 'ocean_proximity' column (one indicator column per category).
df_housing = pd.get_dummies(
    df_housing,
    columns=['ocean_proximity'],
)

In [62]:
# Preview the frame with the new ocean_proximity_* indicator columns.
df_housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,expected_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0


In [63]:
# Arithmetic mean of 'expected_house_value'; used below as the threshold for the binary target.
# NOTE(review): the target column is named 'above_median' but this is the mean, not the
# median — confirm which statistic the exercise intends.
expected_house_value_mean = df_housing['expected_house_value'].mean()
expected_house_value_mean

192444.6519127086

In [64]:
# Boolean mask: True where the house value is strictly greater than the mean.
df_housing['expected_house_value'] > expected_house_value_mean

0         True
1         True
2         True
3         True
4         True
         ...  
20635    False
20636    False
20637    False
20638    False
20639    False
Name: expected_house_value, Length: 19475, dtype: bool

In [65]:
# Binary target: 1 when the house value is strictly greater than the mean, 0 otherwise.
# A vectorized comparison replaces the per-row Python lambda; the cast to int64 keeps
# the dtype that the element-wise map produced.
# NOTE(review): despite the column name, the threshold is the mean, not the median.
df_housing['above_median'] = (
    df_housing['expected_house_value'] > expected_house_value_mean
).astype('int64')

In [66]:
# Inspect the frame now that it includes the 'above_median' column.
df_housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,expected_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0,1
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0,1
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0,1
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0,1
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,0,1,0,0,0,0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,0,1,0,0,0,0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,0,1,0,0,0,0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,0,1,0,0,0,0


In [67]:
# Remove the raw house value: X is later built from all remaining columns, so it must
# not stay in the frame. Rebinding with errors='ignore' (instead of inplace=True)
# keeps this cell safe to re-run once the column is already gone.
df_housing = df_housing.drop(columns=['expected_house_value'], errors='ignore')

In [42]:
# Inspect the frame after removing 'expected_house_value'.
df_housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,0,0,0,1,0,1
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,0,0,0,1,0,1
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,0,0,0,1,0,1
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,0,0,0,1,0,1
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,0,1,0,0,0,0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,0,1,0,0,0,0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,0,1,0,0,0,0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,0,1,0,0,0,0


In [68]:
# Target vector: the 'above_median' labels as a 1-D NumPy array.
# Series.ravel() is deprecated since pandas 2.2; to_numpy() is the supported equivalent.
y = df_housing['above_median'].to_numpy()
y

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [69]:
# Remove the target column so that only features remain in the frame.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe to
# re-run once the column is already gone.
df_housing = df_housing.drop(columns=['above_median'], errors='ignore')
df_housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,0,1,0,0,0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,0,1,0,0,0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,0,1,0,0,0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,0,1,0,0,0


In [70]:
# Feature matrix: all remaining columns as a NumPy array, cast to float.
raw_features = df_housing.to_numpy()
X = raw_features.astype('float')
X

array([[-122.23,   37.88,   41.  , ...,    0.  ,    1.  ,    0.  ],
       [-122.22,   37.86,   21.  , ...,    0.  ,    1.  ,    0.  ],
       [-122.24,   37.85,   52.  , ...,    0.  ,    1.  ,    0.  ],
       ...,
       [-121.22,   39.43,   17.  , ...,    0.  ,    0.  ,    0.  ],
       [-121.32,   39.43,   18.  , ...,    0.  ,    0.  ,    0.  ],
       [-121.24,   39.37,   16.  , ...,    0.  ,    0.  ,    0.  ]])

In [71]:
# Hold out 20% of the observations for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Print the (features, target) shapes of the train partition, then of the test partition.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

(15580, 13) (15580,)
(3895, 13) (3895,)


In [72]:
# crear el clasificador
clasificador_reg_log = LogisticRegression(random_state=0, solver='liblinear')
clasificador_reg_log

In [73]:
# Train the classifier on the training partition.
clasificador_reg_log.fit(X=X_train, y=y_train)

In [74]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [75]:
# Predict once on the test partition and reuse the result for every metric,
# instead of calling predict() again for each of the five reports.
y_pred = clasificador_reg_log.predict(X_test)

# accuracy
print('accuracy del clasificador - version 1 : {0:.2f}'.format(accuracy_score(y_test, y_pred)))
# confusion matrix
print('matriz de confusión del clasificador - version 1: \n {0}'.format(confusion_matrix(y_test, y_pred)))
# precision
print('precision del clasificador - version 1 : {0:.2f}'.format(precision_score(y_test, y_pred)))
# recall
print('recall del clasificador - version 1 : {0:.2f}'.format(recall_score(y_test, y_pred)))
# f1
print('f1 del clasificador - version 1 : {0:.2f}'.format(f1_score(y_test, y_pred)))

accuracy del clasificador - version 1 : 0.82
matriz de confusión del clasificador - version 1: 
 [[1910  340]
 [ 380 1265]]
precision del clasificador - version 1 : 0.79
recall del clasificador - version 1 : 0.77
f1 del clasificador - version 1 : 0.78


## Referencias

__[The Basics: Logistic Regression and Regularization](https://towardsdatascience.com/the-basics-logistic-regression-and-regularization-828b0d2d206c)__