<img src="https://www.python.org/static/img/python-logo.png" width="300" alt="Python logo"  />
<font color='blue'>$\Large\text{Social Data Consulting}$</font>

$$\large\textbf{Regresión Logística}$$

_Tutor: Sergio LM_

<h2>Contenido</h2>

<div class="alert alert-block alert-info" style="margin-top: 20px">
<ol>
    <li><a href="#data_acquisition">Selección de variables</a>
    <li><a href="#data_export">Visualización</a></li>
    <li><a href="#data_missing">Normalización y Transformación</a></li>
    <li><a href="#data_missing">Regresión Logística</a></li>
</ol>

</div>
<hr>

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
filename = 'heart_failure.csv'
df = pd.read_csv(filename)
df.head(3)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1


In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df['DEATH_EVENT'].value_counts()

In [None]:
df.describe()

In [None]:
df.groupby('DEATH_EVENT').size()

### Selección de variables

In [None]:
df.head(3)

In [None]:
df['sex'].value_counts()  # get_dummies

In [None]:
pd.DataFrame({'F':[1,0,1], 'M':[0,1,0]}) # Variables dummies

In [None]:
df.dtypes

In [None]:
variables = ['creatinine_phosphokinase', 'platelets', 'serum_creatinine', 'serum_sodium', 'DEATH_EVENT']

In [None]:
ds = df[variables]
ds.head()

In [None]:
ds.columns = map(str.lower, ds.columns)  # poner en minuscula a los nombrs de las variables

In [None]:
ds.head()

### Visualización de datos

In [None]:
ds.drop(['death_event'],1).hist()  # hitograma de las variables independientes
plt.show()

In [None]:
variables_continuas = ['creatinine_phosphokinase', 'platelets', 'serum_creatinine', 'serum_sodium']

sns.pairplot(ds.dropna(), hue='death_event',
            height = 4,
            vars = variables_continuas,
            kind = 'reg')
plt.show()

In [None]:
variables_categoricas = ["anaemia", "diabetes", "high_blood_pressure", "sex", "smoking"]

In [None]:
plt.figure(figsize=(12, 8))

for i, col in enumerate(variables_categoricas):
    plt.subplot(2, 3, i+1)
    plt.title(col)
    plt.subplots_adjust(hspace =.5, wspace=.3)
    sns.countplot(data=df, x=col, hue="DEATH_EVENT", palette = ['C0', 'C1'], alpha=0.8, edgecolor="k", linewidth=1)

### Normalización

In [None]:
ax = ds.boxplot()
ax.tick_params(axis='x', rotation=45)

In [None]:
from sklearn import preprocessing

In [None]:
pla_zs = preprocessing.scale(ds['platelets'])  # z-score   xnor = (x - mean(x))/std(x)

In [None]:
ds.loc[:,'platelets'] = pla_zs  # actualizando datos

In [None]:
ax = ds.boxplot()
ax.tick_params(axis='x', rotation=45)

In [None]:
cpk_zs = preprocessing.scale(ds['creatinine_phosphokinase'])

In [None]:
ds.loc[:,'creatinine_phosphokinase'] = cpk_zs

In [None]:
ax = ds.boxplot()
ax.tick_params(axis='x', rotation=45)

In [None]:
ss = preprocessing.scale(ds['serum_sodium'])

In [None]:
ds.loc[:,'serum_sodium'] = ss

In [None]:
ax = ds.boxplot()
ax.tick_params(axis='x', rotation=45)

In [None]:
# relacion entre variables con datos transformados y normalizados
sns.pairplot(ds.dropna(), hue='death_event',
            height = 4,
            vars = ['creatinine_phosphokinase', 'platelets', 'serum_creatinine', 'serum_sodium'],
            kind = 'reg')
plt.show()

### Transformación

In [None]:
from feature_engine import variable_transformers as vt

In [None]:
from scipy.stats import norm

In [None]:
sns.distplot(ds['creatinine_phosphokinase'], fit=norm)
plt.show()

In [None]:
min(ds['creatinine_phosphokinase'])

In [None]:
tf = vt.YeoJohnsonTransformer(variables = ['creatinine_phosphokinase'])

# fit the transformer
tf.fit(ds)

#Learns the optimal lambda for the Yeo-Johnson transformation.
# transform the data
dt = tf.transform(ds)

In [None]:
sns.distplot(dt['creatinine_phosphokinase'], fit=norm)
plt.show()

In [None]:
dt.head()

### Modelo de Regresión Logística

In [None]:
dt.describe()

In [None]:
ds.iloc[:,0:4]

In [None]:
x = dt.drop(['death_event'],1)  # variables independients
y = dt['death_event']  # variable dependiente

In [None]:
from sklearn import linear_model

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
model = linear_model.LogisticRegression()

In [None]:
model.fit(x,y)

In [None]:
model.coef_

In [None]:
model.intercept_

In [None]:
# Prediccion
pred = model.predict(x)

In [None]:
pred[:10]

In [None]:
dt['death_event'][:10]

In [None]:
# Precision media de las predicciones
model.score(x,y)

In [None]:
# Entrenamiento y prueba
from sklearn import model_selection

In [None]:
test_size = 0.20
seed = 7
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=test_size, random_state=seed)

In [None]:
reglog = linear_model.LogisticRegression()

In [None]:
# train
result = reglog.fit(x_train, y_train)

In [None]:
result.coef_

In [None]:
result.intercept_

In [None]:
result.score(x_train, y_train)

In [None]:
# Prediciones
y_pred = reglog.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
dt.head()

In [None]:
pred = pd.DataFrame({'y_test':y_test, 'y_pred': y_pred})
pred.head()

In [None]:
pred.tail()

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
sns.heatmap(pd.DataFrame(confusion_matrix(y_test,y_pred)), annot=True)
plt.show()

In [None]:
dt['death_event'].value_counts()

In [None]:
dt.head(3)

In [None]:
dt['death_event'].value_counts()

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
ax.set_ylim(1.5, -0.5)
for i in range(2):
    for j in range(2):
        ax.text(j, i, cm[i, j], ha='center', va='center', color='white', size=16)
plt.show()

In [None]:
# Usando statsmodels
import statsmodels.api as sm

In [None]:
log_reg = sm.Logit(y_train, x_train).fit()

In [None]:
# tabla de resumen
print(log_reg.summary())

In [None]:
# Prediccion
y_pred = log_reg.predict(x_test)
y_pred = list(map(round, y_pred))
  
# comparacion de resultados
print('Acutal values', list(y_test.values))
print('Predictions :', list(y_pred))