In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Cargar datos y transformar Gender a dummy

In [2]:
# Location of the bank-churn CSV. The original local path is kept as the default
# so existing runs behave exactly as before; set the BANK_CHURN_CSV environment
# variable to point the notebook at the file on any other machine.
DATA_PATH = os.environ.get(
    "BANK_CHURN_CSV",
    "C:/Users/Usuario/Anaconda3/Tavo/BankChurn/BankData01.csv",
)
churn_df = pd.read_csv(DATA_PATH)

# Target: the 'Exited' column.
y = churn_df['Exited']

# Features: one indicator column per distinct 'Gender' value, placed in front of
# the five columns that are used as they come from the file.
X1 = pd.get_dummies(churn_df['Gender'])
X2 = churn_df[["IsActiveMember", "Age", "HasCrCard", "NumOfProducts", "Balance"]]
X = pd.concat([X1, X2], axis=1)

# Last expression of the cell: show the first rows of the feature matrix.
X.head()

Unnamed: 0,Female,Male,IsActiveMember,Age,HasCrCard,NumOfProducts,Balance
0,1,0,1,42,1,1,0.0
1,1,0,1,41,0,1,83807.86
2,1,0,0,42,1,3,159660.8
3,1,0,0,39,0,2,0.0
4,1,0,1,43,1,1,125510.82


In [5]:
# Baseline model: logistic regression fitted on the unscaled features.
# The accuracy below is measured on the same rows the model was fitted on.
logclf = LogisticRegression(solver='liblinear')
logclf.fit(X, y)
accuracy = logclf.score(X, y)
print(accuracy)

# With the confusion matrix we see that the model classifies every row as a
# non-churner (the predicted-churn column is all zeros).
print(confusion_matrix(y, logclf.predict(X)))

# Churn score. predict_proba returns an array with 2 columns: one with the
# probability of 0 (no churn) and the other with the probability of 1 (churn).
# We are only interested in the second column, the probability of churn.
scores = logclf.predict_proba(X)[:, 1]
# NOTE(review): this slice shows rows 1 through 9 and skips row 0 -- confirm
# whether scores[:10] was intended.
print(scores[1:10])

# Then we get some insights about the churn probabilities.
print('Max score for churn:', scores.max().round(4))
print('Average score for churn:', scores.mean().round(4))

0.7963
[[7963    0]
 [2037    0]]
[0.23411032 0.22002944 0.24752053 0.22108236 0.21431372 0.19149273
 0.28637373 0.21227519 0.30297131]
Max score for churn: 0.3683
Average score for churn: 0.2462


In [9]:
# We establish a cutoff for the probability of 1 (churn).
# Anything above the cutoff is considered a churner.
churn_df['churn_pred'] = (scores > 0.25).astype(int)

# Compare actual vs. predicted labels. The columns are selected by name rather
# than by position so the preview does not depend on the CSV's column layout.
print(churn_df[['Exited', 'churn_pred']].head(10))

# Rows = actual class (Exited), columns = predicted class (churn_pred).
# crosstab omits a class that never occurs, which would make the positional
# lookups below fail; reindexing pins the table to a 2x2 layout with zeros
# for any missing combination.
conf1 = pd.crosstab(churn_df['Exited'], churn_df['churn_pred']).reindex(
    index=[0, 1], columns=[0, 1], fill_value=0
)
print(conf1)

# True positive rate = true positives / (true positives + false negatives)
tp1 = conf1.iloc[1, 1]/(conf1.iloc[1, 0] + conf1.iloc[1, 1])
print('True positive rate:', tp1.round(3))
# False positive rate = false positives / (false positives + true negatives)
fp1 = conf1.iloc[0, 1]/(conf1.iloc[0, 0] + conf1.iloc[0, 1])
print('False positive rate:', fp1.round(3))

   Exited  churn_pred
0       1           0
1       0           0
2       1           0
3       0           0
4       0           0
5       1           0
6       0           0
7       1           1
8       0           0
9       0           1
churn_pred     0     1
Exited                
0           3402  4561
1           1581   456
True positive rate: 0.224
False positive rate: 0.573


# Hacer datos para prueba y Escalar datos

In [65]:
# Disabled hold-out split: an 80/20 train/test split, with the scaler fitted on
# the training part only and then applied to the test part.
# While this stays commented out, the models in the surrounding cells are fitted
# and scored on the full data set.
# NOTE(review): `escalador` is created in a later cell, so it must be defined
# before these lines can be re-enabled -- confirm the intended cell order.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
#X_train_esc = escalador.fit_transform(X_train)
#X_test_esc = escalador.transform(X_test)

In [72]:
# Standardise the feature matrix, then refit the same logistic-regression
# set-up on the scaled values.
escalador = StandardScaler()
escalador.fit(X)
X_esc = escalador.transform(X)

logclf2 = LogisticRegression(solver='liblinear').fit(X_esc, y)

# Accuracy on the fitting data (stored, not printed in this cell) and the
# probability of class 1 (churn) for every row.
accuracy = logclf2.score(X_esc, y)
scores2 = logclf2.predict_proba(X_esc)[:, 1]

In [73]:
# Apply the same 0.25 cutoff to the scores of the model fitted on scaled data.
churn_df['churn_pred_esc'] = (scores2 > 0.25).astype(int)

# Rows = actual class (Exited), columns = predicted class (churn_pred_esc).
# crosstab omits a class that never occurs, which would make the positional
# lookups below fail; reindexing pins the table to a 2x2 layout with zeros
# for any missing combination.
conf2 = pd.crosstab(churn_df['Exited'], churn_df['churn_pred_esc']).reindex(
    index=[0, 1], columns=[0, 1], fill_value=0
)
print(conf2)
#True positive rate = True positive / (true positives + false negatives)
tp2 = conf2.iloc[1, 1]/(conf2.iloc[1, 0] + conf2.iloc[1, 1])
print('True positive rate:', tp2.round(3))
#False positive rate = False positive / (false positive + true negative)
fp2 = conf2.iloc[0, 1]/(conf2.iloc[0, 0] + conf2.iloc[0, 1])
print('False positive rate:', fp2.round(3))

# Disabled extra diagnostics: the number of rows predicted as churners, and
# the share of those predictions that actually exited (precision).
#print(conf2.iloc[0, 1] + conf2.iloc[1, 1])
#print(conf2.iloc[1, 1] / (conf2.iloc[0, 1] + conf2.iloc[1, 1]))

churn_pred_esc     0     1
Exited                    
0               6245  1718
1                839  1198
True positive rate: 0.588
False positive rate: 0.216
