In [68]:
import google.datalab.bigquery as bq

In [69]:
from google.cloud import bigquery

# No project or credentials are passed explicitly, so the client relies on
# whatever defaults the runtime provides.
client = bigquery.Client()

# Select every column of the reworked churn table.
query =  """SELECT
   *
 FROM
   `customer-churn-prediction.data_flow_reworked_data.reworked_data_V1`"""

# Submit the query first, then materialise its result as a pandas DataFrame.
query_job = client.query(query)
df = query_job.to_dataframe()

In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [71]:
# List the raw column names, then show dtypes and non-null counts.
print(df.columns)
print("===================================================================================")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 20 columns):
gender              7032 non-null object
SeniorCitizen       7032 non-null object
Partner             7032 non-null object
Dependents          7032 non-null object
tenure              7032 non-null float64
PhoneService        7032 non-null object
MultipleLines       7032 non-null object
InternetService     7032 non-null object
OnlineSecurity      7032 non-null object
OnlineBackup        7032 non-null object
DeviceProtection    7032 non-null object
TechSupport         7032 non-null object
StreamingTV         7032 non-null

In [72]:
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each category so the generated dummy columns are not redundant.
df = pd.get_dummies(df, drop_first=True)
# DataFrame.info() prints its own report and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 31 columns):
tenure                                   7032 non-null float64
MonthlyCharges                           7032 non-null float64
TotalCharges                             7032 non-null float64
Churn                                    7032 non-null int64
gender_Male                              7032 non-null uint8
SeniorCitizen_1                          7032 non-null uint8
Partner_Yes                              7032 non-null uint8
Dependents_Yes                           7032 non-null uint8
PhoneService_Yes                         7032 non-null uint8
MultipleLines_No phone service           7032 non-null uint8
MultipleLines_Yes                        7032 non-null uint8
InternetService_Fiber optic              7032 non-null uint8
InternetService_No                       7032 non-null uint8
OnlineSecurity_No internet service       7032 non-null uint8
OnlineSecurity_Yes               

In [73]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

Base model :

In [74]:
# Baseline classifier: logistic regression built with the library's default settings.
logreg = LogisticRegression()

In [75]:
# Target vector: the churn flag.
y = df['Churn']
# Feature matrix: every remaining column.
X = df.drop(['Churn'], axis='columns')
# Show both shapes, features first, one per line.
print(X.shape, y.shape, sep="\n")

(7032, 30)
(7032,)


In [76]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [77]:
# Sanity-check the sizes of the training and test feature matrices.
print(X_train.shape, X_test.shape, sep="\n")

(6328, 30)
(704, 30)


In [87]:
# Train the baseline model on the training split and predict the held-out rows
# in one chain (fit() returns the fitted estimator itself).
y_pred = logreg.fit(X_train, y_train).predict(X_test)

In [88]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# Confusion matrix, a blank gap, then the per-class precision/recall report,
# emitted by a single print call that separates its arguments with newlines.
print(
    confusion_matrix(y_test, y_pred),
    "\n",
    classification_report(y_test, y_pred),
    sep="\n",
)

[[481  43]
 [ 83  97]]


             precision    recall  f1-score   support

          0       0.85      0.92      0.88       524
          1       0.69      0.54      0.61       180

avg / total       0.81      0.82      0.81       704



Base model using cross-validation :

In [79]:
# 5-fold cross-validation with shuffling. A fixed integer seed makes the fold
# assignment reproducible across kernel restarts, and guarantees that every
# model evaluated with this same `kfold` object is scored on identical splits
# (with random_state=None each evaluation would see different folds).
kfold = KFold(n_splits=5, random_state=42, shuffle=True)
# Accuracy of the baseline model on each fold of the full dataset.
cv_results = cross_val_score(logreg, X, y, cv=kfold)

In [80]:
# Per-fold scores, a blank gap, then their average, all from one print call
# whose arguments are separated by newlines.
print(cv_results, "\n", np.mean(cv_results), sep="\n")

[0.79886283 0.80099502 0.78662873 0.8257468  0.8086771 ]


0.8041820970336288


Parameter tuning:

In [81]:
# L1-regularised logistic regression with a stronger penalty (C=0.1).
# The solver is named explicitly because the L1 penalty is only supported by
# some solvers (e.g. 'liblinear'); relying on the library default fails on
# scikit-learn releases whose default solver is 'lbfgs'.
logreg_2 = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
# Fit on the training split so a fitted model is available for hold-out checks.
logreg_2.fit(X_train, y_train)
# Score the same configuration with cross-validation on the full dataset.
cv_results_2 = cross_val_score(logreg_2, X, y, cv=kfold)
print(cv_results_2)
print(np.mean(cv_results_2))

[0.81663113 0.80881308 0.78876245 0.81009957 0.80156472]
0.805174190013153


In [None]:
# Test of feature selection

In [82]:
from sklearn.linear_model import Lasso
# Lasso with alpha=1, fitted on the training split; the coefficient vector is
# displayed so that features shrunk to exactly zero can be spotted.
# NOTE(review): the features are not standardised here, so the surviving
# coefficients may mostly reflect column scale - confirm before relying on them.
lasso = Lasso(alpha=1)
lasso.fit(X_train, y_train)
lasso.coef_

array([-0.00000000e+00,  6.24273666e-03, -9.25217527e-05, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00])

In [83]:
# Logistic regression restricted to the three continuous features.
logreg_3 = LogisticRegression()
X_3 = X[['tenure', 'MonthlyCharges', 'TotalCharges']]
# Same split parameters as the full-feature model, so the rows line up.
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_3, y, test_size=0.1, random_state=42)
# Fit the reduced model on the reduced training split. (The previous version
# refitted logreg_2 on the full feature set here, leaving logreg_3 unfitted
# and the *_3 split unused.)
logreg_3.fit(X_train_3, y_train_3)
# Cross-validated accuracy of the reduced model on the full dataset.
cv_results_3 = cross_val_score(logreg_3, X_3, y, cv=kfold)
print(cv_results_3)
print(np.mean(cv_results_3))

[0.7938877  0.77896233 0.78236131 0.79374111 0.77311522]
0.7844135348455851


Pipeline to avoid data leakage in cross-validation

In [96]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
# Chain standardisation and the classifier so the scaler is only ever fitted
# on the data passed to fit().
scaler = StandardScaler()
# Use a fresh estimator: Pipeline does not clone its steps, so passing the
# baseline `logreg` object here would refit that model in place on scaled
# features and silently change it for any later use.
steps = [('scaler', scaler), ('logistic_regression', LogisticRegression())]
pipeline = Pipeline(steps)
# NOTE: to tune this pipeline with GridSearchCV, hyper-parameters must be
# addressed through the step name, e.g.
# {'logistic_regression__C': [1, 10, 100]} rather than {'C': [...]}.
pipeline.fit(X_train, y_train)
# Mean accuracy on the held-out test split.
pipeline.score(X_test, y_test)


0.8224431818181818

In [91]:
# List the hyper-parameter names exposed by the logistic regression estimator.
logreg.get_params().keys()

dict_keys(['random_state', 'intercept_scaling', 'fit_intercept', 'dual', 'class_weight', 'warm_start', 'n_jobs', 'penalty', 'solver', 'tol', 'verbose', 'multi_class', 'C', 'max_iter'])