In [1]:
#Import mandatory libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


#Crossvalidation methods
from sklearn.model_selection import train_test_split, KFold, cross_val_score

#ML Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier   # Feature Selection
from sklearn.metrics import accuracy_score, roc_curve,auc
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

## Feature Imputing

In [169]:
df = pd.read_csv("./data/WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.drop(columns="customerID",inplace=True)

df["TotalCharges"][df["TotalCharges"] == " "] = 0 
df["TotalCharges"] = df["TotalCharges"].astype("float")

#Hot one encoding using get dummines
df = pd.get_dummies(df, drop_first=True) #one hot encoding

X = df.drop(columns="Churn_Yes",axis=1)
y = df["Churn_Yes"]
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 1000, test_size = 0.3)

## Model

In [170]:
#################             Logistic Regression      ######################

lgr = LogisticRegression()
lgr.fit(X_train,y_train)
y_pred = lgr.predict(X_test)

fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
auc = auc(fpr,tpr)

print("accuracy : {}".format(accuracy_score(y_test,y_pred)))
print("cross_val_score : {}".format(cross_val_score(lgr.fit(X_train,y_train),X_train,y_train)))
print("fpr : {}".format(fpr))
print("tpr : {}".format(tpr))
print("thresholds : {}".format(thresholds))
print("auc : {}".format(auc))

accuracy : 0.8026502602934217
cross_val_score : [0.80474453 0.79075426 0.81546894]
fpr : [0.         0.10114504 1.        ]
tpr : [0.         0.52310536 1.        ]
thresholds : [2 1 0]
auc : 0.7109801611378419


In [177]:
##########              Decision Tree       ######################

clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)
y_pred1= clf.predict(X_test)

fpr1, tpr1, thresholds = roc_curve(y_test, y_pred1, pos_label=1)
#auc1 = auc(fpr1,tpr1)

print("accuracy : {}".format(accuracy_score(y_test,y_pred1)))
print("cross_val_score : {}".format(cross_val_score(clf.fit(X_train,y_train),X_train,y_train)))
print("fpr : {}".format(fpr1))
print("tpr : {}".format(tpr1))
print("thresholds : {}".format(thresholds))
#print("auc : {}".format(auc1))


accuracy : 0.722669190724089
cross_val_score : [0.71289538 0.70924574 0.70950061]
fpr : [0.         0.20229008 1.        ]
tpr : [0.         0.50462107 1.        ]
thresholds : [2 1 0]


In [179]:
##########              Grid Search        ######################

pipeline = Pipeline(steps=[('Decision_Tree', clf)])

params = {'Decision_Tree__criterion': ['gini','entropy'],
          'Decision_Tree__max_depth': [2,3,4,5],
          'Decision_Tree__min_samples_leaf': [1,2,3,4,5]}

grid_search = GridSearchCV(estimator=pipeline, param_grid=params)
grid_search.fit(X_train,y_train)
y_prediction = grid_search.predict(X_test)

fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
#auc = auc(fpr,tpr)

print("accuracy : {}".format(accuracy_score(y_test,y_prediction)))
print("cross_val_score : {}".format(cross_val_score(grid_search.best_estimator_,X_train,y_train)))
print("fpr : {}".format(fpr))
print("tpr : {}".format(tpr))
print("thresholds : {}".format(thresholds))
#print("auc : {}".format(auc))
print(grid_search.best_params_)

accuracy : 0.792238523426408
cross_val_score : [0.79440389 0.78345499 0.78928136]
fpr : [0.        0.2086514 1.       ]
tpr : [0.         0.50277264 1.        ]
thresholds : [2 1 0]
{'Decision_Tree__criterion': 'gini', 'Decision_Tree__max_depth': 3, 'Decision_Tree__min_samples_leaf': 1}
