In [1]:
import imblearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import express as px
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import plot_tree, export_text
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, roc_curve, confusion_matrix, f1_score, ConfusionMatrixDisplay
from sklearn.preprocessing import MinMaxScaler

# Round 1

In [2]:
churnData = pd.read_csv('DATA_Customer-Churn.txt')
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [3]:
churnData.shape

(7043, 16)

In [4]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [5]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

In [6]:
churnData.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [7]:
churnData.dropna(subset=['TotalCharges'], inplace=True)

In [8]:
features = ['tenure', 'SeniorCitizen', 'MonthlyCharges','TotalCharges']
X = churnData[features]
y = churnData['Churn']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25,random_state=123)

In [10]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

In [12]:
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


 # Round 2

In [13]:
KN_model = KNeighborsClassifier()
KN_model.fit(X_train, y_train)

In [14]:
DT_model = DecisionTreeClassifier()
DT_model.fit(X_train, y_train)

In [15]:
y_pred_KN = KN_model.predict(X_test)

In [16]:
y_pred_DT = DT_model.predict(X_test)

In [17]:
def evaluate_model(note, model, X_test, y_test, results):
    pred = model.predict(X_test)
    score = model.score(X_test, y_test)
    precision = precision_score(y_test,pred, pos_label = 'No')
    recall = recall_score(y_test, pred,pos_label = 'No')
    f1 = f1_score(y_test, pred, pos_label = 'No')
    false_negatives = confusion_matrix(y_test, pred)[1][0]
    new_result = pd.DataFrame({'note':note,'accuracy':score,'precision':precision,'recall':recall,'f1_score':f1,'false_negatives':false_negatives},index=[0])
    return pd.concat([results,new_result],axis=0)

results = pd.DataFrame(columns=['note','accuracy','precision','recall','f1_score', 'false_negatives'])

In [18]:
results = evaluate_model('KNeighbors', KN_model, X_test, y_test, results)

  return pd.concat([results,new_result],axis=0)


In [19]:
results = evaluate_model('Decision Tree', DT_model, X_test, y_test, results)

In [20]:
results

Unnamed: 0,note,accuracy,precision,recall,f1_score,false_negatives
0,KNeighbors,0.763367,0.795948,0.89141,0.840979,282
0,Decision Tree,0.719568,0.791045,0.816045,0.803351,266


In [21]:
print("accuracy: ",accuracy_score(y_test,y_pred_KN))
print("precision: ", precision_score(y_test, y_pred_KN, pos_label='No'))
print("recall: ", recall_score(y_test, y_pred_KN, pos_label='No'))
print("f1: ", f1_score(y_test, y_pred_KN, pos_label='No'))

accuracy:  0.7633674630261661
precision:  0.7959479015918958
recall:  0.8914100486223663
f1:  0.8409785932721713


In [22]:
print("accuracy: ",accuracy_score(y_test,y_pred_DT))
print("precision: ", precision_score(y_test, y_pred_DT, pos_label='No'))
print("recall: ", recall_score(y_test, y_pred_DT, pos_label='No'))
print("f1: ", f1_score(y_test, y_pred_DT, pos_label='No'))

accuracy:  0.7195676905574516
precision:  0.7910447761194029
recall:  0.8160453808752026
f1:  0.8033506182688471


In [23]:
confusion_matrix(y_test, y_pred_KN)

array([[1100,  134],
       [ 282,  242]])

In [24]:
confusion_matrix(y_test, y_pred_DT)

array([[1007,  227],
       [ 266,  258]])

KN Model have a better accuracy and a similar precision but a significant more performance on recall. 
We can see that notably with KN model having 134 false negative versus 236, that may may be a big difference. 