In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,cohen_kappa_score,roc_auc_score,roc_curve
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.feature_selection import SequentialFeatureSelector as sfs,RFE
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
import warnings
# Notebook-wide settings: silence every Python warning and enlarge the default figure.
# NOTE(review): a blanket 'ignore' filter hides all warning categories for the
# rest of the session - consider narrowing it to specific categories.
warnings.simplefilter('ignore')
plt.rcParams.update({'figure.figsize': [20, 10]})

In [2]:
# Read the modelling table from a CSV located in the current working directory.
csv_path = 'data1.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2,-1.165523,-1.285566,0
1,1,1,0,0,0,1,0,0,2,0,2,0,0,0,1,0,3,-0.264071,0.060346,0
2,2,1,0,0,0,1,0,0,2,2,0,0,0,0,0,1,3,-0.367189,-1.244781,1
3,3,1,0,0,0,0,1,0,2,0,2,2,0,0,1,0,0,-0.751387,0.508983,0
4,4,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,0.193308,-1.244781,1


In [4]:
# Remove leftover index columns (pandas labels them 'Unnamed: ...' when a CSV
# was written together with its index).
# Selecting by prefix and rebinding `df` instead of dropping one hard-coded
# label in place keeps this cell idempotent: re-running it no longer raises
# KeyError, and any further artifact such as 'Unnamed: 0.1' is removed as well.
index_artifacts = [col for col in df.columns if str(col).startswith('Unnamed')]
df = df.drop(columns=index_artifacts)

In [5]:
# Separate the predictors from the `Churn` target column.
X = df.drop(columns='Churn')
y = df['Churn']

In [6]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on `y` - consider `stratify=y`
# if the class proportions should be preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Report the dimensions of each part of the split.
for label, part in (
    ('X_train: ', X_train),
    ('X_test: ', X_test),
    ('y_train: ', y_train),
    ('y_test: ', y_test),
):
    print(label, part.shape)

X_train:  (4907, 18)
X_test:  (2103, 18)
y_train:  (4907,)
y_test:  (2103,)


In [10]:
# Baseline model: k-nearest neighbours with k = 11, every other setting left at its default.
knn = KNeighborsClassifier(
    n_neighbors=11,
)

In [11]:
# Fit the baseline on the training part; `fit` returns the estimator itself,
# so `knn_model` and `knn` refer to the same fitted object.
knn.fit(X_train, y_train)
knn_model = knn

#### Performance of train set

In [12]:
# Predict on the data the baseline was fitted on (in-sample predictions).
y_pred = knn_model.predict(X=X_train)

In [13]:
# Per-class precision / recall / F1 of the baseline on the training part.
train_report = classification_report(y_true=y_train, y_pred=y_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.85      0.90      0.87      3589
           1       0.66      0.56      0.61      1318

    accuracy                           0.81      4907
   macro avg       0.76      0.73      0.74      4907
weighted avg       0.80      0.81      0.80      4907



#### Performance of test set

In [14]:
# Predict on the held-out part with the baseline model.
y_test_pred = knn_model.predict(X=X_test)

In [15]:
# Per-class precision / recall / F1 of the baseline on the held-out part.
test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.83      0.87      0.85      1564
           1       0.57      0.49      0.53       539

    accuracy                           0.77      2103
   macro avg       0.70      0.68      0.69      2103
weighted avg       0.76      0.77      0.77      2103



## Hyperparameter tuning using GridSearchCV

In [17]:
from sklearn.model_selection import KFold

In [22]:
# Search space: odd neighbourhood sizes from 3 to 11, under two distance metrics.
# NOTE(review): the recorded search result selects n_neighbors=11, the largest
# candidate - consider widening the range to check for a better value beyond it.
params = [
    {
        'n_neighbors': list(range(3, 12, 2)),
        'metric': ['euclidean', 'manhattan'],
    }
]

In [23]:
# Five shuffled folds; the fixed seed keeps fold membership repeatable across runs.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [24]:
# Unconfigured k-NN estimator; it is passed to the grid search below as the
# `estimator` whose parameters are varied.
KNN = KNeighborsClassifier()

In [25]:
# Exhaustive search over `params`, cross-validated with the `kf` splitter.
# `scoring` is left unset, so candidates are ranked by the estimator's own
# `score` method.
grd = GridSearchCV(
    estimator=KNN,
    param_grid=params,
    cv=kf,
)

In [26]:
# Run the search on the training part only; `fit` returns the search object
# itself, so `grd_model` and `grd` refer to the same fitted search.
grd.fit(X_train, y_train)
grd_model = grd

In [27]:
# Show the parameter combination the search selected.
best_knn_params = grd_model.best_params_
print('Best parameters for KNN are: ', best_knn_params)

Best parameters for KNN are:  {'metric': 'manhattan', 'n_neighbors': 11}


In [28]:
# Build the final classifier from whatever the grid search selected instead of
# re-typing the winning values by hand, so the model cannot drift out of sync
# with the search if the data or the grid changes.
# For the recorded run this resolves to n_neighbors=11, metric='manhattan'.
knn_final = KNeighborsClassifier(**grd_model.best_params_)

In [29]:
# Fit the tuned classifier on the training part; `fit` returns the estimator
# itself, so both names refer to the same fitted object.
knn_final.fit(X_train, y_train)
knn_final_model = knn_final

#### Performance of train set

In [30]:
# In-sample predictions of the tuned model (rebinds `y_pred` from the baseline section).
y_pred = knn_final_model.predict(X=X_train)

In [32]:
# Per-class precision / recall / F1 of the tuned model on the training part.
tuned_train_report = classification_report(y_true=y_train, y_pred=y_pred)
print(tuned_train_report)

              precision    recall  f1-score   support

           0       0.86      0.89      0.87      3589
           1       0.66      0.61      0.63      1318

    accuracy                           0.81      4907
   macro avg       0.76      0.75      0.75      4907
weighted avg       0.81      0.81      0.81      4907



#### Performance of test set

In [33]:
# Held-out predictions of the tuned model (rebinds `y_test_pred` from the baseline section).
y_test_pred = knn_final_model.predict(X=X_test)

In [34]:
# Per-class precision / recall / F1 of the tuned model on the held-out part.
tuned_test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
print(tuned_test_report)

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      1564
           1       0.56      0.53      0.55       539

    accuracy                           0.77      2103
   macro avg       0.70      0.69      0.70      2103
weighted avg       0.77      0.77      0.77      2103

