In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,cohen_kappa_score,roc_auc_score,roc_curve
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.feature_selection import SequentialFeatureSelector as sfs,RFE
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize']=[20,10]

In [2]:
# Read the dataset CSV; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='data1.csv')

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2,-1.165523,-1.285566,0
1,1,1,0,0,0,1,0,0,2,0,2,0,0,0,1,0,3,-0.264071,0.060346,0
2,2,1,0,0,0,1,0,0,2,2,0,0,0,0,0,1,3,-0.367189,-1.244781,1
3,3,1,0,0,0,0,1,0,2,0,2,2,0,0,1,0,0,-0.751387,0.508983,0
4,4,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,0.193308,-1.244781,1


In [4]:
# 'Unnamed: 0' and 'Unnamed: 0.1' both just count 0, 1, 2, ... in df.head(), i.e. they look
# like row indices written out by earlier CSV round-trips rather than real features.
# Dropping only 'Unnamed: 0' leaves 'Unnamed: 0.1' in X as a 19th predictor, which
# contradicts the recorded X_train shape of 18 columns — so remove both here.
# errors='ignore' keeps the cell safe to re-run after the columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [6]:
# Separate the predictors (every column except 'Churn') from the target column.
X = df.drop(columns='Churn')
y = df['Churn']

In [7]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Report the shape of every split, one line per split, in train/test order.
for split_label, split_data in (
    ('X_train: ', X_train),
    ('X_test: ', X_test),
    ('y_train: ', y_train),
    ('y_test: ', y_test),
):
    print(split_label, split_data.shape)

X_train:  (4907, 18)
X_test:  (2103, 18)
y_train:  (4907,)
y_test:  (2103,)


In [9]:
# Baseline Gaussian Naive Bayes classifier with default settings, fitted on the training split.
nb = GaussianNB()
nb_model = nb.fit(X=X_train, y=y_train)

#### Performance on the training set

In [10]:
# Predict on the rows the model was fitted on (training-set predictions).
y_pred = nb_model.predict(X=X_train)

In [11]:
# Per-class precision / recall / F1 for the training-set predictions.
print(classification_report(y_true=y_train, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.89      0.76      0.82      3589
           1       0.53      0.74      0.62      1318

    accuracy                           0.75      4907
   macro avg       0.71      0.75      0.72      4907
weighted avg       0.79      0.75      0.76      4907



#### Performance on the test set

In [12]:
# Predict on the held-out test rows.
y_test_pred = nb_model.predict(X=X_test)

In [13]:
# Per-class precision / recall / F1 for the held-out test predictions.
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.90      0.76      0.82      1564
           1       0.52      0.75      0.61       539

    accuracy                           0.76      2103
   macro avg       0.71      0.75      0.72      2103
weighted avg       0.80      0.76      0.77      2103



## Hyperparameter tuning using GridSearchCV

In [17]:
from sklearn.model_selection import KFold

In [14]:
# Candidate values for GaussianNB's var_smoothing.
# The original grid stopped at 1e-05; every value in that range is a negligible amount of
# smoothing, so the search could only hand back the default (1e-09). The grid now
# continues in decades up to 1.0 so that the search can actually compare meaningfully
# different amounts of smoothing. All original candidates are kept.
params = {
    'var_smoothing': [
        1e-09, 1e-08, 1e-07, 1e-06, 1e-05,
        1e-04, 1e-03, 1e-02, 1e-01, 1.0,
    ]
}

In [15]:
# Unfitted GaussianNB instance with default settings; handed to GridSearchCV as the estimator to tune.
gnb = GaussianNB()

In [18]:
# 5-fold cross-validation splitter; rows are shuffled with a fixed seed so folds are repeatable.
kf = KFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

In [19]:
# Exhaustive search over `params` for `gnb`, evaluated on the folds produced by `kf`.
# No `scoring` is given, so GridSearchCV falls back to the estimator's own score method.
grid = GridSearchCV(gnb, params, cv=kf)

In [21]:
# Run the cross-validated search on the training split only, then report the winning setting.
nb_final = grid.fit(X=X_train, y=y_train)
best_nb_params = nb_final.best_params_
print('Best parameters for GaussianNB are: ', best_nb_params)

Best parameters for GaussianNB are:  {'var_smoothing': 1e-09}


In [23]:
# Getting same parameters after GridSearchCV