**Table of contents**<a id='toc0_'></a>    
- [X-y split](#toc1_)    
- [Encoding categoricals & scaling numericals](#toc2_)    
- [Train-test split](#toc3_)    
- [Model](#toc4_)    
- [Save models](#toc5_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [5]:
# !pip install plotly
import plotly.express as px

In [3]:
import pandas as pd
import numpy as np
import pickle
# Show every column when a wide DataFrame is displayed (None = no column limit).
pd.options.display.max_columns = None


In [4]:
# Download the customer churn CSV and discard every row that has a missing value.
data = (
    pd.read_csv("https://raw.githubusercontent.com/sabinagio/data-analytics/main/data/customer_churn.csv")
    .dropna()
)
# Preview the first rows.
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [9]:
# Distribution of monthly charges, one histogram row per Churn value.
px.histogram(
    x=data["MonthlyCharges"],
    facet_row=data["Churn"],
)

In [10]:
# customerID is a row identifier, not a predictive feature, so remove it.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps the
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
data = data.drop(columns="customerID", errors="ignore")

In [None]:
# Sanity check on the (rows, columns) of the cleaned frame; the recorded run shows (7032, 20).
data.shape

(7032, 20)

# <a id='toc1_'></a>[X-y split](#toc0_)

In [11]:
# Target: the Churn column. Features: every remaining column.
y = data["Churn"]
X = data.drop(columns="Churn")

# <a id='toc2_'></a>[Encoding categoricals & scaling numericals](#toc0_)

In [13]:
# Separate the features by dtype: numeric columns get scaled, object columns get one-hot encoded.
num_data = X.select_dtypes(include=np.number)
cat_data = X.select_dtypes(include=object)

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every numeric column with MinMaxScaler and rebuild a DataFrame that
# carries the original feature names. The result has a fresh 0..n-1 index
# because fit_transform returns a plain array.
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test-set min/max values influence the scaling — consider fitting on the
# training rows only.
scaler = MinMaxScaler()
num_data_scaled = pd.DataFrame(
    data=scaler.fit_transform(num_data),
    columns=scaler.get_feature_names_out(),
)
num_data_scaled

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0.0,0.000000,0.115423,0.001275
1,0.0,0.464789,0.385075,0.215867
2,0.0,0.014085,0.354229,0.010310
3,0.0,0.619718,0.239303,0.210241
4,0.0,0.014085,0.521891,0.015330
...,...,...,...,...
7027,0.0,0.323944,0.662189,0.227521
7028,0.0,1.000000,0.845274,0.847461
7029,0.0,0.140845,0.112935,0.037809
7030,1.0,0.042254,0.558706,0.033210


In [15]:
# One-hot encode the categorical columns, then reset to a 0..n-1 index so the
# rows line up with num_data_scaled when the two frames are concatenated.
cat_data_encoded = (
    pd.get_dummies(cat_data)
    .reset_index(drop=True)
)
cat_data_encoded

Unnamed: 0,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,True,False,False,True,True,False,True,False,False,True,False,True,False,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False
1,False,True,True,False,True,False,False,True,True,False,False,True,False,False,False,False,True,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False,False,False,True
2,False,True,True,False,True,False,False,True,True,False,False,True,False,False,False,False,True,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,True
3,False,True,True,False,True,False,True,False,False,True,False,True,False,False,False,False,True,True,False,False,False,False,True,False,False,True,True,False,False,True,False,False,False,True,False,True,False,True,False,False,False
4,True,False,True,False,True,False,False,True,True,False,False,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,False,True,False,True,False,True,False,True,False,False,True,True,False,False,False,False,True,True,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,True,False,False,True,False,False,False,True
7028,True,False,False,True,False,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,False,False,True,True,False,False,False,False,True,False,False,True,False,True,False,False,True,False,True,False,False
7029,True,False,False,True,False,True,True,False,False,True,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False
7030,False,True,False,True,True,False,False,True,False,False,True,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,True


In [16]:
# Place the scaled numeric columns and the encoded categorical columns side by
# side to form the final feature matrix.
X_prep = pd.concat(
    [num_data_scaled, cat_data_encoded],
    axis="columns",
)
X_prep

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.0,0.000000,0.115423,0.001275,True,False,False,True,True,False,True,False,False,True,False,True,False,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False
1,0.0,0.464789,0.385075,0.215867,False,True,True,False,True,False,False,True,True,False,False,True,False,False,False,False,True,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False,False,False,True
2,0.0,0.014085,0.354229,0.010310,False,True,True,False,True,False,False,True,True,False,False,True,False,False,False,False,True,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,True
3,0.0,0.619718,0.239303,0.210241,False,True,True,False,True,False,True,False,False,True,False,True,False,False,False,False,True,True,False,False,False,False,True,False,False,True,True,False,False,True,False,False,False,True,False,True,False,True,False,False,False
4,0.0,0.014085,0.521891,0.015330,True,False,True,False,True,False,False,True,True,False,False,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0.0,0.323944,0.662189,0.227521,False,True,False,True,False,True,False,True,False,False,True,True,False,False,False,False,True,True,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,True,False,False,True,False,False,False,True
7028,0.0,1.000000,0.845274,0.847461,True,False,False,True,False,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,False,False,True,True,False,False,False,False,True,False,False,True,False,True,False,False,True,False,True,False,False
7029,0.0,0.140845,0.112935,0.037809,True,False,False,True,False,True,True,False,False,True,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False
7030,1.0,0.042254,0.558706,0.033210,False,True,False,True,True,False,False,True,False,False,True,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,True


# <a id='toc3_'></a>[Train-test split](#toc0_)

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_prep,
    y,
    test_size=0.3,
    random_state=42,
)

# <a id='toc4_'></a>[Model](#toc0_)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

In [19]:
# Three baseline classifiers, each created with scikit-learn's default settings.
log_model, knn_model, bayes_model = (
    LogisticRegression(),
    KNeighborsClassifier(),
    MultinomialNB(),
)

In [None]:
# Fit on the training split only, so the X_test reports further down measure
# held-out performance. The raw X cannot be used here: it still contains the
# unencoded string columns and unscaled numbers, so fit() would raise a
# ValueError, and its columns would not match X_test.
# Once we're satisfied with the models they can be refit on the whole prepared
# dataset (X_prep, y) before saving.
log_model.fit(X_train, y_train)
knn_model.fit(X_train, y_train)
bayes_model.fit(X_train, y_train)

In [21]:
# We will revisit these metrics in one of the AI classes
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / F1 of the logistic regression on the held-out rows.
pred = log_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

          No       0.84      0.89      0.86      1549
         Yes       0.64      0.53      0.58       561

    accuracy                           0.80      2110
   macro avg       0.74      0.71      0.72      2110
weighted avg       0.79      0.80      0.79      2110



In [22]:
# Same report for the k-nearest-neighbours model.
pred = knn_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

          No       0.83      0.84      0.84      1549
         Yes       0.54      0.52      0.53       561

    accuracy                           0.76      2110
   macro avg       0.69      0.68      0.68      2110
weighted avg       0.75      0.76      0.75      2110



In [23]:
# Same report for the multinomial naive Bayes model.
pred = bayes_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

          No       0.91      0.67      0.77      1549
         Yes       0.47      0.81      0.60       561

    accuracy                           0.71      2110
   macro avg       0.69      0.74      0.69      2110
weighted avg       0.79      0.71      0.73      2110



# <a id='toc5_'></a>[Save models](#toc0_)

In [25]:
import os

# Persist the three fitted models as pickle files under models/.
# Create the target directory first: open(..., "wb") does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs("models", exist_ok=True)

with open("models/log_reg.pkl", "wb") as log_file:  # "wb" = write in binary mode
    pickle.dump(log_model, log_file)

with open("models/knn.pkl", "wb") as knn_file:
    pickle.dump(knn_model, knn_file)

with open("models/bayes.pkl", "wb") as bayes_file:
    pickle.dump(bayes_model, bayes_file)

In [None]:
import os

# Persist the fitted MinMaxScaler so new data can be scaled exactly like the
# training data. Create prep/ first, because open() does not create missing
# parent directories.
os.makedirs("prep", exist_ok=True)

with open("prep/scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)