<h1>Classification Modeling

In [1]:
# list of imports I'll need for exploring the dataframe 'telco'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from scipy import stats

# import splitting and imputing functions
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# turn off pink boxes for demo
import warnings
warnings.filterwarnings("ignore")

# import our own acquire module
import env
import os
import pandas as pd
directory = os.getcwd()
import acquire as ac
import wrangle as wr

# imports for modeling:
# import Logistic regression
from sklearn.linear_model import LogisticRegression
# import K Nearest neighbors:
from sklearn.neighbors import KNeighborsClassifier
# import Decision Trees:
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
# import Random Forest:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix

# interpreting our models:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

<h2>Acquire the data

In [2]:
SQL_query = """
        SELECT * FROM customers
        JOIN contract_types USING (contract_type_id)
        JOIN internet_service_types USING (internet_service_type_id)
        JOIN payment_types USING (payment_type_id)
        """

In [3]:
df = ac.get_telco_data(SQL_query, directory, filename = 'telco.csv')
df.head() 

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


<h2>Split the Data

In [4]:
#Prep and Split the data
train, validate, test = wr.prep_telco_data(df)

<h2>Scale the Data

In [5]:
from sklearn.preprocessing import MinMaxScaler

continuous_features = ['monthly_charges', 'total_charges', 'tenure']
scaler = MinMaxScaler()
scaler.fit(train[continuous_features])

train[['monthly_charges_scaled', 'total_charges_scaled', 'tenure_scaled']] = scaler.transform(train[continuous_features])
train = train.drop(columns = ['monthly_charges', 'total_charges', 'tenure'])
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3937 entries, 5919 to 4192
Data columns (total 36 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   senior_citizen                         3937 non-null   int64  
 1   gender_encoded                         3937 non-null   int64  
 2   partner_encoded                        3937 non-null   int64  
 3   dependents_encoded                     3937 non-null   int64  
 4   phone_service_encoded                  3937 non-null   int64  
 5   paperless_billing_encoded              3937 non-null   int64  
 6   churn_encoded                          3937 non-null   int64  
 7   gender_Male                            3937 non-null   uint8  
 8   partner_Yes                            3937 non-null   uint8  
 9   dependents_Yes                         3937 non-null   uint8  
 10  phone_service_Yes                      3937 non-null   uint8  
 11  p

In [6]:
from sklearn.preprocessing import MinMaxScaler

continuous_features = ['monthly_charges', 'total_charges', 'tenure']
scaler = MinMaxScaler()
scaler.fit(validate[continuous_features])

validate[['monthly_charges_scaled', 'total_charges_scaled', 'tenure_scaled']] = scaler.transform(validate[continuous_features])
validate = validate.drop(columns=['monthly_charges', 'total_charges', 'tenure'])
validate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1688 entries, 816 to 3718
Data columns (total 36 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   senior_citizen                         1688 non-null   int64  
 1   gender_encoded                         1688 non-null   int64  
 2   partner_encoded                        1688 non-null   int64  
 3   dependents_encoded                     1688 non-null   int64  
 4   phone_service_encoded                  1688 non-null   int64  
 5   paperless_billing_encoded              1688 non-null   int64  
 6   churn_encoded                          1688 non-null   int64  
 7   gender_Male                            1688 non-null   uint8  
 8   partner_Yes                            1688 non-null   uint8  
 9   dependents_Yes                         1688 non-null   uint8  
 10  phone_service_Yes                      1688 non-null   uint8  
 11  pa

In [7]:
from sklearn.preprocessing import MinMaxScaler

continuous_features = ['monthly_charges', 'total_charges', 'tenure']
scaler = MinMaxScaler()
scaler.fit(test[continuous_features])

test[['monthly_charges_scaled', 'total_charges_scaled', 'tenure_scaled']] = scaler.transform(test[continuous_features])
test = test.drop(columns=['monthly_charges', 'total_charges', 'tenure'])
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1407 entries, 3311 to 3690
Data columns (total 36 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   senior_citizen                         1407 non-null   int64  
 1   gender_encoded                         1407 non-null   int64  
 2   partner_encoded                        1407 non-null   int64  
 3   dependents_encoded                     1407 non-null   int64  
 4   phone_service_encoded                  1407 non-null   int64  
 5   paperless_billing_encoded              1407 non-null   int64  
 6   churn_encoded                          1407 non-null   int64  
 7   gender_Male                            1407 non-null   uint8  
 8   partner_Yes                            1407 non-null   uint8  
 9   dependents_Yes                         1407 non-null   uint8  
 10  phone_service_Yes                      1407 non-null   uint8  
 11  p

<h2>Establish the Baseline Accuracy

In [8]:
baseline = train.churn_encoded.mean()
baseline

0.2656845313690627

In [9]:
train.churn_encoded.value_counts()

0    2891
1    1046
Name: churn_encoded, dtype: int64

In [10]:
baseline_accuracy = (train.churn_encoded == 0).mean()
baseline_accuracy #0.73 or 73%

0.7343154686309372

In [11]:
print(f'We would say that a model is not useful to us unless it does better than {round(baseline_accuracy, 2)}')

We would say that a model is not useful to us unless it does better than 0.73


<h2>Now let's get to Modeling

> <h2><b>Decision Tree

> > <b>This is with the top 10 features

In [23]:
X_train2 = train[['internet_service_type_Fiber optic', 'payment_type_Electronic check', 'monthly_charges_scaled', 'paperless_billing_encoded','paperless_billing_Yes', 'senior_citizen', 'multiple_lines_Yes', 'streaming_movies_Yes', 'streaming_tv_Yes', 'phone_service_Yes' ]]
y_train2 = train.churn_encoded

X_validate2 = validate[['internet_service_type_Fiber optic', 'payment_type_Electronic check', 'monthly_charges_scaled', 'paperless_billing_encoded','paperless_billing_Yes', 'senior_citizen', 'multiple_lines_Yes', 'streaming_movies_Yes', 'streaming_tv_Yes', 'phone_service_Yes' ]]
y_validate2 = validate.churn_encoded

X_test2 = test[['internet_service_type_Fiber optic', 'payment_type_Electronic check', 'monthly_charges_scaled', 'paperless_billing_encoded','paperless_billing_Yes', 'senior_citizen', 'multiple_lines_Yes', 'streaming_movies_Yes', 'streaming_tv_Yes', 'phone_service_Yes' ]]
y_test2 = test.churn_encoded

In [24]:
clf2 = DecisionTreeClassifier(max_depth=7, random_state=123)

In [25]:
# model.fit(X, y)

clf2 = clf.fit(X_train2, y_train2)

In [26]:
clf2.predict(X_train2)[:20]

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0])

In [27]:
clf2.score(X_train2, y_train2)

0.7978155956311913

In [28]:
y_pred2 = clf2.predict(X_train2)

In [29]:
conf2 = confusion_matrix(y_train2, y_pred2)
conf2

array([[2715,  176],
       [ 620,  426]])

In [30]:
labels2 = sorted(y_train2.unique())
labels2

[0, 1]

In [31]:
pd.DataFrame(conf2)

Unnamed: 0,0,1
0,2715,176
1,620,426


In [32]:
print(classification_report(y_train2, y_pred2))

              precision    recall  f1-score   support

           0       0.81      0.94      0.87      2891
           1       0.71      0.41      0.52      1046

    accuracy                           0.80      3937
   macro avg       0.76      0.67      0.69      3937
weighted avg       0.79      0.80      0.78      3937



In [33]:
print(f"Accuracy of Decision Tree on train data is {clf2.score(X_train2, y_train2)}")
print(f"Accuracy of Decision Tree on validate data is {clf2.score(X_validate2, y_validate2)}")

Accuracy of Decision Tree on train data is 0.7978155956311913
Accuracy of Decision Tree on validate data is 0.7381516587677726


> > <b>This is with the top 5 features

In [34]:
X_train3 = train[['internet_service_type_Fiber optic', 'payment_type_Electronic check', 'monthly_charges_scaled', 'paperless_billing_encoded','paperless_billing_Yes']]
y_train3 = train.churn_encoded

X_validate3 = validate[['internet_service_type_Fiber optic', 'payment_type_Electronic check', 'monthly_charges_scaled', 'paperless_billing_encoded','paperless_billing_Yes']]
y_validate3 = validate.churn_encoded

X_test3 = test[['internet_service_type_Fiber optic', 'payment_type_Electronic check', 'monthly_charges_scaled', 'paperless_billing_encoded','paperless_billing_Yes']]
y_test3 = test.churn_encoded

In [35]:
clf3 = DecisionTreeClassifier(max_depth=7, random_state=123)

In [36]:
# model.fit(X, y)

clf3 = clf.fit(X_train3, y_train3)

In [37]:
clf3.predict(X_train3)[:20]

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0])

In [38]:
clf3.score(X_train3, y_train3)

0.792735585471171

In [39]:
y_pred3 = clf3.predict(X_train3)

In [40]:
conf3 = confusion_matrix(y_train3, y_pred3)
conf3

array([[2637,  254],
       [ 562,  484]])

In [41]:
labels3 = sorted(y_train3.unique())
labels3

[0, 1]

In [42]:
pd.DataFrame(conf3)

Unnamed: 0,0,1
0,2637,254
1,562,484


In [43]:
print(classification_report(y_train3, y_pred3))

              precision    recall  f1-score   support

           0       0.82      0.91      0.87      2891
           1       0.66      0.46      0.54      1046

    accuracy                           0.79      3937
   macro avg       0.74      0.69      0.70      3937
weighted avg       0.78      0.79      0.78      3937



In [44]:
print(f"Accuracy of Decision Tree on train data is {clf3.score(X_train3, y_train3)}")
print(f"Accuracy of Decision Tree on validate data is {clf3.score(X_validate3, y_validate3)}")

Accuracy of Decision Tree on train data is 0.792735585471171
Accuracy of Decision Tree on validate data is 0.7351895734597157


> > <b>This is with SelectK Best's top 5 features

In [45]:
X_train4 = train[['tenure_scaled', 'contract_type_Two year','internet_service_type_Fiber optic', 'internet_service_type_None','payment_type_Electronic check']]
y_train4 = train.churn_encoded

X_validate4 = validate[['tenure_scaled', 'contract_type_Two year','internet_service_type_Fiber optic', 'internet_service_type_None','payment_type_Electronic check']]
y_validate4 = validate.churn_encoded

X_test4 = test[['tenure_scaled', 'contract_type_Two year','internet_service_type_Fiber optic', 'internet_service_type_None','payment_type_Electronic check']]
y_test4 = test.churn_encoded

In [46]:
clf4 = DecisionTreeClassifier(max_depth=7, random_state=123)

In [47]:
# model.fit(X, y)

clf4 = clf.fit(X_train4, y_train4)

In [48]:
clf4.predict(X_train4)[:20]

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0])

In [49]:
clf4.score(X_train4, y_train4)

0.8084836169672339

In [50]:
y_pred4 = clf3.predict(X_train4)

In [51]:
conf4 = confusion_matrix(y_train4, y_pred4)
conf4

array([[2574,  317],
       [ 437,  609]])

In [52]:
labels4 = sorted(y_train3.unique())
labels4

[0, 1]

In [53]:
pd.DataFrame(conf4)

Unnamed: 0,0,1
0,2574,317
1,437,609


In [54]:
print(classification_report(y_train4, y_pred4))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      2891
           1       0.66      0.58      0.62      1046

    accuracy                           0.81      3937
   macro avg       0.76      0.74      0.74      3937
weighted avg       0.80      0.81      0.80      3937



In [55]:
print(f"Accuracy of Decision Tree on train data is {clf4.score(X_train4, y_train4)}")
print(f"Accuracy of Decision Tree on validate data is {clf4.score(X_validate4, y_validate4)}")

Accuracy of Decision Tree on train data is 0.8084836169672339
Accuracy of Decision Tree on validate data is 0.773696682464455


> > <b>This is with RFE's top 5 features

In [56]:
X_train5 = train[['dependents_encoded', 'phone_service_encoded', 'dependents_Yes','online_security_No internet service','streaming_movies_No internet service']]
y_train5 = train.churn_encoded

X_validate5 = validate[['dependents_encoded', 'phone_service_encoded', 'dependents_Yes','online_security_No internet service','streaming_movies_No internet service']]
y_validate5 = validate.churn_encoded

X_test5 = test[['dependents_encoded', 'phone_service_encoded', 'dependents_Yes','online_security_No internet service','streaming_movies_No internet service']]
y_test5 = test.churn_encoded

In [57]:
clf5 = DecisionTreeClassifier(max_depth=7, random_state=123)

In [58]:
# model.fit(X, y)

clf5 = clf.fit(X_train5, y_train5)

In [59]:
clf5.predict(X_train5)[:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [60]:
clf5.score(X_train5, y_train5)

0.7343154686309372

In [61]:
y_pred5 = clf5.predict(X_train5)

In [62]:
conf5 = confusion_matrix(y_train5, y_pred5)
conf5

array([[2891,    0],
       [1046,    0]])

In [63]:
labels5 = sorted(y_train5.unique())
labels5

[0, 1]

In [64]:
pd.DataFrame(conf5)

Unnamed: 0,0,1
0,2891,0
1,1046,0


In [65]:
print(classification_report(y_train5, y_pred5))

              precision    recall  f1-score   support

           0       0.73      1.00      0.85      2891
           1       0.00      0.00      0.00      1046

    accuracy                           0.73      3937
   macro avg       0.37      0.50      0.42      3937
weighted avg       0.54      0.73      0.62      3937



In [66]:
print(f"Accuracy of Decision Tree on train data is {clf5.score(X_train5, y_train5)}")
print(f"Accuracy of Decision Tree on validate data is {clf5.score(X_validate5, y_validate5)}")

Accuracy of Decision Tree on train data is 0.7343154686309372
Accuracy of Decision Tree on validate data is 0.7340047393364929


> <h2><b>KNN

<h4><b>Top 10 Correlation Features

In [72]:
knn1 = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [73]:
knn1.fit(X_train2, y_train2)

In [74]:
y_pred1 = knn1.predict(X_train2)

In [75]:
y_pred_proba = knn1.predict_proba(X_train2)

In [76]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn1.score(X_train2, y_train2)))
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn1.score(X_validate2, y_validate2)))

Accuracy of KNN classifier on training set: 0.82
Accuracy of KNN classifier on validate set: 0.74


<h4><b>Top 5 Correlation Features

In [77]:
knn2 = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [78]:
knn2.fit(X_train3, y_train3)

In [79]:
y_pred1 = knn2.predict(X_train3)

In [80]:
y_pred_proba = knn2.predict_proba(X_train3)

In [81]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn2.score(X_train3, y_train3)))
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn2.score(X_validate3, y_validate3)))

Accuracy of KNN classifier on training set: 0.81
Accuracy of KNN classifier on validate set: 0.73


<h4><b>Top 5 Features of Select K Best

In [82]:
knn3 = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [83]:
knn3.fit(X_train4, y_train4)

In [84]:
y_pred1 = knn3.predict(X_train4)

In [85]:
y_pred_proba = knn3.predict_proba(X_train4)

In [86]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn3.score(X_train4, y_train4)))
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn3.score(X_validate4, y_validate4)))

Accuracy of KNN classifier on training set: 0.80
Accuracy of KNN classifier on validate set: 0.75


<h4><b>Top 5 Features of RFE

In [87]:
knn4 = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [88]:
knn4.fit(X_train5, y_train5)

In [89]:
y_pred1 = knn4.predict(X_train5)

In [90]:
y_pred_proba = knn4.predict_proba(X_train5)

In [91]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn4.score(X_train5, y_train5)))
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn4.score(X_validate5, y_validate5)))

Accuracy of KNN classifier on training set: 0.61
Accuracy of KNN classifier on validate set: 0.61


> <h2><b>Random Forest

<h4><b>Top 10 Correlation Features

In [98]:
rf1 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=7, 
                            random_state=123)

In [99]:
rf1.fit(X_train2, y_train2)

In [100]:
print(rf1.feature_importances_)

[0.25316276 0.16547026 0.37594088 0.03668024 0.02845953 0.03924768
 0.02342682 0.03190432 0.0328606  0.0128469 ]


In [101]:
y_pred = rf1.predict(X_train2)

In [102]:
y_pred_proba = rf1.predict_proba(X_train2)

In [103]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf1.score(X_train2, y_train2)))
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf1.score(X_validate2, y_validate2)))

Accuracy of random forest classifier on training set: 0.80
Accuracy of random forest classifier on validate set: 0.76


<h4><b>Top 5 Correlation Features

In [104]:
rf2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=7, 
                            random_state=123)

In [105]:
rf2.fit(X_train3, y_train3)

In [106]:
print(rf2.feature_importances_)

[0.22428246 0.17875816 0.54789925 0.0213155  0.02774464]


In [107]:
y_pred = rf2.predict(X_train3)

In [108]:
y_pred_proba = rf2.predict_proba(X_train3)

In [109]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf2.score(X_train3, y_train3)))
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf2.score(X_validate3, y_validate3)))

Accuracy of random forest classifier on training set: 0.79
Accuracy of random forest classifier on validate set: 0.75


<h4><b>Top 5 SelectK Best Features

In [111]:
rf3 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=7, 
                            random_state=123)

In [112]:
rf3.fit(X_train4, y_train4)

In [113]:
print(rf3.feature_importances_)

[0.49379895 0.11985324 0.25117791 0.05290139 0.08226851]


In [114]:
y_pred = rf3.predict(X_train4)

In [115]:
y_pred_proba = rf3.predict_proba(X_train4)

In [116]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf3.score(X_train4, y_train4)))
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf3.score(X_validate4, y_validate4)))

Accuracy of random forest classifier on training set: 0.81
Accuracy of random forest classifier on validate set: 0.78


<h4><b>Top 5 RFE Features

In [117]:
rf4 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=7, 
                            random_state=123)

In [118]:
rf4.fit(X_train5, y_train5)

In [119]:
print(rf4.feature_importances_)

[0.12985691 0.04802041 0.15348593 0.31848735 0.3501494 ]


In [120]:
y_pred = rf4.predict(X_train5)

In [121]:
y_pred_proba = rf4.predict_proba(X_train5)

In [122]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf4.score(X_train5, y_train5)))
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf4.score(X_validate5, y_validate5)))

Accuracy of random forest classifier on training set: 0.73
Accuracy of random forest classifier on validate set: 0.73


<h1>Best Model on Test Data

<b><h2>The Best Model was Random Forest using SelectK Best Top 5 features

> Features Selected were: <b>tenure_scaled</b>, <b>contract_type_Two year</b>, <b>internet_service_type_Fiber optic</b>, <b>internet_service_type_None</b>, <b>payment_type_Electronic check</b>

In [123]:
rf5 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=7, 
                            random_state=123)

In [124]:
rf5.fit(X_train4, y_train4)

In [125]:
print(rf5.feature_importances_)

[0.49379895 0.11985324 0.25117791 0.05290139 0.08226851]


In [126]:
y_pred = rf5.predict(X_train4)

In [127]:
y_pred_proba = rf5.predict_proba(X_train4)

<b><h3>The Results

In [129]:
print('Accuracy of random forest classifier on Training set: {:.2f}'
     .format(rf5.score(X_train4, y_train4)))
print('Accuracy of random forest classifier on Validate set: {:.2f}'
     .format(rf5.score(X_validate4, y_validate4)))
print('Accuracy of random forest classifier on Test set: {:.2f}'
     .format(rf5.score(X_test4, y_test4)))

Accuracy of random forest classifier on Training set: 0.81
Accuracy of random forest classifier on Validate set: 0.78
Accuracy of random forest classifier on Test set: 0.79
