# Minimum Viable Product
- Can using security and support features alone predict if a customer churns or not?

In [1]:
# python files that contain the functions to aquire and prep the data
import acquire
import prepare

# import python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#imports for model object and metric functions
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print('Imports complete.')

Imports complete.


In [2]:
df = acquire.get_churn_data()
df = prepare.prep_telco_df(df)
train, test, validate = prepare.train_test_validate(df)

No duplicates found.
Dummy variables for gender created as "male".
Yes/No column values changed to boolean, 0 as no and 1 as yes
Combined variable for phone_service + multiple lines created.
Combined variable for partner + dependents created.
Changed streaming tv and movies to 0 for no, 1 for streams.
Changed backup and security to 0 for no, 1 for having the feature.
Changed protection and support to 0 for no, 1 for having the feature.
Added feature for tenure in years.
Converted total_charges to float for easier manipulation.
Data prep complete.


train shape:  (5070, 23) , validate shape:  (1268, 23) , test shape:  (705, 23)

train percent:  72.0 , validate percent:  18.0 , test percent:  10.0


In [3]:
train.head(1).T

Unnamed: 0,855
customer_id,6923-EFPNL
senior_citizen,0
tenure_months,4
phone_service,2
internet_service_type_id,1
online_security,0
online_backup,0
device_protection,0
tech_support,0
streaming_tv,0


In [4]:
# our predictor/independent features split into train, test, validate
X_train = train.drop(columns = ['customer_id','churn','contract_type','payment_type','internet_service_type'],axis=1)
X_validate = validate.drop(columns = ['customer_id','churn','contract_type','payment_type','internet_service_type'],axis=1)
X_test = test.drop(columns = ['customer_id','churn','contract_type','payment_type','internet_service_type'],axis=1)

# our target variables split into train, test, validate
y_train = train.churn
y_validate = validate.churn
y_test = test.churn

# Creating the baseline
- will use as the comparison for the model created

In [5]:
# most of our customers are still with the company (0 = has not churned)
y_train.value_counts()

0    3720
1    1350
Name: churn, dtype: int64

In [6]:
baseline = pd.DataFrame(y_train)
baseline['baseline'] = 0

baseline.columns = ['actual','baseline']
baseline.head()

Unnamed: 0,actual,baseline
855,1,0
2823,1,0
5656,0,0
1016,0,0
1175,0,0


In [7]:
# cross tab of our baseline versus actual
pd.crosstab(baseline['baseline'], baseline['actual'])

actual,0,1
baseline,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3720,1350


In [8]:
# let's calculate the accuracy
# positive will be not churned
# (TP + TN) / (TP + TN + FP + FN)

#predicting not churned and the customer has not churned
TP = 3720

#predicting not churned and the customer has churned
FP = 1350

#predicting the customer has churned and they have churned
TN = 0

#predicting the customer has churned and they have not churned
FN = 0

base_acc = (TP + TN) / (TP + TN + FP + FN)

print("The baseline model is",round(base_acc * 100, 2),"percent.")

The baseline model is 73.37 percent.


In [9]:
clf = DecisionTreeClassifier()

clf.fit(X_train, y_train)

y_pred = clf.predict(X_train)
y_pred_proba = clf.predict_proba(X_train)

#evaluate metrics
print('Accuracy of model with all features:', clf.score(X_train, y_train))
print('\nThe confusion matrix:\n',confusion_matrix(y_train, y_pred))
print('\nClassification report:\n',classification_report(y_train, y_pred))

Accuracy of model with all features: 0.9976331360946745

The confusion matrix:
 [[3720    0]
 [  12 1338]]

Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3720
           1       1.00      0.99      1.00      1350

    accuracy                           1.00      5070
   macro avg       1.00      1.00      1.00      5070
weighted avg       1.00      1.00      1.00      5070



# Using all features is probably overfitting to the train. Let's narrow it down to...
- tech support
- device protection
- online security
- online backup
    - these are all feautres for internet only, but I feel could be added as on option for phone service as well

In [10]:
# our predictor/independent features split into train, test, validate
X_train = train[['tech_support','online_security','device_protection','online_backup']]
X_validate = validate[['tech_support','online_security','device_protection','online_backup']]
X_test = test[['tech_support','online_security','device_protection','online_backup']]

y_train = train.churn
y_validate = validate.churn
y_test = test.churn

In [11]:
clf = DecisionTreeClassifier()

clf.fit(X_train, y_train)

y_pred = clf.predict(X_train)
y_pred_proba = clf.predict_proba(X_train)

#evaluate metrics
print('Accuracy of Decision Tree model with support/security/protection:', clf.score(X_train, y_train))
print('\nThe confusion matrix:\n',confusion_matrix(y_train, y_pred))
print('\nClassification report:\n',classification_report(y_train, y_pred))

Accuracy of Decision Tree model with support/security/protection: 0.7337278106508875

The confusion matrix:
 [[3720    0]
 [1350    0]]

Classification report:
               precision    recall  f1-score   support

           0       0.73      1.00      0.85      3720
           1       0.00      0.00      0.00      1350

    accuracy                           0.73      5070
   macro avg       0.37      0.50      0.42      5070
weighted avg       0.54      0.73      0.62      5070



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
rf = RandomForestClassifier()

rf.fit(X_train, y_train)

y_pred = rf.predict(X_train)

y_pred_proba = rf.predict_proba(X_train)

print('Accuracy of Random Forest model with support/security/protection:',rf.score(X_train, y_train))
print('\nThe confusion matrix:\n',confusion_matrix(y_train, y_pred))
print('\nClassification report:\n',classification_report(y_train, y_pred))

Accuracy of Random Forest model with support/security/protection: 0.7337278106508875

The confusion matrix:
 [[3720    0]
 [1350    0]]

Classification report:
               precision    recall  f1-score   support

           0       0.73      1.00      0.85      3720
           1       0.00      0.00      0.00      1350

    accuracy                           0.73      5070
   macro avg       0.37      0.50      0.42      5070
weighted avg       0.54      0.73      0.62      5070



  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
knn = KNeighborsClassifier(n_neighbors=10)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_train)

y_pred_proba = knn.predict_proba(X_train)

print('Accuracy of KNN model with support/security/protection:',knn.score(X_train, y_train))
print('\nThe confusion matrix:\n',confusion_matrix(y_train, y_pred))
print('\nClassification report:\n',classification_report(y_train, y_pred))

Accuracy of KNN model with support/security/protection: 0.7199211045364892

The confusion matrix:
 [[3560  160]
 [1260   90]]

Classification report:
               precision    recall  f1-score   support

           0       0.74      0.96      0.83      3720
           1       0.36      0.07      0.11      1350

    accuracy                           0.72      5070
   macro avg       0.55      0.51      0.47      5070
weighted avg       0.64      0.72      0.64      5070



# Takeaways
- online features are not enough to predict churn
- DF and RF overfit, only predict positive
- KNN overfits if k > 13, and accuracy is still less than baseline
- let's add more features

In [14]:
# our predictor/independent features split into train, test, validate
X_train = train[['streaming_movies','streaming_tv','tech_support','online_security','device_protection','online_backup']]
X_validate = validate[['streaming_movies','streaming_tv','tech_support','online_security','device_protection','online_backup']]
X_test = test[['streaming_movies','streaming_tv','tech_support','online_security','device_protection','online_backup']]

y_train = train.churn
y_validate = validate.churn
y_test = test.churn

In [15]:
clf = DecisionTreeClassifier(max_depth = 5)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_train)
y_pred_proba = clf.predict_proba(X_train)

#evaluate metrics
print('Accuracy of Decision Tree model with support/security/protection/streaming on Train:', clf.score(X_train, y_train))
print('\nThe confusion matrix:\n',confusion_matrix(y_train, y_pred))
print('\nClassification report:\n',classification_report(y_train, y_pred))

Accuracy of Decision Tree model with support/security/protection/streaming on Train: 0.7512820512820513

The confusion matrix:
 [[3548  172]
 [1089  261]]

Classification report:
               precision    recall  f1-score   support

           0       0.77      0.95      0.85      3720
           1       0.60      0.19      0.29      1350

    accuracy                           0.75      5070
   macro avg       0.68      0.57      0.57      5070
weighted avg       0.72      0.75      0.70      5070



In [16]:
rf = RandomForestClassifier(max_depth=5)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_train)

y_pred_proba = rf.predict_proba(X_train)

print('Accuracy of Random Forest model with support/security/protection/streaming on Train:',rf.score(X_train, y_train))
print('\nThe confusion matrix:\n',confusion_matrix(y_train, y_pred))
print('\nClassification report:\n',classification_report(y_train, y_pred))

Accuracy of Random Forest model with support/security/protection/streaming on Train: 0.7510848126232742

The confusion matrix:
 [[3559  161]
 [1101  249]]

Classification report:
               precision    recall  f1-score   support

           0       0.76      0.96      0.85      3720
           1       0.61      0.18      0.28      1350

    accuracy                           0.75      5070
   macro avg       0.69      0.57      0.57      5070
weighted avg       0.72      0.75      0.70      5070



In [17]:
knn = KNeighborsClassifier(n_neighbors=10)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_train)

y_pred_proba = knn.predict_proba(X_train)

print('Accuracy of KNN model with support/security/protection/streaming on Train:',knn.score(X_train, y_train))
print('\nThe confusion matrix:\n',confusion_matrix(y_train, y_pred))
print('\nClassification report:\n',classification_report(y_train, y_pred))

Accuracy of KNN model with support/security/protection/streaming on Train: 0.7406311637080868

The confusion matrix:
 [[3643   77]
 [1238  112]]

Classification report:
               precision    recall  f1-score   support

           0       0.75      0.98      0.85      3720
           1       0.59      0.08      0.15      1350

    accuracy                           0.74      5070
   macro avg       0.67      0.53      0.50      5070
weighted avg       0.71      0.74      0.66      5070



# Takeaways
- model appears to work with additional features of streaming tv and streaming movies
- tuning parameters with validate

### Decision Tree

In [18]:
#y_pred = clf.predict(X_validate)
#y_pred_proba = clf.predict_proba(X_validate)

#evaluate metrics
print('Accuracy of Decision Tree model with support/security/protection/streaming on Validate:', clf.score(X_train, y_train))
#print('\nThe confusion matrix:\n',confusion_matrix(y_validate, y_pred))
#print('\nClassification report:\n',classification_report(y_validate, y_pred))

Accuracy of Decision Tree model with support/security/protection/streaming on Validate: 0.7512820512820513


### Random Forest

In [19]:
#y_pred = rf.predict(X_validate)

#y_pred_proba = rf.predict_proba(X_validate)

print('Accuracy of Random Forest model with support/security/protection/streaming on Validate:',rf.score(X_validate, y_validate))
#print('\nThe confusion matrix:\n',confusion_matrix(y_validate, y_pred))
#print('\nClassification report:\n',classification_report(y_validate, y_pred))

Accuracy of Random Forest model with support/security/protection/streaming on Validate: 0.7687943262411348


### KNN

In [20]:
#y_pred = knn.predict(X_validate)

#y_pred_proba = knn.predict_proba(X_validate)

print('Accuracy of KNN model with support/security/protection/streaming on Validate:',knn.score(X_validate, y_validate))
#print('\nThe confusion matrix:\n',confusion_matrix(y_validate, y_pred))
#print('\nClassification report:\n',classification_report(y_validate, y_pred))

Accuracy of KNN model with support/security/protection/streaming on Validate: 0.7432624113475177


# Takeaways
- The Random Forest is the most accurate on the train and validate data sets
- But is it over fitting? Check with the test data

In [21]:
print('Accuracy of Random Forest model with support/security/protection/streaming on Test:',rf.score(X_test, y_test))

Accuracy of Random Forest model with support/security/protection/streaming on Test: 0.7586750788643533
