In [1]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import pandas as pd
import numpy as np
import env
import acquire
import prepare

## Project Planning

- Data dictionary

**internet_service_type_id:** type of internet service a customer has with the options of DSL = 1, Fiber Optic = 2, None = 3 

**payment_type_id:** type of payment setup for a customer, with the options of Electronic Check = 1, Mailed Check = 2, Bank Transfer = 3, Credit Card = 4

**contract_type_id:** type of contract selected by a customer with the options of Month-to-Month = 1, One Year = 2, Two Year = 3

**customer_id:** unique identifier for each customer having 10 characters

**gender:** whether a customer is female or male; after label encoding (classes are sorted alphabetically) female = 0 and male = 1

**senior_citizen:** whether a customer is a senior or not with not being a senior = 0 and being a senior = 1 

**partner:** whether a customer has a partner or not, with not having a partner = 0 and having a partner = 1

**dependents:** whether a customer has any dependents or not, with not having any dependents = 0 and having a dependent = 1

**tenure:** number of months a customer was, or continues to be, a customer

**tenure_years:** number of years a customer was, or continues to be, a customer. A created feature that rounds the tenure in months up to the next full year

**phone_service:** whether a customer has phone service or not with not having any phone service = 0 and having phone service = 1

**multiple_lines:** whether a customer has multiple phone lines or not with not having multiple phone lines = 0 and having multiple phone lines = 1

**online_security:** whether a customer has online security or not with not having any online security = 0 and having online security = 1

**online_backup:** whether a customer has online backup or not with not having any online backup = 0 and having online backup = 1

**device_protection:** whether a customer has device protection or not, with not having any device protection = 0 and having device protection = 1

**tech_support:** whether a customer has tech support or not with not having any tech support = 0 and having tech support = 1

**streaming_tv:** whether a customer has streaming tv service or not with not having any streaming tv service = 0 and having streaming tv service = 1

**streaming_movies:** whether a customer has streaming movies service or not with not having any streaming movies service = 0 and having streaming movies service = 1

**paperless_billing:** whether a customer has paperless billing or not, with not having paperless billing active = 0 and having paperless billing active = 1

**monthly_charges:** amount of charges per month

**total_charges:** amount of charges over full tenure as customer

**churn:** whether or not a customer has churned (left the company), with not churned = 0 and churned = 1

## Acquire

- Acquire data from the customers table from the telco_churn database on the codeup data science database server

In [2]:
# Load the telco customer data through the project's acquire module
df = acquire.get_telco_data()

- General churn rate

In [3]:
# Count customers per churn label and display the counts as a one-column table
basic_churn = df.churn.value_counts()
basic_churn_df = basic_churn.to_frame()
basic_churn_df

Unnamed: 0,churn
No,5174
Yes,1869


In [4]:
# Baseline churn rate: the share of all customers whose churn label is "Yes".
# Computed from the data rather than from hand-copied counts, so the value
# stays correct if the underlying table changes.
churned_count = int((df.churn == "Yes").sum())
churn_rate_bl = churned_count / len(df)
churn_rate_bl

0.2653698707936959

**Note:** If I were to simply guess that no customer would churn, I would be correct about 73% of the time (5174 of 7043 customers did not churn). This is the baseline accuracy a model has to beat.

## Prep Baseline Data

In [5]:
# Inspect the dtype of every column.
# Note: total_charges comes back as object rather than a numeric type,
# so it has to be converted before it can be used as a number.
df.dtypes

internet_service_type_id      int64
payment_type_id               int64
contract_type_id              int64
customer_id                  object
gender                       object
senior_citizen                int64
partner                      object
dependents                   object
tenure                        int64
phone_service                object
multiple_lines               object
online_security              object
online_backup                object
device_protection            object
tech_support                 object
streaming_tv                 object
streaming_movies             object
paperless_billing            object
monthly_charges             float64
total_charges                object
churn                        object
contract_type                object
payment_type                 object
internet_service_type        object
dtype: object

In [6]:
# Clean the data: prepare.clean_data is expected to convert total_charges to
# float and drop null/NaN rows (implementation lives in prepare.py).
# NOTE(review): df is overwritten here, so the uncleaned frame is no longer
# available after this cell and the cell should not be run twice on stale state.
df = prepare.clean_data(df)

In [27]:
# Encode the churn column for the entire dataframe, before any train/test split.
# NOTE(review): prepare.encoder is defined in prepare.py; presumably it maps
# No/Yes to 0/1 - confirm against that module.
df = prepare.encoder(df)

**Task:** I need to select which variables to use, pick a model, and fit it to the data

- Split my data into an 80% train / 20% test split, using random state = 123
- Drop non-numeric columns

In [47]:
# split_my_data_bl() is defined in prepare.py. As described for this notebook, it:

# 1. Drops the non-numeric columns: "customer_id", "gender", "partner", "dependents",
#    "phone_service", "multiple_lines", "online_security", "online_backup",
#    "device_protection", "tech_support", "streaming_tv", "streaming_movies",
#    "paperless_billing", "contract_type", "payment_type", "internet_service_type"

# 2. Splits the data into X (the feature columns) and y (the target), and then
#    into train and test dataframes

X, y, X_train, X_test, y_train, y_test = prepare.split_my_data_bl(df)

## Model Baseline

- Use a Decision Tree Model with the criterion='gini', max depth = 3, and random state = 123

In [22]:
# Baseline model: a shallow (depth 3) gini decision tree with a fixed seed so
# the fit is repeatable. Fitting on the training split only.
clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=123)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [24]:
# Predict churn labels for the training set (in-sample predictions).
# The bare `y_pred` on the last line is what displays the array; an assignment
# alone produces no cell output.
y_pred = clf.predict(X_train)
y_pred

array([0, 0, 1, ..., 0, 0, 1])

In [37]:
# Predicted class probabilities for the training set: one row per customer,
# one column per class (each row sums to 1).
# NOTE(review): columns follow clf.classes_ order, presumably
# [P(no churn), P(churn)] - confirm.

y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

array([[0.91489362, 0.08510638],
       [0.91489362, 0.08510638],
       [0.30799476, 0.69200524],
       ...,
       [0.92966002, 0.07033998],
       [0.59355509, 0.40644491],
       [0.30799476, 0.69200524]])

- Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [28]:
# Report in-sample (training set) accuracy, rounded to two decimals
train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Decision Tree classifier on training set: 0.79


In [29]:
# In-sample confusion matrix: rows are the actual classes, columns the predicted
# classes (row totals match the per-class support in the classification report)
confusion_matrix(y_train, y_pred)


array([[3898,  235],
       [ 964,  528]])

In [17]:
# Per-class precision, recall and F1 for the in-sample predictions
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.94      0.87      4133
           1       0.69      0.35      0.47      1492

    accuracy                           0.79      5625
   macro avg       0.75      0.65      0.67      5625
weighted avg       0.77      0.79      0.76      5625



## Explore

1. Could the month in which they signed up influence churn? i.e. if a cohort is identified by tenure, is there a cohort or cohorts who have a higher rate of churn than other cohorts? (Plot the rate of churn on a line chart where x is the tenure and y is the rate of churn (customers churned/total customers))

In [38]:
def encoder_all(df):
    """Return a label-encoded copy of ``df``.

    The 'contract_type', 'internet_service_type' and 'payment_type' columns
    are dropped, and each categorical column listed below is replaced by the
    integer codes that ``LabelEncoder`` assigns to its values. The frame
    passed in is not modified; a new DataFrame is returned.
    """
    columns_to_encode = [
        'gender', 'partner', 'dependents', 'phone_service', 'multiple_lines',
        'online_security', 'online_backup', 'device_protection', 'tech_support',
        'streaming_tv', 'streaming_movies', 'paperless_billing', 'churn',
    ]
    trimmed = df.drop(columns=['contract_type', 'internet_service_type', 'payment_type'])
    # Every column is fitted independently, so each one gets its own code mapping
    encoded_columns = {
        name: LabelEncoder().fit_transform(trimmed[name])
        for name in columns_to_encode
    }
    return trimmed.assign(**encoded_columns)

In [48]:
# Encode every categorical column, then summarise the resulting columns and dtypes.
# The explicit .info() call is what produces the summary shown below the cell.
df_encoded = encoder_all(df)
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
internet_service_type_id    7032 non-null int64
payment_type_id             7032 non-null int64
contract_type_id            7032 non-null int64
customer_id                 7032 non-null object
gender                      7032 non-null int64
senior_citizen              7032 non-null int64
partner                     7032 non-null int64
dependents                  7032 non-null int64
tenure                      7032 non-null int64
phone_service               7032 non-null int64
multiple_lines              7032 non-null int64
online_security             7032 non-null int64
online_backup               7032 non-null int64
device_protection           7032 non-null int64
tech_support                7032 non-null int64
streaming_tv                7032 non-null int64
streaming_movies            7032 non-null int64
paperless_billing           7032 non-null int64
monthly_charges             

2. Are there features that indicate a higher propensity to churn? like type of internet service, type of phone service, online security and backup, senior citizens, paying more than x% of customers with the same services, etc.?

3. Is there a price threshold for specific services where the likelihood of churn increases once price for those services goes past that point? If so, what is that point for what service(s)?

4. If we looked at churn rate for month-to-month customers after the 12th month and that of 1-year contract customers after the 12th month, are those rates comparable?