In [2]:
# Tom
# Package Imports

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score, classification_report
from sklearn.dummy import DummyRegressor, DummyClassifier

#New imports for our Pipeline workflows
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

#New imports from imblearn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline





In [3]:
df = pd.read_csv('../data/original_data.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account length          3333 non-null   int64  
 2   area code               3333 non-null   int64  
 3   phone number            3333 non-null   object 
 4   international plan      3333 non-null   object 
 5   voice mail plan         3333 non-null   object 
 6   number vmail messages   3333 non-null   int64  
 7   total day minutes       3333 non-null   float64
 8   total day calls         3333 non-null   int64  
 9   total day charge        3333 non-null   float64
 10  total eve minutes       3333 non-null   float64
 11  total eve calls         3333 non-null   int64  
 12  total eve charge        3333 non-null   float64
 13  total night minutes     3333 non-null   float64
 14  total night calls       3333 non-null   

State, phone number, intl plan, voice mail plan are objects -- need to encode

Account Length - maybe days, how long customer has been with company

6 - 18 Usage Stats = number of calls number of minutes and charge for different time categories.

19 - # of customer service calls

Possible Churn Indicators:

- By State?
- High Customer Service Contact = high churn?
- Low Calls/Low minutes = higher churn?
- Can we identify a customer profile that is likely to churn for targeted marketing/incentives?
- Drop Phone number -- basically a unique identifer for every customer, not likely to help in predictions

- Location
    - State/Area Code
    
- Duration
    - Account Length
    
- Plan Types
    - Intl / Voicemail
    
- Usage Stats
    - minutes
    - num calls
    - charges
    - customer service calls

Drop Phone number

Encoding Process:
    - voicemail plan = labelencoder 1/0
    - international plan = 1/0
    - state - onehot encoding

Scale Numerical Features





In [5]:
# Drop phone number from the data set--in this context, it acts as a unique identifier with little meaningful context.
df.drop('phone number', axis=1, inplace=True)

# Train Test Split

In [6]:
X = df.drop('churn', axis=1)
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2233 entries, 2360 to 3174
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   2233 non-null   object 
 1   account length          2233 non-null   int64  
 2   area code               2233 non-null   int64  
 3   international plan      2233 non-null   object 
 4   voice mail plan         2233 non-null   object 
 5   number vmail messages   2233 non-null   int64  
 6   total day minutes       2233 non-null   float64
 7   total day calls         2233 non-null   int64  
 8   total day charge        2233 non-null   float64
 9   total eve minutes       2233 non-null   float64
 10  total eve calls         2233 non-null   int64  
 11  total eve charge        2233 non-null   float64
 12  total night minutes     2233 non-null   float64
 13  total night calls       2233 non-null   int64  
 14  total night charge      2233 non-null

In [8]:
X_train['voice mail plan'].value_counts()

no     1621
yes     612
Name: voice mail plan, dtype: int64

In [9]:
# Define datatype of columns
num_cols = [1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
cat_cols = [0, 3, 4]

# IMBLearn Pipeline for SMOTE

In [16]:
sm = SMOTE(sampling_strategy=0.75, random_state=42)

In [17]:
# Define functions to identify and select columns based on the datatype stored in that column.
def get_numeric(df):
    return df.select_dtypes(include=['float', 'int'])

def get_categorical(df):
    return df.select_dtypes(include=['bool', 'object'])

# Create transformer objects using our get functions
GetNumeric = FunctionTransformer(get_numeric)
GetCategories = FunctionTransformer(get_categorical)

In [18]:
# Subpipelines to select and scale our numeric data / select and one-hot encode our categorical data.
subpipe_num = Pipeline(steps=[('num', GetNumeric),
                        ('ss', StandardScaler())])
 
subpipe_ohe = Pipeline(steps=[('cat', GetCategories), 
                              ('ohe', OneHotEncoder(drop='if_binary', sparse=False))])

In [19]:
# Create lists of numeric and categorical columns.
num_cols = [1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
cat_cols = [0, 3, 4]

# Create ColumnTransformer object that contains our subpipes for column transformation
CT = ColumnTransformer(transformers=[
                                    ('subpipe_num', subpipe_num, num_cols),
                                    ('subpipe_ohe', subpipe_ohe, cat_cols)]
                      )

In [20]:
# Template, any model can be appended to the end.
template_model_pipe = ImPipeline(steps=[
                                        ('ct', CT),
                                        ('sm', sm),
                                        ('dc', DummyClassifier(strategy='most_frequent', random_state=42))
                                       ]
                                )

In [21]:
template_model_pipe.fit(X_train, y_train)
template_model_pipe.score(X_train, y_train)

0.8553515450067174

# DecisionTree

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
dt_pipeline = ImPipeline(steps=[
                                        ('ct', CT),
                                        ('sm', sm),
                                        ('dt', DecisionTreeClassifier(random_state=42, min_samples_leaf=2))
                                       ]
                                )

In [24]:
dt_pipeline.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num',
                                                                   FunctionTransformer(func=<function get_numeric at 0x7fe0680a1f70>)),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [1, 2, 5, 6, 7, 8, 9, 10, 11,
                                                   12, 13, 14, 15, 16, 17,
                                                   18]),
                                                 ('subpipe_ohe',
                                                  Pipeline(steps=[('cat',
                                                                   FunctionTransformer(func=<function get_categorical at 0x7fe0a825e8b0>)),
                                                  

In [25]:
dt_y_hat = dt_pipeline.predict(X_test)
print(classification_report(y_test, dt_y_hat))

              precision    recall  f1-score   support

       False       0.95      0.95      0.95       940
        True       0.72      0.73      0.73       160

    accuracy                           0.92      1100
   macro avg       0.84      0.84      0.84      1100
weighted avg       0.92      0.92      0.92      1100



In [26]:
f1_score(y_test, dt_y_hat, dt_pipeline, zero_division=0)

                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num',
                                                                   FunctionTransformer(func=<function get_numeric at 0x7fe0680a1f70>)),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [1, 2, 5, 6, 7, 8, 9, 10, 11,
                                                   12, 13, 14, 15, 16, 17,
                                                   18]),
                                                 ('subpipe_ohe',
                                                  Pipeline(steps=[('cat',
                                                                   FunctionTransformer(func=<function get_categorical at 0x7fe0a825e8b0>)),
                                                                  ('ohe',

0.7267080745341615

## Grid Search for best params

In [28]:
params = {
                'dt__criterion' : ['gini', 'entropy'],    #entropy
                'dt__min_samples_leaf' : [3, 4, 5, 6]  # 5 min samples
}    

In [29]:
gs = GridSearchCV(estimator=dt_pipeline, param_grid=params, n_jobs=-4, verbose=3, cv=10)

In [30]:
gs.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-4)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-4)]: Done  18 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-4)]: Done 100 out of 100 | elapsed:    2.0s finished


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num',
                                                                                          FunctionTransformer(func=<function get_numeric at 0x7fe0680a1f70>)),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         [1, 2,
                                                                          5, 6,
                                                                          7, 8,
                                                                          9, 10,
                                                              

In [31]:
gs.best_params_

{'dt__criterion': 'gini', 'dt__min_samples_leaf': 4}

# KNN

In [37]:
from sklearn.neighbors import KNeighborsClassifier

In [38]:
knn_pipeline = ImPipeline(steps=[
                                        ('ct', CT),
                                        ('sm', sm),
                                        ('knn', KNeighborsClassifier())
                                       ]
                                )

In [39]:
knn_pipeline.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num',
                                                                   FunctionTransformer(func=<function get_numeric at 0x7fe1fa8eae50>)),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [1, 2, 5, 6, 7, 8, 9, 10, 11,
                                                   12, 13, 14, 15, 16, 17,
                                                   18]),
                                                 ('subpipe_ohe',
                                                  Pipeline(steps=[('cat',
                                                                   FunctionTransformer(func=<function get_categorical at 0x7fe1fa8eaca0>)),
                                                  

In [40]:
knn_y_hat = knn_pipeline.predict(X_test)
print(classification_report(y_test, knn_y_hat))

              precision    recall  f1-score   support

       False       0.94      0.77      0.85       940
        True       0.34      0.70      0.46       160

    accuracy                           0.76      1100
   macro avg       0.64      0.73      0.65      1100
weighted avg       0.85      0.76      0.79      1100



# Logistic Regression

In [41]:
lr_pipeline = ImPipeline(steps=[
                                        ('ct', CT),
                                        ('sm', sm),
                                        ('lr', LogisticRegression(random_state=42))
                                       ]
                                )

In [42]:
lr_pipeline.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num',
                                                                   FunctionTransformer(func=<function get_numeric at 0x7fe1fa8eae50>)),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [1, 2, 5, 6, 7, 8, 9, 10, 11,
                                                   12, 13, 14, 15, 16, 17,
                                                   18]),
                                                 ('subpipe_ohe',
                                                  Pipeline(steps=[('cat',
                                                                   FunctionTransformer(func=<function get_categorical at 0x7fe1fa8eaca0>)),
                                                  

In [43]:
lr_y_hat = lr_pipeline.predict(X_test)
print(classification_report(y_test, lr_y_hat))

              precision    recall  f1-score   support

       False       0.95      0.78      0.86       940
        True       0.37      0.74      0.49       160

    accuracy                           0.77      1100
   macro avg       0.66      0.76      0.67      1100
weighted avg       0.86      0.77      0.80      1100



# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
rf_pipeline = ImPipeline(steps=[
                                        ('ct', CT),
               #                         ('sm', sm),
                                        ('rf', RandomForestClassifier(random_state=42))
                                       ]
                                )

In [50]:
rf_pipeline.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num',
                                                                   FunctionTransformer(func=<function get_numeric at 0x7fb7bc890790>)),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [1, 2, 5, 6, 7, 8, 9, 10, 11,
                                                   12, 13, 14, 15, 16, 17,
                                                   18]),
                                                 ('subpipe_ohe',
                                                  Pipeline(steps=[('cat',
                                                                   FunctionTransformer(func=<function get_categorical at 0x7fb7bc890b80>)),
                                                  

In [51]:
rf_y_hat = rf_pipeline.predict(X_test)
print(classification_report(y_test, rf_y_hat))

              precision    recall  f1-score   support

       False       0.94      1.00      0.97       940
        True       0.97      0.61      0.75       160

    accuracy                           0.94      1100
   macro avg       0.95      0.80      0.86      1100
weighted avg       0.94      0.94      0.94      1100



# Cleaning Pipeline

In [None]:
# Define functions to identify and select columns based on the datatype stored in that column.
def get_numeric(df):
    return df.select_dtypes(include=['float', 'int'])

def get_categorical(df):
    return df.select_dtypes(include=['bool', 'object'])

# Create transformer objects using our get functions
GetNumeric = FunctionTransformer(get_numeric)
GetCategories = FunctionTransformer(get_categorical)

In [None]:
# Subpipelines to select and scale our numeric data / select and one-hot encode our categorical data.
subpipe_num = Pipeline(steps=[('num', GetNumeric),
                        ('ss', StandardScaler())])
 
subpipe_ohe = Pipeline(steps=[('cat', GetCategories), 
                              ('ohe', OneHotEncoder(sparse=False))])


In [None]:
# Create lists of numeric and categorical columns.
num_cols = [1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
cat_cols = [0, 3, 4]

# Create ColumnTransformer object that contains our subpipes for column transformation
CT = ColumnTransformer(transformers=[
                                    ('subpipe_num', subpipe_num, num_cols),
                                    ('subpipe_ohe', subpipe_ohe, cat_cols)]
                      )

# DummyRegressor Model Score

In [None]:
# Utilize DummyClassifier as our first model, guessing the most frequent value of y for all ys.
dummy_model_pipe = Pipeline(steps=[('ct', CT),
                                   ('dc', DummyClassifier(strategy='most_frequent', random_state=42))
                                  ])

In [None]:
# Fit dummy model on our training data.
dummy_model_pipe.fit(X_train, y_train)

In [None]:
dummy_model_pipe.score(X_train, y_train)

In [None]:
print(classification_report(y_true=y_train, y_pred=dummy_model_pipe.predict(X_train)))