In [1]:
# Tom
# Package Imports

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score, classification_report
from sklearn.dummy import DummyRegressor, DummyClassifier

#New imports for our Pipeline workflows
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

#New imports from imblearn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline





In [2]:
# Load the raw customer data set from the project's data directory.
df = pd.read_csv('../data/original_data.csv')

In [3]:
# Inspect column names, dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account length          3333 non-null   int64  
 2   area code               3333 non-null   int64  
 3   phone number            3333 non-null   object 
 4   international plan      3333 non-null   object 
 5   voice mail plan         3333 non-null   object 
 6   number vmail messages   3333 non-null   int64  
 7   total day minutes       3333 non-null   float64
 8   total day calls         3333 non-null   int64  
 9   total day charge        3333 non-null   float64
 10  total eve minutes       3333 non-null   float64
 11  total eve calls         3333 non-null   int64  
 12  total eve charge        3333 non-null   float64
 13  total night minutes     3333 non-null   float64
 14  total night calls       3333 non-null   

State, phone number, intl plan, voice mail plan are objects -- need to encode

Account Length - maybe days, how long customer has been with company

6 - 18 Usage Stats = number of calls, number of minutes, and charge for the different time categories.

19 - # of customer service calls

Possible Churn Indicators:

- By State?
- High Customer Service Contact = high churn?
- Low Calls/Low minutes = higher churn?
- Can we identify a customer profile that is likely to churn for targeted marketing/incentives?
- Drop Phone number -- basically a unique identifier for every customer, not likely to help in predictions

- Location
    - State/Area Code
    
- Duration
    - Account Length
    
- Plan Types
    - Intl / Voicemail
    
- Usage Stats
    - minutes
    - num calls
    - charges
    - customer service calls

Drop Phone number

Encoding Process:
    - voicemail plan = labelencoder 1/0
    - international plan = 1/0
    - state - onehot encoding

Scale Numerical Features





In [4]:
# Drop phone number from the data set--in this context, it acts as a unique identifier with little meaningful context.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
df = df.drop(columns='phone number', errors='ignore')

# Train Test Split

In [5]:
# Separate the predictors from the churn target.
X, y = df.drop(columns='churn'), df['churn']

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [6]:
# Confirm the shape and dtypes of the training predictors after the split.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2233 entries, 2360 to 3174
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   2233 non-null   object 
 1   account length          2233 non-null   int64  
 2   area code               2233 non-null   int64  
 3   international plan      2233 non-null   object 
 4   voice mail plan         2233 non-null   object 
 5   number vmail messages   2233 non-null   int64  
 6   total day minutes       2233 non-null   float64
 7   total day calls         2233 non-null   int64  
 8   total day charge        2233 non-null   float64
 9   total eve minutes       2233 non-null   float64
 10  total eve calls         2233 non-null   int64  
 11  total eve charge        2233 non-null   float64
 12  total night minutes     2233 non-null   float64
 13  total night calls       2233 non-null   int64  
 14  total night charge      2233 non-null

In [7]:
# Frequency of each voice mail plan value in the training split.
X_train['voice mail plan'].value_counts()

no     1621
yes     612
Name: voice mail plan, dtype: int64

In [8]:
# Positional indices of the feature columns in X, grouped by datatype.
cat_cols = [0, 3, 4]              # state, international plan, voice mail plan
num_cols = [1, 2, *range(5, 19)]  # every remaining column position (1, 2, 5..18)

# IMBLearn Pipeline for SMOTE

In [9]:
from imblearn.over_sampling import SMOTENC

In [10]:
# Oversampler for mixed numeric/categorical data; cat_cols gives the positional
# indices of the categorical columns in the feature matrix it is handed.
sm = SMOTENC(
    categorical_features=cat_cols,
    random_state=42,
    sampling_strategy='auto',
)

In [11]:
#X_clean_resmp, y_clean_resmp = sm.fit_resample(X_train, y_train)

In [12]:
#y_clean_resmp.value_counts()

In [13]:
# Define functions to identify and select columns based on the datatype stored in that column.
def get_numeric(df):
    """Return the float- and int-typed columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        The subset of columns selected by ``select_dtypes`` for 'float' and 'int'.
    """
    numeric_kinds = ['float', 'int']
    return df.select_dtypes(include=numeric_kinds)

def get_categorical(df):
    """Return the bool- and object-typed columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        The subset of columns selected by ``select_dtypes`` for 'bool' and 'object'.
    """
    categorical_kinds = ['bool', 'object']
    return df.select_dtypes(include=categorical_kinds)

# Wrap the selector functions in FunctionTransformer so they can act as pipeline steps.
GetCategories = FunctionTransformer(func=get_categorical)
GetNumeric = FunctionTransformer(func=get_numeric)

In [14]:
# Numeric branch: keep the numeric columns, then standardise them.
subpipe_num = Pipeline([
    ('num', GetNumeric),
    ('ss', StandardScaler()),
])

# Categorical branch: keep the bool/object columns, then one-hot encode them
# into a dense array.
subpipe_ohe = Pipeline([
    ('cat', GetCategories),
    ('ohe', OneHotEncoder(sparse=False)),
])

In [15]:
# Positional indices of the categorical and numeric feature columns.
cat_cols = [0, 3, 4]
num_cols = [1, 2, *range(5, 19)]

# Route each column group through its matching sub-pipeline.
CT = ColumnTransformer([
    ('subpipe_num', subpipe_num, num_cols),
    ('subpipe_ohe', subpipe_ohe, cat_cols),
])

In [16]:
# Template, any model can be appended to the end.
# The resampling step receives the ColumnTransformer output, in which every column
# is numeric (scaled features first, one-hot dummies after). The raw-frame indices
# in cat_cols ([0, 3, 4]) would point at scaled numeric columns there, so the
# SMOTENC instance `sm` is not suitable at this position; plain SMOTE is used.
template_model_pipe = ImPipeline(steps=[
    ('ct', CT),
    ('sm', SMOTE(sampling_strategy='auto', random_state=42)),
    ('dc', DummyClassifier(strategy='most_frequent', random_state=42)),
])

In [17]:
# Fit the template pipeline, then report its accuracy on the same training split.
template_model_pipe.fit(X_train, y_train).score(X_train, y_train)

0.8553515450067174

# DecisionTree

# KNN

# Logistic Regression

# Random Forest

# Cleaning Pipeline

In [None]:
# Define functions to identify and select columns based on the datatype stored in that column.
def get_numeric(df):
    """Return the float- and int-typed columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        The subset of columns selected by ``select_dtypes`` for 'float' and 'int'.
    """
    numeric_kinds = ['float', 'int']
    return df.select_dtypes(include=numeric_kinds)

def get_categorical(df):
    """Return the bool- and object-typed columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        The subset of columns selected by ``select_dtypes`` for 'bool' and 'object'.
    """
    categorical_kinds = ['bool', 'object']
    return df.select_dtypes(include=categorical_kinds)

# Wrap the selector functions in FunctionTransformer so they can act as pipeline steps.
GetCategories = FunctionTransformer(func=get_categorical)
GetNumeric = FunctionTransformer(func=get_numeric)

In [None]:
# Numeric branch: keep the numeric columns, then standardise them.
subpipe_num = Pipeline([
    ('num', GetNumeric),
    ('ss', StandardScaler()),
])

# Categorical branch: keep the bool/object columns, then one-hot encode them
# into a dense array.
subpipe_ohe = Pipeline([
    ('cat', GetCategories),
    ('ohe', OneHotEncoder(sparse=False)),
])


In [None]:
# Positional indices of the categorical and numeric feature columns.
cat_cols = [0, 3, 4]
num_cols = [1, 2, *range(5, 19)]

# Route each column group through its matching sub-pipeline.
CT = ColumnTransformer([
    ('subpipe_num', subpipe_num, num_cols),
    ('subpipe_ohe', subpipe_ohe, cat_cols),
])

# DummyClassifier Model Score

In [None]:
# Baseline model: column preprocessing followed by a classifier that always
# predicts the most frequent value of y.
dummy_model_pipe = Pipeline([
    ('ct', CT),
    ('dc', DummyClassifier(random_state=42, strategy='most_frequent')),
])

In [None]:
# Fit the dummy baseline pipeline (preprocessing + classifier) on the training split.
dummy_model_pipe.fit(X_train, y_train)

In [None]:
# Score of the dummy baseline on the training split it was fitted on.
dummy_model_pipe.score(X_train, y_train)

In [None]:
# Per-class metrics of the dummy baseline, evaluated on the training split.
print(
    classification_report(
        y_train,
        dummy_model_pipe.predict(X_train),
    )
)

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Class counts of the target; the target variable is pretty imbalanced.
df['churn'].value_counts()

In [None]:
# Number of churned and retained customers.
ch_true = (df["churn"] == True).sum()
ch_false = (df["churn"] == False).sum()

# Share of customers that churned, relative to all non-null churn entries.
avg_churn = ch_true / df['churn'].count()

In [None]:
# Display the overall churn rate computed in the previous cell.
avg_churn

In [None]:
# Drop phone number because it is unique per record.
# errors='ignore' prevents a KeyError when the column was already removed by the
# earlier drop cell, so the notebook survives a top-to-bottom run.
df = df.drop(columns=['phone number'], errors='ignore')
df.shape

In [None]:
# Re-check missing values per column after dropping the phone number.
df.isnull().sum()

In [None]:
# Keep a handle on the state column of the working frame.
df_state = df['state']

In [None]:
# Names of the bool/object columns, in frame order.
df_cat = [col for col, dtype in df.dtypes.items() if dtype in ['bool', 'object']]

In [None]:
# Names of the int64/float64 columns, in frame order.
df_num = [col for col, dtype in df.dtypes.items() if dtype in ['int64', 'float64']]

In [None]:
# Show the list of categorical column names.
df_cat

In [None]:
# Show the list of numeric column names.
df_num

In [None]:
# Should train/test split first then transform after.

In [None]:
# Integer-encode every column listed in df_cat, overwriting it in df.
# LabelEncoder comes from sklearn.preprocessing (imported in the top import cell).
# fit_transform re-fits the encoder on each column before encoding it, so `le`
# ends up holding the classes of the last column processed.
le = LabelEncoder()
for col in df_cat:
    df[col] = le.fit_transform(df[col])

In [None]:
# Preview the frame after label encoding.
df.head()

In [None]:
# Check the column dtypes after label encoding.
df.info()

In [None]:
# Frequency of each international plan value.
df['international plan'].value_counts() 

In [None]:
# Encoded target: 0 = False, 1 = True. The classes are imbalanced, which we will need to address.
df['churn'].value_counts()

In [None]:
# Preview the first rows of the training predictors.
X_train.head()

In [None]:
# Standard Scaler
# Only numeric columns can be standardised; fitting on the full frame fails on the
# string columns (state, international plan, voice mail plan).
ss = StandardScaler()
numeric_features = X_train.select_dtypes(include='number').columns
scaled_values = ss.fit_transform(X_train[numeric_features], y_train)

# Write the scaled values into a copy, column by column. X_train itself is left
# untouched so it stays on the same (raw) scale as X_test for later cells.
X_train_scaled = X_train.copy()
for position, col in enumerate(numeric_features):
    X_train_scaled[col] = scaled_values[:, position]


In [None]:
# Fit the scaler on the numeric columns only; the string columns in X_train
# cannot be converted to float and would make the fit raise.
ss.fit(X_train.select_dtypes(include='number'), y_train)

In [None]:
## Scaling w/ column headers: standardise the numeric columns and wrap the
## resulting array in a DataFrame so the column names and row index are kept
## (the following cells call .head() / .info() on the result).
numeric_train = X_train.select_dtypes(include='number')
X_train_scaled = pd.DataFrame(
    ss.fit_transform(numeric_train),
    columns=numeric_train.columns,
    index=numeric_train.index,
)


In [None]:
# Preview the scaled training data.
# NOTE(review): .head() needs X_train_scaled to be a DataFrame; the bare result of
# StandardScaler.transform is a NumPy array, which has no .head().
X_train_scaled.head()

In [None]:
# Column summary of the scaled training data.
# NOTE(review): .info() likewise needs X_train_scaled to be a DataFrame, not a NumPy array.
X_train_scaled.info()

In [None]:
# Class counts of the target in the training split.
y_train.value_counts()

In [None]:
# Class counts of the target in the test split.
y_test.value_counts()

In [None]:


from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier



from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score, plot_roc_curve
from sklearn.tree import export_graphviz

In [None]:
# Baseline regressor with default settings, fitted on the training split.
# NOTE(review): churn is a class label, so the DummyClassifier baseline is
# presumably the more meaningful reference point — confirm before reporting this.
dr = DummyRegressor()
dr.fit(X_train, y_train)


In [None]:
# Score of the dummy regressor on the held-out test split.
dr.score(X_test, y_test)

In [None]:
# Baseline classifier with default settings, fitted on the training split.
# NOTE(review): no strategy or random_state is passed here, unlike the
# DummyClassifier used in the pipelines above — confirm this is intended.
dc = DummyClassifier()
dc.fit(X_train, y_train)

In [None]:
# Score the dummy *classifier* fitted in the previous cell on the test split
# (this cell previously re-scored the regressor `dr`).
dc.score(X_test, y_test)

# Logistic Regression

In [None]:
# Logistic regression behind the shared ColumnTransformer: the raw splits still
# contain string columns, which LogisticRegression cannot fit on directly, so the
# features are scaled / one-hot encoded by CT first. `lr` keeps the usual
# fit / predict interface.
lr = Pipeline(steps=[
    ('ct', CT),
    ('lr', LogisticRegression()),
])
lr.fit(X_train, y_train)

# Predicted churn labels for the held-out test split.
lrpred = lr.predict(X_test)

In [None]:
# Sum of the predicted labels, i.e. how many test rows were predicted as churn.
lrpred.sum()

# KNN

# DT Classifier