In [71]:
# Import dependencies
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Function "addition" took 0.05387212400091812 seconds to complete.


In [72]:
""" A simple decorator that times the duration of a function's execution. More info on Decorators at https://pythonconquerstheuniverse.wordpress.com/2009/08/06/introduction-to-python-decorators-part-1/"""
import timeit

def timer(function):
  def new_function():
    start_time = timeit.default_timer()
    function()
    elapsed = timeit.default_timer() - start_time
    print('Function "{name}" took {time} seconds to complete.'.format(name=function.__name__, time=elapsed))
  return new_function()

@timer
def addition():
  total = 0
  for i in range(0,1000000):
    total += i
  return total

Function "addition" took 0.05265680900265579 seconds to complete.


In [None]:
# Import Data
main_file_path = '../data/data_customer.csv'
data = pd.read_csv(main_file_path)

print(data.shape)
data.head()

### Data Preprocessing

In [13]:
# Display head to get an idea of columns and data type
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [14]:
# Display all column headers
data.columns.values

array(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges', 'Churn'], dtype=object)

In [15]:
# Display Data types of each column
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [16]:
# Variable TotalCharges should be a continuous numeric data type rather than a discrete object data type
data.TotalCharges = pd.to_numeric(data.TotalCharges, errors='coerce')

# Check for NULL entries
data.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [95]:
# Remove all entries that contain NULL
data.dropna(inplace=True)

# Remove customer ID
df = data.iloc[:,1:]

# Upsample
#https://chrisalbon.com/machine_learning/preprocessing_structured_data/handling_imbalanced_classes_with_upsampling/
# Indicies of each class' observations
i_class0 = np.where(df.Churn == 'Yes')[0]
i_class1 = np.where(df.Churn == 'No')[0]

# Number of observations in each class
n0 = len(i_class0)
n1 = len(i_class1)

# For every observation in class 1, randomly sample from class 0 with replacement
i_class0_upsampled = np.random.choice(i_class0, size=n1, replace=True)

# Join together class 0's upsampled target vector with class 1's target vector
i_upsampled = np.concatenate((i_class0_upsampled, i_class1))
df_upsampled = df.iloc[i_upsampled.tolist(),:]

# Separate X and Y
ydata = df_upsampled.Churn
xdata_raw = df_upsampled.drop(['Churn'], axis=1)

# Convert Chrun into binary numeric variable
ydata.replace(to_replace='Yes', value=1, inplace=True)
ydata.replace(to_replace='No', value=0, inplace=True)

# Convert all categorial variables in xdata into dummy variables
xdata = pd.get_dummies(xdata_raw)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


# Scale all features from 0 to 1
from sklearn.preprocessing import MinMaxScaler
features = xdata.columns.values
scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(xdata)
xdata = pd.DataFrame(scaler.transform(xdata))
xdata.columns = features

xdata.head()


### Predictive Models

In [96]:
# Split data into test and training sets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 
X_train, X_test, y_train, y_test = train_test_split(xdata, ydata, test_size=0.3, random_state=101)


#### - Decision Tree
Learning curve: # Train samples
Complexity curve: Max_depth

In [97]:
from sklearn.tree import DecisionTreeClassifier  

def decision_tree(X_train, X_test, y_train, y_test, max_depth):
    dtree = DecisionTreeClassifier(max_depth=5)  
    dtree.fit(X_train, y_train) 
    y_pred = dtree.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [98]:
# Learning curve based off of number of Samples in training set
accuracy = []
precentages = [.1,.2,.3,.4,.5,.6,.7,.8,.9]
for precent in precentages:
    X_train, X_test, y_train, y_test = train_test_split(xdata, ydata, test_size=precent, random_state=101)
    start_time = timeit.default_timer()
    acc = (decision_tree(X_train, X_test, y_train, y_test, max_depth=5))
    elapsed = timeit.default_timer() - start_time
    print('This "{name}" took {time} seconds to complete.'.format(name='decision tree', time=elapsed))
    accuracy.append(acc)
print(accuracy)

This "decision tree" took 1.4592362080002204 seconds to complete.
This "decision tree" took 0.916603057001339 seconds to complete.
This "decision tree" took 0.7054675609979313 seconds to complete.
This "decision tree" took 0.6363503669999773 seconds to complete.
This "decision tree" took 0.5631404699997802 seconds to complete.
This "decision tree" took 0.5232436179976503 seconds to complete.
This "decision tree" took 0.46434713099733926 seconds to complete.
This "decision tree" took 0.4137524139987363 seconds to complete.
This "decision tree" took 0.35775479700168944 seconds to complete.
[0.7603864734299517, 0.7739130434782608, 0.778743961352657, 0.7842995169082125, 0.7761886354851179, 0.760669995168304, 0.7519326339039205, 0.7531102790192052, 0.7522009877603607]


In [99]:
# Learning curve based off of number of Samples in training set
accuracy = []
depths = [1,2,3,4,5,6,7,8,9,10]
for depth in depths:
    X_train, X_test, y_train, y_test = train_test_split(xdata, ydata, test_size=.3, random_state=101)
    start_time = timeit.default_timer()
    acc = (decision_tree(X_train, X_test, y_train, y_test, max_depth=depth))
    elapsed = timeit.default_timer() - start_time
    print('This "{name}" took {time} seconds to complete.'.format(name='decision tree', time=elapsed))
    accuracy.append(acc)
print(accuracy)

This "decision tree" took 0.6863733390018751 seconds to complete.
This "decision tree" took 0.7238955259999784 seconds to complete.
This "decision tree" took 0.707517648999783 seconds to complete.
This "decision tree" took 0.7092672329999914 seconds to complete.
This "decision tree" took 0.7213830110013078 seconds to complete.
This "decision tree" took 0.7120156119999592 seconds to complete.
This "decision tree" took 0.7998558149993187 seconds to complete.
This "decision tree" took 0.7067228039995825 seconds to complete.
This "decision tree" took 0.68541065199679 seconds to complete.
This "decision tree" took 0.6811699899990344 seconds to complete.
[0.7790660225442834, 0.7784219001610306, 0.7784219001610306, 0.7790660225442834, 0.778743961352657, 0.7784219001610306, 0.778743961352657, 0.778743961352657, 0.7784219001610306, 0.7784219001610306]


In [100]:
 
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))
print('Overall Accuracy = ', accuracy_score(y_test, y_pred))

ValueError: Found input variables with inconsistent numbers of samples: [3105, 2113]

#### - Neural Network

In [41]:
from sklearn.neural_network import MLPClassifier
NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
NN.fit(X_train, y_train)

y_pred = NN.predict(X_test)

In [42]:
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))
print('Overall Accuracy = ', accuracy_score(y_test, y_pred))

[[1329  217]
 [ 266  301]]
             precision    recall  f1-score   support

          0       0.83      0.86      0.85      1546
          1       0.58      0.53      0.55       567

avg / total       0.77      0.77      0.77      2113

Overall Accuracy =  0.7714150496923805


#### - Boosting

In [68]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators=100, max_depth = 5, random_state = 0)
gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)

In [69]:
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))
print('Overall Accuracy = ', accuracy_score(y_test, y_pred))

[[1403  143]
 [ 280  287]]
             precision    recall  f1-score   support

          0       0.83      0.91      0.87      1546
          1       0.67      0.51      0.58       567

avg / total       0.79      0.80      0.79      2113

Overall Accuracy =  0.7998106956933271


#### - Support Vector Machine

In [64]:
from sklearn.svm import SVC
svm = SVC(kernel='linear') 
svm.fit(X_train,y_train)

preds = svm.predict(X_test)

In [65]:
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))
print('Overall Accuracy = ', accuracy_score(y_test, y_pred))

[[1317  229]
 [ 293  274]]
             precision    recall  f1-score   support

          0       0.82      0.85      0.83      1546
          1       0.54      0.48      0.51       567

avg / total       0.74      0.75      0.75      2113

Overall Accuracy =  0.7529578797917653


#### -  K-NN

In [61]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

In [62]:
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))
print('Overall Accuracy = ', accuracy_score(y_test, y_pred))

[[1317  229]
 [ 293  274]]
             precision    recall  f1-score   support

          0       0.82      0.85      0.83      1546
          1       0.54      0.48      0.51       567

avg / total       0.74      0.75      0.75      2113

Overall Accuracy =  0.7529578797917653
