# Importing Modules

In [1]:
from tpot import TPOTClassifier
import pandas as pd

from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import PolynomialFeatures
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive



# Defining Functions

The first function we define is to modify raw data to the format that we use for our model.

In [2]:
#function to modify raw data
def data_modify(df):
    """Convert raw churn records into the numeric layout used for prediction.

    The frame passed in is left untouched; every change is made on a copy,
    so the raw data stays available to the caller afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records containing at least the columns ``customerID``,
        ``tenure``, ``PhoneService``, ``Contract``, ``PaymentMethod`` and
        ``TotalCharges``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the three categorical columns are replaced by
        integer codes, ``TotalCharges_tenure_ratio`` is appended as the last
        column, and ``customerID`` is removed.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Integer code for every known category, per column. Values that have no
    # entry here are passed through unchanged by ``replace``.
    encodings = {
        'PhoneService': {'No': 1, 'Yes': 0},
        'Contract': {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
        'PaymentMethod': {
            'Mailed check': 1,
            'Credit card (automatic)': 2,
            'Electronic check': 3,
            'Bank transfer (automatic)': 4,
        },
    }

    # Copy first: the original version overwrote columns and deleted
    # customerID directly on the caller's frame.
    prepped = df.copy()
    for column, codes in encodings.items():
        prepped[column] = prepped[column].replace(codes)

    # NOTE(review): a tenure of 0 makes this ratio inf/NaN -- confirm such
    # rows cannot reach this function, or decide how they should be handled.
    prepped['TotalCharges_tenure_ratio'] = prepped['TotalCharges'] / prepped['tenure']

    del prepped['customerID']
    return prepped

The second function passes the prepped data to the pipeline we exported and returns its predictions.

In [3]:
#function to use pipeline and predict output
def tpot_model(df, pipeline=None):
    """Return the predictions of a fitted pipeline for ``df``.

    Parameters
    ----------
    df
        Data handed straight to the pipeline's ``predict`` method.
    pipeline : object with a ``predict`` method, optional
        Estimator to use. When omitted, the notebook-level
        ``exported_pipeline`` is used, which keeps existing one-argument
        calls working exactly as before.

    Returns
    -------
    Whatever ``predict`` returns for ``df``.
    """
    # The global is only looked up when no pipeline is supplied, so the
    # function no longer has to rely on a variable defined in a later cell.
    model = exported_pipeline if pipeline is None else pipeline
    return model.predict(df)

# Loading in Data and Pipeline

We load in the initial data as well as the pipeline. We split the data into train and test sets as always.
We then rebuild the exported pipeline using the make_pipeline function, passing in the best estimator we found previously as its argument. 

In [4]:
# Read the prepped data and discard the 'Unnamed: 0' column
# (presumably an index saved along with the CSV -- verify against the file)
df = pd.read_csv('prepped_churn.csv').drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,tenure,PhoneService,Contract,PaymentMethod,MonthlyCharges,TotalCharges,Churn,TotalCharges_tenure_ratio
0,1,1,0,3,29.85,29.85,1,29.85
1,34,0,1,1,56.95,1889.5,1,55.573529
2,2,0,0,1,53.85,108.15,0,54.075
3,45,1,1,4,42.3,1840.75,1,40.905556
4,2,0,0,3,70.7,151.65,0,75.825


In [5]:
# Separate the Churn label from the remaining predictor columns
targets = df['Churn']
features = df.drop(columns='Churn')

# Split with a fixed seed, stratifying on the label
x_train, x_test, y_train, y_test = train_test_split(
    features,
    targets,
    stratify=targets,
    random_state=42,
)

In [6]:
# Rebuild the previously exported pipeline: a single ExtraTreesClassifier step
extra_trees = ExtraTreesClassifier(
    bootstrap=False,
    criterion='gini',
    max_features=0.35000000000000003,
    min_samples_leaf=18,
    min_samples_split=9,
    n_estimators=100,
)
exported_pipeline = make_pipeline(extra_trees)

# Set random_state to 42 on the steps of the exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

In [7]:
# Train on the training split, then show the first five test predictions
# followed by the first five true labels for comparison
results = exported_pipeline.fit(x_train, y_train).predict(x_test)
print(results[:5])
print(y_test.iloc[:5])

[1 1 0 0 1]
5270    1
5820    1
4675    1
4987    0
5058    1
Name: Churn, dtype: int64


# Predicting New Data Results

We first load in the new data. We then call the data_modify function to get the data to our desired format, and then we predict the Churn values.

In [8]:
# Read the new, still unmodified customer records
new_data_path = 'new_churn_data_unmodified.csv'
df2 = pd.read_csv(new_data_path)
df2.head()

Unnamed: 0,customerID,tenure,PhoneService,Contract,PaymentMethod,MonthlyCharges,TotalCharges
0,9305-CKSKC,22,Yes,Month-to-month,Electronic check,97.4,811.7
1,1452-KNGVK,8,No,One year,Mailed check,77.3,1701.95
2,6723-OKKJM,28,Yes,Month-to-month,Credit card (automatic),28.25,250.9
3,7832-POPKP,62,Yes,Month-to-month,Electronic check,101.7,3106.56
4,6348-TACGU,10,No,Two year,Credit card (automatic),51.15,3440.97


In [9]:
# Run the raw frame through data_modify to get it into the desired format
df2 = df2.pipe(data_modify)
df2.head()

Unnamed: 0,tenure,PhoneService,Contract,PaymentMethod,MonthlyCharges,TotalCharges,TotalCharges_tenure_ratio
0,22,0,0,3,97.4,811.7,36.895455
1,8,1,1,1,77.3,1701.95,212.74375
2,28,0,0,2,28.25,250.9,8.960714
3,62,0,0,3,101.7,3106.56,50.105806
4,10,1,2,2,51.15,3440.97,344.097


In [10]:
# Get the Churn predictions for the prepped new data from tpot_model
results = tpot_model(df=df2)
print(results)

[1 1 1 1 1]


# Summary

Here, we first loaded in our initial dataset and split it into train and test data. We then rebuilt the pipeline we previously exported and fit it on the training data. We checked the accuracy of the model by predicting the results for our test data and comparing them with the actual y_test values. Even from the first five values alone, we can see that while the model is mostly accurate, it does make some errors. This is expected, because the accuracy of our model was only 79.6%. 
We then load in the new churn data. As this is still raw unmodified data, we use a function to change the data into our desired format. Once this is done, we call in the function to predict the Churn results for this data with our exported pipeline. 
The result is five 1's, which means that all five customers are predicted NOT to churn (in this dataset a Churn value of 1 encodes "no churn"). As expected given the model's accuracy, a few of these are likely to be false positives.