In [1]:
import pandas as pd
# Load the prepared churn dataset, using the customer identifier column as the row index.
df = pd.read_csv(
    filepath_or_buffer='prepped_churn_data_new.csv',
    index_col='customerID',
)
# Bare last expression: render the frame with the notebook's rich display.
df

Unnamed: 0_level_0,tenure,PhoneService,Contract,PaymentMethod,MonthlyCharges,TotalCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,0,0.083333,1,29.85,29.85,0
1,34,1,1.000000,2,56.95,1889.50,0
2,2,1,0.083333,2,53.85,108.15,1
3,45,0,1.000000,3,42.30,1840.75,0
4,2,1,0.083333,1,70.70,151.65,1
...,...,...,...,...,...,...,...
7038,24,1,1.000000,2,84.80,1990.50,0
7039,72,1,1.000000,4,103.20,7362.90,0
7040,11,0,0.083333,1,29.60,346.45,0
7041,4,1,0.083333,2,74.40,306.60,1


# Use pycaret to find an ML algorithm that performs best on the data
Following the PEP8 guide, we import the ClassificationExperiment class straight from the pycaret.classification module and initialize the class.

In [2]:
from pycaret.classification import ClassificationExperiment

In [3]:
# Instantiate the PyCaret experiment object that the following cells configure and use.
automl = ClassificationExperiment()

Then we call the function 'setup' which initializes the training environment and creates the transformation pipeline.

In [4]:
# Initialize the training environment with 'Churn' as the target column.
# session_id pins the random seed so the train/test split and model comparison
# are reproducible after a kernel restart; 2060 is the session id shown in this
# notebook's recorded setup output.
automl.setup(df, target='Churn', session_id=2060)

Unnamed: 0,Description,Value
0,Session id,2060
1,Target,Churn
2,Target type,Binary
3,Original data shape,"(7043, 7)"
4,Transformed data shape,"(7043, 7)"
5,Transformed train set shape,"(4930, 7)"
6,Transformed test set shape,"(2113, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x7f8a87e7ac10>

In [5]:
# Bare expression: shows the experiment's repr (the same object that setup() returned above).
automl

<pycaret.classification.oop.ClassificationExperiment at 0x7f8a87e7ac10>

# Choose a metric you think is best to use for finding the best model
Here, we chose to rank the models by the AUC metric rather than the default accuracy metric, because sorting by AUC selected the model that also has the highest precision.

In [6]:
# Compare the candidate classifiers and keep the one ranked first by AUC
# (the comparison table below is sorted by that column).
best_model = automl.compare_models(sort='AUC')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7901,0.8349,0.4862,0.6375,0.551,0.4174,0.4243,0.522
lr,Logistic Regression,0.7919,0.8326,0.5092,0.6347,0.5645,0.43,0.4349,0.625
ada,Ada Boost Classifier,0.786,0.8303,0.4832,0.625,0.5442,0.4076,0.4138,0.295
lightgbm,Light Gradient Boosting Machine,0.7866,0.8249,0.5008,0.6235,0.5547,0.4167,0.4215,0.439
qda,Quadratic Discriminant Analysis,0.7465,0.8224,0.7416,0.5164,0.6086,0.4301,0.4456,0.06
lda,Linear Discriminant Analysis,0.7884,0.8181,0.4916,0.63,0.5518,0.4162,0.4219,0.048
nb,Naive Bayes,0.7168,0.804,0.7554,0.479,0.5861,0.387,0.4104,0.056
rf,Random Forest Classifier,0.7716,0.7946,0.4778,0.5856,0.5256,0.3774,0.3811,0.582
et,Extra Trees Classifier,0.7635,0.776,0.487,0.5645,0.5219,0.3661,0.3685,0.486
knn,K Neighbors Classifier,0.7538,0.7386,0.422,0.5492,0.4763,0.3191,0.3244,0.07


The best model is the Gradient Boosting Classifier, with an AUC score of 0.8349. This model likely works best for our data because it combines many weak learners in an ensemble to create a strong predictor, which gives it high predictive accuracy.

In [7]:
# Display the estimator selected by compare_models.
best_model

# Save the model to disk
We save the model as a pkl file. The PyCaret transformation pipeline and model are serialized into a binary pickle file, which can later be deserialized back into a Python object.

In [8]:
# Persist the transformation pipeline together with the selected model under the
# base name 'pycaret_model'.
automl.save_model(
    model=best_model,
    model_name='pycaret_model',
)

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['tenure', 'PhoneService',
                                              'Contract', 'PaymentMethod',
                                              'MonthlyCharges', 'TotalCharges'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',...
                                             criterion='friedman_mse', init=None,
                      

# Check our saved model

We then load our saved model again and use it to predict the churn probability of our second to last row of the churn dataset.

In [9]:
# Use a separate experiment instance so the reload does not rely on the state of `automl`.
new_pycaret = ClassificationExperiment()
# Restore the pipeline + model saved earlier. The file is a pickle, so only load
# model files that come from a trusted source.
loaded_model = new_pycaret.load_model(model_name='pycaret_model')

Transformation Pipeline and Model Successfully Loaded


In [10]:
# Score a single row: the slice [-2:-1] keeps the second-to-last row as a one-row DataFrame.
new_pycaret.predict_model(
    estimator=loaded_model,
    data=df.iloc[-2:-1],
)

Unnamed: 0_level_0,tenure,PhoneService,Contract,PaymentMethod,MonthlyCharges,TotalCharges,Churn,prediction_label,prediction_score
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7041,4,1,0.083333,2,74.400002,306.600006,1,1,0.5825


# Create a Python script/file/module with a function that takes a pandas dataframe as an input and returns the probability of churn for each row in the dataframe
We create a file called 'predict_churn_pycaret.py' that predicts the churn probability of each row in any dataset similar to our prepared dataset. Our file contains 2 functions that load the new dataset and predict the churn probability for that dataset. We modified the names in the 'prediction_label' column to make the predictions easier to understand.

In [None]:
from IPython.display import Code

# Render the script's source with syntax highlighting. Passing the path via
# `filename=` (instead of positionally as `data`) makes a missing file raise an
# error rather than silently displaying the literal path string as code.
Code(filename='predict_churn_pycaret.py')

# Testing the Python module and function with the new data, new_churn_data.csv
Here, we can see the churn probability of the new data. Comparing our model's predictions with the true values for the new data, the model predicted 4 out of 5 correctly, which is reasonably good. However, if we look at each prediction probability, our model made its first correct prediction with a probability score of only 0.55, while it made its wrong prediction with a probability score of 0.72. These mistakes are probably due to insufficient training data similar to these 2 rows.

In [13]:
# Run the prediction script; its printed output below shows the saved model being
# loaded followed by the predictions for the new data.
%run predict_churn_pycaret.py

Transformation Pipeline and Model Successfully Loaded
predictions:
            tenure  PhoneService  Contract  PaymentMethod  MonthlyCharges  \
customerID                                                                  
9305-CKSKC      22             1         0              2       97.400002   
1452-KNGVK       8             0         1              1       77.300003   
6723-OKKJM      28             1         0              0       28.250000   
7832-POPKP      62             1         0              2      101.699997   
6348-TACGU      10             0         0              1       51.150002   

            TotalCharges  charge_per_tenure     Churn  prediction_score  
customerID                                                               
9305-CKSKC    811.700012          36.895454     Churn            0.5572  
1452-KNGVK   1701.949951         212.743744  No Churn            0.8458  
6723-OKKJM    250.899994           8.960714  No Churn            0.8755  
7832-POPKP   3106.56005