# Model

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler,OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,recall_score, make_scorer

from catboost import CatBoostClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Debug switch; not referenced by any cell visible in this notebook.
DEBUG = False
# Single seed reused for the train/test split, the CV folds and the seeded estimators.
SEED = 666

## Load Data

### Data for predictions

In [2]:
# Customers to generate churn predictions for.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df_churn_test = pd.read_pickle("./orig/data/churn_test.pkl")
print(df_churn_test.shape)
df_churn_test.head(1)

(3542, 34)


Unnamed: 0_level_0,Credit Card,Current,Deposit,Investment,Joint,Loan,Mortgage,On Demand Deposit,Card,Customer_Service_Calls,...,Mar_Total,Apr_Total,May_Total,Jun_Total,Jul_Total,Aug_Total,Sep_Total,Oct_Total,Nov_Total,Dec_Total
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
708182832,1,1,1,1,0,0,0,0,Silver,1,...,925.22,260.99,443.52,361.47,334.93,0.0,519.59,0.0,364.3,457.4


### Training data

In [3]:
# Labelled training data; includes the "Churn" target column.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle("./orig/data/churn_train.pkl")
print(df.shape)
df.head(1)

(5831, 35)


Unnamed: 0_level_0,Credit Card,Current,Deposit,Investment,Joint,Loan,Mortgage,On Demand Deposit,Churn,Card,...,Mar_Total,Apr_Total,May_Total,Jun_Total,Jul_Total,Aug_Total,Sep_Total,Oct_Total,Nov_Total,Dec_Total
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
708115009,1,0,0,0,0,0,0,0,Yes,Silver,...,0.0,25.44,38.82,391.74,89.15,0.0,282.99,7.23,397.94,0.0


In [4]:
# Inspect column dtypes; the feature lists built later are derived from these dtypes.
df.dtypes

Credit Card                     int64
Current                         int64
Deposit                         int64
Investment                      int64
Joint                           int64
Loan                            int64
Mortgage                        int64
On Demand Deposit               int64
Churn                        category
Card                         category
Customer_Service_Calls          int64
Credit_Limit                  float64
Total_Revolving_Balance         int64
Average_Open_To_Buy           float64
Average_Utilisation_Ratio     float64
Age                           float64
Gender                       category
Education                    category
Marital_Status               category
Dependents                      int64
Income                       category
Start_Year                   category
Start_Day                    category
Jan_Total                     float64
Feb_Total                     float64
Mar_Total                     float64
Apr_Total   

#### Features and Target

In [5]:
target = "Churn"

# Compare names with `!=`: `target` is a string, so `c not in target` would be a
# substring test and would silently drop any column whose name is a fragment of
# "Churn" (e.g. "C" or "urn").
cat_features = [c for c in df.select_dtypes("category").columns if c != target]
num_features = [c for c in df.select_dtypes(["int", "float"]).columns if c != target]
# Categorical columns first, then numerical ones.
features = cat_features + num_features

print(f"Target: {target}")

print(f"Categorical Features: {cat_features}")
print(f"Numerical Features: {num_features}")

Target: Churn
Categorical Features: ['Card', 'Gender', 'Education', 'Marital_Status', 'Income', 'Start_Year', 'Start_Day']
Numerical Features: ['Credit Card', 'Current', 'Deposit', 'Investment', 'Joint', 'Loan', 'Mortgage', 'On Demand Deposit', 'Customer_Service_Calls', 'Credit_Limit', 'Total_Revolving_Balance', 'Average_Open_To_Buy', 'Average_Utilisation_Ratio', 'Age', 'Dependents', 'Jan_Total', 'Feb_Total', 'Mar_Total', 'Apr_Total', 'May_Total', 'Jun_Total', 'Jul_Total', 'Aug_Total', 'Sep_Total', 'Oct_Total', 'Nov_Total', 'Dec_Total']


## Check Target Values

In [6]:
# Relative frequency of each target class (normalize=True returns proportions, not counts).
df[target].value_counts(normalize=True)

No     0.843595
Yes    0.156405
Name: Churn, dtype: float64

In [7]:
# Total number of missing values across all columns of the training frame.
df.isna().sum().sum()

0

### Comments
- The relative frequencies above show that the dataset is clearly imbalanced: roughly 84% of customers are labelled `No` and only about 16% `Yes`.
- We will attempt to use the Synthetic Minority Oversampling Technique (SMOTE) to balance the dataset later on.

## Encode Categorical Values

### Train-Test Split

In [8]:
# Hold out 40% of the labelled data; stratifying on the target keeps the churn
# rate the same in both parts, and SEED makes the split repeatable.
df_train, df_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    train_size=0.6,
    stratify=df[target],
    random_state=SEED,
)
(df_train.shape, df_test.shape)

((3498, 34), (2333, 34))

## Encoding Values

In [9]:
def encode_df(df_train, y_train, encoder=None):
    """One-hot encode the categorical features and map the churn labels to 0/1.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing the columns named in the notebook-level
        ``cat_features`` and ``num_features`` lists.
    y_train : pandas.Series
        Labels with the values ``'No'`` / ``"Yes"``.
    encoder : OneHotEncoder, optional
        An already fitted encoder to reuse, so that several frames get exactly
        the same indicator columns. When omitted, a fresh encoder is fitted on
        ``df_train`` itself; in that case categories that do not occur in
        ``df_train`` produce no column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The encoded frame (indicator columns followed by the numerical columns,
        which are passed through unscaled, indexed like ``df_train``) and the
        labels mapped with ``{'No': 0, 'Yes': 1}``.
    """
    if encoder is None:
        encoder = OneHotEncoder()
        encoded = encoder.fit_transform(df_train[cat_features])
    else:
        encoded = encoder.transform(df_train[cat_features])

    # The default encoder returns a sparse matrix; a caller-supplied one may be
    # configured for dense output, which has no .toarray().
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    df_cat = pd.DataFrame(
        data=encoded,
        index=df_train.index,
        columns=encoder.get_feature_names_out(),
    )
    df_num = df_train[num_features]

    df_encoded = pd.concat([df_cat, df_num], axis=1)
    y_encoded = y_train.map({'No': 0, "Yes": 1})

    return df_encoded, y_encoded

### Encode churn pred dataframe

In [10]:
# The prediction set has no labels, but encode_df expects a label Series. Pass an
# all-"No" placeholder rather than adding a throwaway column to the frame and
# slicing it off again by position.
churn_placeholder = pd.Series("No", index=df_churn_test.index, name="Churn")
df_churn_test, y_churn = encode_df(df_churn_test, churn_placeholder)
df_churn_test.head(1)

Unnamed: 0_level_0,Card_Black,Card_Gold,Card_Platinum,Card_Silver,Gender_F,Gender_M,Education_Graduate,Education_MSc,Education_PhD,Education_Post LC,...,Mar_Total,Apr_Total,May_Total,Jun_Total,Jul_Total,Aug_Total,Sep_Total,Oct_Total,Nov_Total,Dec_Total
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
708182832,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,925.22,260.99,443.52,361.47,334.93,0.0,519.59,0.0,364.3,457.4


### Encode Train

In [11]:
df_train, y_train = encode_df(df_train, y_train)
# encode_df fits its encoder on the frame it is given, so a category that is absent
# from this split (here Marital_Status "Unknown") gets no indicator column.
# Align to the already-encoded prediction frame instead of inserting one hard-coded
# column at a fixed position: missing indicator columns are added as 0, the column
# order is made identical, and columns the prediction frame lacks are dropped.
df_train = df_train.reindex(columns=df_churn_test.columns, fill_value=0)
display(df_train.head(1))

Unnamed: 0_level_0,Card_Black,Card_Gold,Card_Platinum,Card_Silver,Gender_F,Gender_M,Education_Graduate,Education_MSc,Education_PhD,Education_Post LC,...,Mar_Total,Apr_Total,May_Total,Jun_Total,Jul_Total,Aug_Total,Sep_Total,Oct_Total,Nov_Total,Dec_Total
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
793319878,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1337.32,285.37,55.78,0.0,1603.73,0.0,137.97,206.07,362.12


#### Comment:
- The one-hot encoder does not create columns for categories that do not appear in the dataframe it is fitted on, so the `Marital_Status_Unknown` column has to be added explicitly to the train and test splits to match the churn prediction dataframe.

### Encode Test

In [12]:
df_test, y_test = encode_df(df_test, y_test)
# encode_df fits its encoder on the frame it is given, so a category that is absent
# from this split (here Marital_Status "Unknown") gets no indicator column.
# Align to the already-encoded prediction frame instead of inserting one hard-coded
# column at a fixed position: missing indicator columns are added as 0, the column
# order is made identical, and columns the prediction frame lacks are dropped.
df_test = df_test.reindex(columns=df_churn_test.columns, fill_value=0)
display(df_test.head(1))

Unnamed: 0_level_0,Card_Black,Card_Gold,Card_Platinum,Card_Silver,Gender_F,Gender_M,Education_Graduate,Education_MSc,Education_PhD,Education_Post LC,...,Mar_Total,Apr_Total,May_Total,Jun_Total,Jul_Total,Aug_Total,Sep_Total,Oct_Total,Nov_Total,Dec_Total
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
767479181,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,263.08,236.12,146.94,0.0,193.7,0.0,163.99,101.38,0.0,281.05


### Model Selection

In [13]:
# Candidate classifiers compared by cross-validation, keyed by a short label.
# Every estimator that accepts a seed gets SEED so the comparison is repeatable
# from run to run; KNN has no random component, and LogisticRegression is left at
# its default solver settings.
models = {
    "LR": LogisticRegression(max_iter=1000),
    "DT": DecisionTreeClassifier(random_state=SEED),
    "KNN": KNeighborsClassifier(),
    "RF": RandomForestClassifier(random_state=SEED),
    "ET": ExtraTreesClassifier(random_state=SEED),
    "XGB": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=SEED),
    "CLF": CatBoostClassifier(silent=True, random_seed=SEED),
    "ADA": AdaBoostClassifier(n_estimators=100, random_state=SEED)
}

In [14]:
# Recall is the metric of interest; each candidate is scored on the same
# 10 shuffled, stratified folds of the training split.
scorer = make_scorer(recall_score)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

for name, model in models.items():
    scores = cross_val_score(model, df_train, y_train, scoring=scorer, cv=cv)
    # Report the mean recall together with two standard deviations across folds.
    summary = (scores.mean(), scores.std() * 2)
    print(name, "Recall: %0.2f (+/- %0.2f)" % summary)

LR Recall: 0.28 (+/- 0.17)
DT Recall: 0.43 (+/- 0.14)
KNN Recall: 0.38 (+/- 0.06)
RF Recall: 0.31 (+/- 0.15)
ET Recall: 0.24 (+/- 0.12)
XGB Recall: 0.49 (+/- 0.18)
CLF Recall: 0.46 (+/- 0.18)
ADA Recall: 0.48 (+/- 0.17)


In [15]:
# Refit XGBoost (same settings as the "XGB" entry in `models`) on the whole
# training split, then predict the held-out split.
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=SEED,
)
model.fit(df_train, y_train)

y_pred = model.predict(df_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [16]:
# Distinct class labels present in the predictions for df_test.
np.unique(y_pred)

array([0, 1])

In [17]:
def showDiagnostics(y_test, y_pred):
    """Print accuracy, the confusion matrix and a classification report.

    y_test holds the true labels and y_pred the predicted labels; nothing is
    returned, the three diagnostics are written to stdout in that order.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:", matrix, sep="\n")

    # digits=3 -> three decimal places in the per-class report.
    report = classification_report(y_test, y_pred, digits=3)
    print("Classification Report:", report, sep="\n")

In [18]:
# Compare the predictions for the held-out split with its true labels.
showDiagnostics(y_test,y_pred)

Accuracy: 0.8859837119588513
Confusion Matrix:
[[1914   54]
 [ 212  153]]
Classification Report:
              precision    recall  f1-score   support

           0      0.900     0.973     0.935      1968
           1      0.739     0.419     0.535       365

    accuracy                          0.886      2333
   macro avg      0.820     0.696     0.735      2333
weighted avg      0.875     0.886     0.872      2333



## Generate predictions

In [19]:
# Predict churn for the unlabelled customers.
# NOTE: this rebinds y_pred, replacing the predictions made earlier for df_test.
y_pred = model.predict(df_churn_test)

In [21]:
# Translate the 0/1 predictions back into 'No'/"Yes" labels, keeping the index of
# the prediction frame, then turn that index into a regular column for export.
churn_labels = pd.Series(y_pred, index=df_churn_test.index, name='Churn').map({0: 'No', 1: "Yes"})
df_pred = churn_labels.reset_index()

print(df_pred.shape)
print(df_pred.head(5))
df_pred.to_csv("./orig/output/pred.csv", index=False)


(3542, 2)
    Customer Churn
0  708182832    No
1  708240660    No
2  708249778    No
3  708259785    No
4  708264579    No
