In [32]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder , RobustScaler , MinMaxScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
import seaborn  as sns
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

In [40]:
pd.set_option('display.max_columns',None)
df = pd.read_excel(r'CHURNDATA (1) (1).xlsx')
df.head(2)

Unnamed: 0,CIF,CUS_DOB,AGE,CUS_Month_Income,CUS_Gender,CUS_Marital_Status,CUS_Customer_Since,YEARS_WITH_US,# total debit transactions for S1,# total debit transactions for S2,# total debit transactions for S3,total debit amount for S1,total debit amount for S2,total debit amount for S3,# total credit transactions for S1,# total credit transactions for S2,# total credit transactions for S3,total credit amount for S1,total credit amount for S2,total credit amount for S3,total debit amount,total debit transactions,total credit amount,total credit transactions,total transactions,CUS_Target,TAR_Desc,Status
0,XXXXXX,Feb 13 1970 12:00AM,49,7116.64,MALE,MARRIED,1994-06-30,25,277,265,345,1459126.64,1230543.08,2068641.91,10,24,31,1516981.1,1764079.61,2378592.62,4758311.63,887,5659653.33,65,952,2231,EXECUTIVE,ACTIVE
1,XXXXXX,Sep 20 1973 12:00AM,46,1500000.0,FEMALE,SINGLE,2005-05-19,14,37,15,45,35372.55,20134.0,83856.67,2,4,4,10000.0,19500.0,57500.0,139363.22,97,87000.0,10,107,2223,LOW,ACTIVE


In [41]:
df.isna().sum()

CIF                                    0
CUS_DOB                                0
AGE                                    0
CUS_Month_Income                      11
CUS_Gender                             2
CUS_Marital_Status                     0
CUS_Customer_Since                     0
YEARS_WITH_US                          0
# total debit transactions for S1      0
# total debit transactions for S2      0
# total debit transactions for S3      0
total debit amount for S1              0
total debit amount for S2              0
total debit amount for S3              0
# total credit transactions for S1     0
# total credit transactions for S2     0
# total credit transactions for S3     0
total credit amount for S1             0
total credit amount for S2             0
total credit amount for S3             0
total debit amount                     0
total debit transactions               0
total credit amount                    0
total credit transactions              0
total transactio

In [6]:
# delete cols

del df['CIF']
del df['CUS_DOB']
del df['CUS_Customer_Since']
del df['total debit transactions']
del df['total credit transactions']
del df['total transactions']
del df['TAR_Desc']
del df['CUS_Target']

In [7]:
df.dropna(inplace = True)

In [8]:
df.columns

Index(['AGE', 'CUS_Month_Income', 'CUS_Gender', 'CUS_Marital_Status',
       'YEARS_WITH_US', '# total debit transactions for S1',
       '# total debit transactions for S2',
       '# total debit transactions for S3', 'total debit amount for S1',
       'total debit amount for S2', 'total debit amount for S3',
       '# total credit transactions for S1',
       '# total credit transactions for S2',
       '# total credit transactions for S3', 'total credit amount for S1',
       'total credit amount for S2', 'total credit amount for S3',
       'total debit amount', 'total credit amount', 'Status'],
      dtype='object')

In [9]:
df['Status'] = df['Status'].map({'ACTIVE':0 , 'CHURN':1})

In [10]:
X = df.drop(columns=['Status'],axis=1)
y = df['Status']

In [11]:

def scorer(model_name, model):
    
    output = []
    output.append(model_name)
    
   
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)

    # Use 'accuracy' as the scoring metric for classification
    scores = cross_val_score(model, X_resampled, y_resampled, cv=kfold, scoring='accuracy')
    scores  = scores[~np.isnan(scores)] 
    
    print(model)
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    # Calculate accuracy on the test set
    accuracy = accuracy_score(y_test, y_pred)
    
    # Calculate F1 score on the test set
    f1 = f1_score(y_test, y_pred)
    
    # Calculate precision on the test set
    precision = precision_score(y_test, y_pred)
    
    # Calculate recall on the test set
    recall = recall_score(y_test, y_pred)
    train_acc = model.score(X_train ,y_train)
    output.extend([train_acc,accuracy, f1, precision, recall])
    
    return output

# One Hot Encoder

In [16]:
columns_to_encode = ['CUS_Gender','CUS_Marital_Status']
scaling = ['AGE','CUS_Month_Income','YEARS_WITH_US', '# total debit transactions for S1',
       '# total debit transactions for S2',
       '# total debit transactions for S3', 'total debit amount for S1',
       'total debit amount for S2', 'total debit amount for S3',
       '# total credit transactions for S1',
       '# total credit transactions for S2',
       '# total credit transactions for S3', 'total credit amount for S1',
       'total credit amount for S2', 'total credit amount for S3',
       ]


In [33]:
preprocessor = ColumnTransformer(
    transformers=[
        ('standardscaler', RobustScaler(), scaling),
        ('cat', OneHotEncoder(), columns_to_encode)
    ],
    remainder='passthrough'
)

# Apply the preprocessing to your DataFrame
X_preprocessed = preprocessor.fit_transform(X)

# Convert the transformed data back to a DataFrame
X_preprocessed_df = pd.DataFrame(X_preprocessed)
# Now, X_preprocessed_df contains the preprocessed data
X_preprocessed_df.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
0,0.1875,-0.498886,11.0,4.415584,4.033473,5.099602,7.860893,5.775664,9.14114,0.5,2.0,2.5,8.387246,8.98267,11.886084,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,4758311.63,5659653.33


In [34]:
smote = SMOTE()
X_resampled , y_resampled = smote.fit_resample(X_preprocessed_df , y)

In [35]:
model_dict = {
    'logistic_regression': LogisticRegression(),
    'svm': SVC(),
    'decision_tree': DecisionTreeClassifier(),
    'random_forest': RandomForestClassifier(),
    'extra_trees': ExtraTreesClassifier(),
    'gradient_boosting': GradientBoostingClassifier(),
    'adaboost': AdaBoostClassifier(),
    'mlp': MLPClassifier(),
    'xgboost': XGBClassifier()
}


In [36]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

LogisticRegression()
SVC()
DecisionTreeClassifier()
RandomForestClassifier()
ExtraTreesClassifier()
GradientBoostingClassifier()
AdaBoostClassifier()
MLPClassifier()
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)


In [37]:
model_df = pd.DataFrame(model_output, columns=['Modelname','mean_acc','train-acc','test acc', 'f1', 'precision', 'recall'])
model_df.sort_values(['f1'] ,ascending = False)

Unnamed: 0,Modelname,mean_acc,train-acc,test acc,f1,precision,recall
4,extra_trees,0.933005,1.0,0.916256,0.921659,0.913242,0.930233
8,xgboost,0.907882,1.0,0.896552,0.903226,0.894977,0.911628
3,random_forest,0.908374,1.0,0.881773,0.890909,0.871111,0.911628
5,gradient_boosting,0.872414,0.953202,0.85468,0.864368,0.854545,0.874419
6,adaboost,0.860591,0.885468,0.834975,0.850112,0.818966,0.883721
2,decision_tree,0.835961,1.0,0.834975,0.84738,0.830357,0.865116
1,svm,0.719212,0.717365,0.70197,0.765957,0.655629,0.92093
7,mlp,0.614286,0.600985,0.564039,0.321839,0.913043,0.195349
0,logistic_regression,0.50936,0.507389,0.470443,0.0,0.0,0.0


# Ordinal Encoding

In [26]:
columns_to_encode = ['CUS_Gender','CUS_Marital_Status']
scaling = ['AGE','CUS_Month_Income','YEARS_WITH_US', '# total debit transactions for S1',
       '# total debit transactions for S2',
       '# total debit transactions for S3', 'total debit amount for S1',
       'total debit amount for S2', 'total debit amount for S3',
       '# total credit transactions for S1',
       '# total credit transactions for S2',
       '# total credit transactions for S3', 'total credit amount for S1',
       'total credit amount for S2', 'total credit amount for S3',
       ]

preprocessor = ColumnTransformer(
    transformers=[
        ('standardscaler', StandardScaler(), scaling),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ],
    remainder='passthrough'
)

# Apply the preprocessing to your DataFrame
X_preprocessed = preprocessor.fit_transform(X)

# Convert the transformed data back to a DataFrame
X_preprocessed_df = pd.DataFrame(X_preprocessed ,columns=X.columns)
# Now, X_preprocessed_df contains the preprocessed data
X_preprocessed_df.head(1)

Unnamed: 0,AGE,CUS_Month_Income,CUS_Gender,CUS_Marital_Status,YEARS_WITH_US,# total debit transactions for S1,# total debit transactions for S2,# total debit transactions for S3,total debit amount for S1,total debit amount for S2,total debit amount for S3,# total credit transactions for S1,# total credit transactions for S2,# total credit transactions for S3,total credit amount for S1,total credit amount for S2,total credit amount for S3,total debit amount,total credit amount
0,0.203513,-0.510262,5.064423,2.746701,2.572269,3.389096,0.751661,0.707613,1.868364,0.171958,1.145401,1.576642,0.736022,1.372328,1.622259,1.0,1.0,4758311.63,5659653.33


In [27]:
smote = SMOTE()
X_resampled , y_resampled = smote.fit_resample(X_preprocessed_df , y)

In [28]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

LogisticRegression()
SVC()
DecisionTreeClassifier()
RandomForestClassifier()
ExtraTreesClassifier()
GradientBoostingClassifier()
AdaBoostClassifier()
MLPClassifier()
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)


In [29]:
model_df_ordinal_encode = pd.DataFrame(model_output, columns=['Modelname','mean_acc','train-acc','test acc', 'f1', 'precision', 'recall'])
model_df_ordinal_encode.sort_values(['f1'] ,ascending = False)

Unnamed: 0,Modelname,mean_acc,train-acc,test acc,f1,precision,recall
4,extra_trees,0.92069,1.0,0.913793,0.920993,0.894737,0.948837
3,random_forest,0.896059,1.0,0.891626,0.900901,0.873362,0.930233
8,xgboost,0.9,1.0,0.891626,0.900901,0.873362,0.930233
5,gradient_boosting,0.882759,0.951355,0.871921,0.881818,0.862222,0.902326
6,adaboost,0.866502,0.896552,0.852217,0.865471,0.835498,0.897674
2,decision_tree,0.844335,1.0,0.817734,0.829493,0.821918,0.837209
1,svm,0.725123,0.718596,0.716749,0.780115,0.662338,0.948837
7,mlp,0.612315,0.576355,0.546798,0.406452,0.663158,0.293023
0,logistic_regression,0.5,0.507389,0.470443,0.0,0.0,0.0
