In [100]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
import seaborn  as sns
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

In [101]:
pd.set_option('display.max_columns',None)
df = pd.read_excel(r'CHURNDATA (1) (1).xlsx')
df.head(2)

Unnamed: 0,CIF,CUS_DOB,AGE,CUS_Month_Income,CUS_Gender,CUS_Marital_Status,CUS_Customer_Since,YEARS_WITH_US,# total debit transactions for S1,# total debit transactions for S2,# total debit transactions for S3,total debit amount for S1,total debit amount for S2,total debit amount for S3,# total credit transactions for S1,# total credit transactions for S2,# total credit transactions for S3,total credit amount for S1,total credit amount for S2,total credit amount for S3,total debit amount,total debit transactions,total credit amount,total credit transactions,total transactions,CUS_Target,TAR_Desc,Status
0,XXXXXX,Feb 13 1970 12:00AM,49,7116.64,MALE,MARRIED,1994-06-30,25,277,265,345,1459126.64,1230543.08,2068641.91,10,24,31,1516981.1,1764079.61,2378592.62,4758311.63,887,5659653.33,65,952,2231,EXECUTIVE,ACTIVE
1,XXXXXX,Sep 20 1973 12:00AM,46,1500000.0,FEMALE,SINGLE,2005-05-19,14,37,15,45,35372.55,20134.0,83856.67,2,4,4,10000.0,19500.0,57500.0,139363.22,97,87000.0,10,107,2223,LOW,ACTIVE


In [102]:
del df['CIF']
del df['CUS_DOB']
del df['CUS_Customer_Since']

In [103]:
df.dropna(inplace = True)

In [104]:
df.columns

Index(['AGE', 'CUS_Month_Income', 'CUS_Gender', 'CUS_Marital_Status',
       'YEARS_WITH_US', '# total debit transactions for S1',
       '# total debit transactions for S2',
       '# total debit transactions for S3', 'total debit amount for S1',
       'total debit amount for S2', 'total debit amount for S3',
       '# total credit transactions for S1',
       '# total credit transactions for S2',
       '# total credit transactions for S3', 'total credit amount for S1',
       'total credit amount for S2', 'total credit amount for S3',
       'total debit amount', 'total debit transactions', 'total credit amount',
       'total credit transactions', 'total transactions', 'CUS_Target',
       'TAR_Desc', 'Status'],
      dtype='object')

In [105]:
df['Status'] = df['Status'].map({'ACTIVE':0 , 'CHURN':1})

In [106]:
X = df.drop(columns=['Status'],axis=1)
y = df['Status']

In [107]:

def scorer(model_name, model):
    
    output = []
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),  # Data preprocessing steps
        ('SMOTE', SMOTE(random_state=42)),  # Apply SMOTE
        ('classifier', model)  # Classification model
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)

    # Use 'accuracy' as the scoring metric for classification
    scores = cross_val_score(pipeline, X_resampled, y_resampled, cv=kfold, scoring='accuracy')
    scores  = scores[~np.isnan(scores)] 
    
    print(model)
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    # Calculate accuracy on the test set
    accuracy = accuracy_score(y_test, y_pred)
    
    # Calculate F1 score on the test set
    f1 = f1_score(y_test, y_pred)
    
    # Calculate precision on the test set
    precision = precision_score(y_test, y_pred)
    
    # Calculate recall on the test set
    recall = recall_score(y_test, y_pred)
    train_acc = pipeline.score(X_train ,y_train)
    output.extend([train_acc,accuracy, f1, precision, recall])
    
    return output

IndentationError: unexpected indent (3896705181.py, line 6)

# Ordinal Encoding

In [None]:
columns_to_encode = ['CUS_Gender','TAR_Desc','CUS_Target','CUS_Marital_Status']
scaling = ['AGE','CUS_Month_Income','YEARS_WITH_US', '# total debit transactions for S1',
       '# total debit transactions for S2',
       '# total debit transactions for S3', 'total debit amount for S1',
       'total debit amount for S2', 'total debit amount for S3',
       '# total credit transactions for S1',
       '# total credit transactions for S2',
       '# total credit transactions for S3', 'total credit amount for S1',
       'total credit amount for S2', 'total credit amount for S3',
       'total debit amount', 'total debit transactions', 'total credit amount',
       'total credit transactions', 'total transactions']


In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('stanardscaler', StandardScaler(), scaling),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)



In [None]:
model_dict = {
    'logistic_regression': LogisticRegression(),
    'svm': SVC(),
    'decision_tree': DecisionTreeClassifier(),
    'random_forest': RandomForestClassifier(),
    'extra_trees': ExtraTreesClassifier(),
    'gradient_boosting': GradientBoostingClassifier(),
    'adaboost': AdaBoostClassifier(),
    'mlp': MLPClassifier(),
    'xgboost': XGBClassifier()
}


In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [None]:
model_df = pd.DataFrame(model_output, columns=['Modelname','mean_acc','train-acc','test acc', 'f1', 'precision', 'recall'])
model_df.sort_values(['f1'] ,ascending = False)