In [None]:
import pandas as pd
import pickle
import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

## Dataset Processing

In [None]:
# Load the raw churn data and immediately discard the columns that are not
# carried into the encoded frame: the client identifier and the two
# `Naive_Bayes_Classifier_...` columns that ship with the CSV.
input_df = pd.read_csv('BankChurners.csv').drop(
    columns=[
        'CLIENTNUM',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    ]
)

In [None]:
# Encoded frame; starts empty and is filled column by column by the
# encoding helpers defined in the following cells.
updated_df = pd.DataFrame()

# Names of the categorical columns of the raw data (not referenced by any of
# the visible cells; kept for reference).
cats = [
    'Attrition_Flag',
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
]

# Numeric columns that are copied into the encoded frame unchanged. The list
# order is the column order in which they are appended.
numeric_columns = [
    'Customer_Age',
    'Credit_Limit',
    'Months_on_book',
    'Avg_Utilization_Ratio',
    'Avg_Open_To_Buy',
    'Total_Trans_Amt',
    'Dependent_count',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Total_Revolving_Bal',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
]

def tobinary():
    """Write the two binary columns of ``input_df`` to ``updated_df`` as 0/1.

    * ``Attrition``: 1 for ``'Attrited Customer'``, 0 for ``'Existing Customer'``
      (read from ``Attrition_Flag``).
    * ``Gender``: 1 for ``'M'``, 0 for ``'F'``.

    ``Series.map`` is used, so a raw value missing from a mapping becomes NaN.
    """
    attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
    gender_codes = {'M': 1, 'F': 0}

    updated_df['Attrition'] = input_df['Attrition_Flag'].map(attrition_codes)
    updated_df['Gender'] = input_df['Gender'].map(gender_codes)

In [None]:
def stringtoint():
    """Integer-encode the income, education and card columns into ``updated_df``.

    Reads ``input_df`` and writes five columns to ``updated_df``, in this order:

    * ``Card_Category``: Blue=1, Silver=2, Gold=3, Platinum=4.
    * ``Missing_Income``: 1 where ``Income_Category`` is ``'Unknown'``, 0 for
      every other listed category.
    * ``Income_Category``: integer code per category; ``'Unknown'`` shares
      code 1 with ``'Less than $40K'``.
    * ``Missing_Education``: 1 where ``Education_Level`` is ``'Unknown'``, 0 for
      every other listed category.
    * ``Education_Level``: integer code per category; ``'Unknown'`` shares
      code 2 with ``'Graduate'``.

    According to the original author's note, ``'Unknown'`` is given the code
    of the most frequent category (mode imputation).

    ``Series.map`` is used rather than ``Series.replace``: replacing strings
    with integers in an object column only yields a numeric column through
    implicit downcasting, which pandas 2.2 deprecated. With ``map`` the result
    is numeric directly; a raw value missing from a mapping becomes NaN.
    """
    # NOTE(review): the income codes are not monotonic in income
    # ('$80K - $120K' -> 3, '$60K - $80K' -> 4). Kept as in the original;
    # confirm whether an ordinal encoding was intended.
    income_codes = {'Unknown': 1, 'Less than $40K': 1, '$40K - $60K': 2,
                    '$80K - $120K': 3, '$60K - $80K': 4, '$120K +': 5}
    education_codes = {'Unknown': 2, 'High School': 1, 'Graduate': 2, 'Uneducated': 3,
                       'College': 4, 'Post-Graduate': 5, 'Doctorate': 6}
    card_codes = {'Blue': 1, 'Silver': 2, 'Gold': 3, 'Platinum': 4}

    def missing_flags(codes):
        # 1 for the 'Unknown' label, 0 for every other label of the mapping.
        return {label: int(label == 'Unknown') for label in codes}

    income_raw = input_df['Income_Category']
    education_raw = input_df['Education_Level']

    updated_df['Card_Category'] = input_df['Card_Category'].map(card_codes)
    updated_df['Missing_Income'] = income_raw.map(missing_flags(income_codes))
    updated_df['Income_Category'] = income_raw.map(income_codes)
    updated_df['Missing_Education'] = education_raw.map(missing_flags(education_codes))
    updated_df['Education_Level'] = education_raw.map(education_codes)

In [None]:
def encode():
    """One-hot encode ``Marital_Status`` and append the result to ``updated_df``.

    Rebinds the module-level ``updated_df`` to a new frame that has one
    ``Marital_<value>`` column per distinct marital status in ``input_df``
    appended on the right.

    The function is idempotent: dummy columns left in ``updated_df`` by an
    earlier call are dropped before the fresh ones are appended, so re-running
    the cell does not create duplicate columns.
    """
    global updated_df
    marital_dummies = pd.get_dummies(input_df['Marital_Status'], prefix='Marital')
    # errors='ignore' makes the drop a no-op on the first call, when none of
    # the dummy columns exist yet.
    without_stale_dummies = updated_df.drop(columns=marital_dummies.columns, errors='ignore')
    updated_df = pd.concat([without_stale_dummies, marital_dummies], axis=1)
def concat_with_numerics():
    """Append the numeric columns of ``input_df`` to ``updated_df`` unchanged.

    Rebinds the module-level ``updated_df`` to a new frame with the columns
    named in ``numeric_columns`` appended on the right, in list order.

    The function is idempotent: numeric columns already present from an
    earlier call are dropped first, so re-running the cell does not create
    duplicate columns.
    """
    global updated_df
    numeric_part = input_df.loc[:, numeric_columns]
    # errors='ignore' makes the drop a no-op on the first call.
    without_stale_numerics = updated_df.drop(columns=numeric_columns, errors='ignore')
    updated_df = pd.concat([without_stale_numerics, numeric_part], axis=1)

In [None]:
# Build `updated_df` by running the encoding steps in a fixed order; the order
# of the steps determines the column order of the resulting frame.
for pipeline_step in (tobinary, stringtoint, encode, concat_with_numerics):
    pipeline_step()

In [None]:
# Separate the encoded frame into the target (`Attrition`) and the features
# (every other column).
y = updated_df['Attrition']
X = updated_df.drop(columns=['Attrition'])

In [None]:
# Reduced feature matrix: the full feature set minus three columns.
X_small = X.drop(
    columns=['Months_on_book', 'Total_Trans_Ct', 'Credit_Limit'],
)

In [None]:
# Hold out a test set using scikit-learn's default split proportions; the
# fixed seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_small,
    y,
    random_state=14,
)

In [None]:
# Re-attach the target to the reduced feature matrix (target as last column).
preprocessed_df = pd.concat(objs=[X_small, y], axis=1)

In [None]:
# Persist the preprocessed data; the row index is not written.
preprocessed_df.to_csv(
    path_or_buf='preprocessed_dataset.csv',
    index=False,
)

## XGBoost

In [None]:
# Gradient-boosted tree classifier. The estimator is bound to its own name
# instead of `xgb`: rebinding `xgb` would shadow the xgboost module alias
# from the imports cell, so this cell would fail on a second run.
xgb_model = xgb.XGBClassifier(n_estimators=100,
                              max_depth=5, random_state=14)
xgb_model.fit(X_train, y_train)

# Persist the fitted model; the context manager closes the file handle.
with open("xgb_model.p", "wb") as model_file:
    pickle.dump(xgb_model, model_file)

## Random Forest

In [None]:
# Random forest with 100 trees. `random_state` is fixed (same seed as the
# train/test split and the XGBoost model) so the fitted forest, and therefore
# the pickled model, is reproducible between runs.
random_forest = RandomForestClassifier(n_estimators=100, random_state=14)

random_forest.fit(X_train, y_train)

In [None]:
# Persist the fitted forest; the context manager closes the file handle
# (and flushes it) as soon as the dump is done.
with open("random_forest_model.p", 'wb') as model_file:
    pickle.dump(random_forest, model_file)

## Logistic Regression
with a grid search over the `C` parameter

In [None]:
# Coarse grid search over `C` for an L1-penalised logistic regression,
# selecting by cross-validated accuracy on the training set.
parameters = {
    "C": [
        .01, .025, .05, .1, .2, .25, .3, .35, .4, .45, .475,
        .5, .525, .55, .6, .7, 1, 1.25, 1.5, 2, 5,
    ]
}
lr = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    max_iter=1000,
    random_state=123,
)
lr_clf = GridSearchCV(estimator=lr, param_grid=parameters, scoring='accuracy')
lr_clf.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the grid search's best model, then the selected `C`.
display(accuracy_score(y_true=y_test, y_pred=lr_clf.predict(X_test)))
display(lr_clf.best_params_)

$C = 0.5$ turned out to be the best of those values, so the next cell runs a finer grid search around it.

In [None]:
# Fine grid search: 100 values of `C` from 0.4900 to 0.5098 in steps of
# 0.0002, with the same estimator settings and scoring as the coarse search.
parameters = {"C": [step / 10000 for step in range(4900, 5100, 2)]}
lr = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    max_iter=1000,
    random_state=123,
)
lr_clf = GridSearchCV(estimator=lr, param_grid=parameters, scoring='accuracy')
lr_clf.fit(X_train, y_train)

# Test-set accuracy of the best model, then the selected `C`.
display(accuracy_score(y_true=y_test, y_pred=lr_clf.predict(X_test)))
display(lr_clf.best_params_)

In [None]:
# Persist only the best estimator found by the grid search (not the whole
# GridSearchCV object); the context manager closes the file handle.
with open("l1_log_reg.p", "wb") as model_file:
    pickle.dump(lr_clf.best_estimator_, model_file)