In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
df = pd.read_csv("./BankChurners.csv")

In [3]:
df.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


In [4]:
df = df.drop(["CLIENTNUM", "Attrition_Flag", "Gender", "Education_Level",
             "Marital_Status", "Income_Category", "Card_Category"], axis=1)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 16 columns):
 #   Column                                                                                                                              Non-Null Count  Dtype  
---  ------                                                                                                                              --------------  -----  
 0   Customer_Age                                                                                                                        10127 non-null  int64  
 1   Dependent_count                                                                                                                     10127 non-null  int64  
 2   Months_on_book                                                                                                                      10127 non-null  int64  
 3   Total_Relationship_Count                                                         

In [5]:
df.describe()
X = df.drop(['Total_Trans_Amt'], axis=1)
y = df['Total_Trans_Amt'].apply(lambda v: 0 if v <= 5000 else 1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3)

In [9]:
model = LogisticRegression(max_iter = 500)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"""
    Accuracy: {accuracy}
    Precision: {precision}
    Recall: {recall}
    F1 score: {f1}
""")


    Accuracy: 0.9012833168805529
    Precision: 0.7984344422700587
    Recall: 0.6743801652892562
    F1 score: 0.7311827956989246



In [11]:
rf = RandomForestClassifier(random_state = 42)
param_grid = {'n_estimators' : [50, 100, 150, 200]}
grid_search = GridSearchCV(rf,param_grid,scoring = 'f1')
grid_search.fit(X_train, y_train)

In [12]:
best_params = grid_search.best_params_
rf_best = RandomForestClassifier(**best_params)
rf_best.fit(X_train, y_train)

In [13]:
y_pred_rf = rf_best.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print(f"""
    Accuracy: {accuracy_rf}
    Precision: {precision_rf}
    Recall: {recall_rf}
    F1 score: {f1_rf}
""")


    Accuracy: 0.9305692662059888
    Precision: 0.9209401709401709
    Recall: 0.7123966942148761
    F1 score: 0.8033550792171481

