In [59]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imPipeline
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from datetime import date, timedelta, datetime, date

In [60]:
data = pd.read_csv('../data/final_data.csv', index_col=False)

In [61]:
def calc_day_of_birth (day_num):
    today = date.today() 
    birthDay = (today + timedelta(days=day_num)).strftime('%Y-%m-%d')
    return birthDay
    
def calc_day_of_employed(day_num):
    today = date.today() 
    employedDay = (today + timedelta(days=day_num)).strftime('%Y-%m-%d')
    result = 0
    if employedDay > date.today().strftime('%Y-%m-%d') :
         result = 0
    else:
         result = employedDay
    return result

def calculate_age(born):
    born = datetime.strptime(born, '%Y-%m-%d')  # Removed the second datetime
    today = date.today()
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

    
def get_appartment(x):
    if x == 'House / apartment' :
       x= x.split(' /')[0]       
    return x
    
def get_ducational_type(x):
    if x == 'Secondary / secondary special' :
       x= x.split(' /')[0]       
    return x



In [62]:
data

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,...,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,Bad_Debt,Good_Debt,Neutral_Debt,label
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,1,1,0,0,Unknown,2.0,0,15,1,1
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,1,1,0,0,Unknown,2.0,0,14,1,1
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,1,0,0,0,Security staff,2.0,0,30,0,1
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,1,0,1,1,Sales staff,1.0,0,5,0,1
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,1,0,1,1,Sales staff,1.0,0,5,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,5149828,M,Y,Y,0,315000.0,Working,Secondary / secondary special,Married,House / apartment,...,1,0,0,0,Managers,2.0,4,8,0,0
36453,5149834,F,N,Y,0,157500.0,Commercial associate,Higher education,Married,House / apartment,...,1,0,1,1,Medicine staff,2.0,9,6,9,0
36454,5149838,F,N,Y,0,157500.0,Pensioner,Higher education,Married,House / apartment,...,1,0,1,1,Medicine staff,2.0,9,15,9,1
36455,5150049,F,N,Y,0,283500.0,Working,Secondary / secondary special,Married,House / apartment,...,1,0,0,0,Sales staff,2.0,0,8,2,1


In [63]:
data['BIRTH_DAY'] = data['DAYS_BIRTH'].apply(calc_day_of_birth)
data['EMPLOYED_DAY']   = data['DAYS_EMPLOYED'].apply(calc_day_of_employed)
#enhance housing 
data['NAME_HOUSING_TYPE'] = data['NAME_HOUSING_TYPE'].apply(get_appartment)
#calculate age 
data['age'] = data['BIRTH_DAY'].apply(calculate_age)
#enhance educational type
data['NAME_EDUCATION_TYPE'] =data['NAME_EDUCATION_TYPE'].apply(get_ducational_type)

data = data.drop(['ID','DAYS_BIRTH','FLAG_WORK_PHONE','DAYS_EMPLOYED','EMPLOYED_DAY','BIRTH_DAY', 'Bad_Debt', 'Good_Debt', 'Neutral_Debt'],axis=1)

In [64]:
# Separate features and labels
X = data.drop(['label'], axis=1)
y = data['label']

In [65]:
X

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,age
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,1,0,0,Unknown,2.0,32
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,1,0,0,Unknown,2.0,32
2,M,Y,Y,0,112500.0,Working,Secondary,Married,House,1,0,0,Security staff,2.0,58
3,F,N,Y,0,270000.0,Commercial associate,Secondary,Single / not married,House,1,1,1,Sales staff,1.0,52
4,F,N,Y,0,270000.0,Commercial associate,Secondary,Single / not married,House,1,1,1,Sales staff,1.0,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,M,Y,Y,0,315000.0,Working,Secondary,Married,House,1,0,0,Managers,2.0,47
36453,F,N,Y,0,157500.0,Commercial associate,Higher education,Married,House,1,1,1,Medicine staff,2.0,33
36454,F,N,Y,0,157500.0,Pensioner,Higher education,Married,House,1,1,1,Medicine staff,2.0,33
36455,F,N,Y,0,283500.0,Working,Secondary,Married,House,1,0,0,Sales staff,2.0,49


In [66]:
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Output the identified numerical and categorical columns
numerical_cols, categorical_cols

(['CNT_CHILDREN',
  'AMT_INCOME_TOTAL',
  'FLAG_MOBIL',
  'FLAG_PHONE',
  'FLAG_EMAIL',
  'CNT_FAM_MEMBERS',
  'age'],
 ['CODE_GENDER',
  'FLAG_OWN_CAR',
  'FLAG_OWN_REALTY',
  'NAME_INCOME_TYPE',
  'NAME_EDUCATION_TYPE',
  'NAME_FAMILY_STATUS',
  'NAME_HOUSING_TYPE',
  'OCCUPATION_TYPE'])

In [67]:
# Preprocessing for numerical data: standardization
numerical_transformer = StandardScaler()

# Preprocessing for categorical data: one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split data into train and test sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)



In [68]:
# Apply preprocessing
preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_valid = preprocessor.transform(X_valid)

# SMOTE for class imbalance (Run this part locally)
smote = SMOTE(sampling_strategy='auto', random_state=0)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Initialize models and hyperparameter grids
rf = RandomForestClassifier(random_state=0)
logistic = LogisticRegression(max_iter=1000, random_state=0)  # Increased max_iter
xgb = XGBClassifier(random_state=0, eval_metric='logloss')


param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_logistic = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l2']
}
param_grid_xgb = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7]
}
# Initialize RandomizedSearchCV
n_iter_search = 5 # Number of parameter settings that are sampled
random_search_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_grid_rf, 
                                      n_iter=n_iter_search, scoring='f1', verbose=2, random_state=0, n_jobs=-1)

random_search_logistic = RandomizedSearchCV(estimator=logistic, param_distributions=param_grid_logistic, 
                                            n_iter=n_iter_search, scoring='f1', verbose=2, random_state=0, n_jobs=-1)

random_search_xgb = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid_xgb, 
                                       n_iter=n_iter_search, scoring='f1', verbose=2, random_state=0, n_jobs=-1)

# Initialize dictionary to store results
results_list = []

# Fit and evaluate
for model in [random_search_rf, random_search_logistic, random_search_xgb]:
    model_name = type(model.estimator).__name__
    print(f"Tuning {model_name}...")
    model.fit(X_train, y_train)
    
    # Use the best estimator to make predictions
    best_model = model.best_estimator_
    y_pred = best_model.predict(X_valid)
    
    # Compute metrics
    f1 = f1_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred)
    recall = recall_score(y_valid, y_pred)
    
    # Create a dictionary of results
    result_dict = {
        'Model': model_name,
        'Best Parameters': model.best_params_,
        'F1 Score': f1,
        'Precision': precision,
        'Recall': recall
    }
    
    # Append the individual dictionary to the results list
    results_list.append(result_dict)

    print(f"Best Parameters: {model.best_params_}")
    print(f"F1 Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}\n")

# Convert the list of dictionaries to a DataFrame
model_results_df = pd.DataFrame(results_list)

Tuning RandomForestClassifier...
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   6.8s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   7.6s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   7.6s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   7.7s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   7.7s
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=50; total time=   5.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  12.9s
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=50; total time=   6.3s
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=50; total time=   6.5s
[CV]

  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[CV] END ....learning_rate=0.1, max_depth=7, n_estimators=50; total time=   4.5s
[CV] END ....learning_rate=0.1, max_depth=7, n_estimators=50; total time=   4.5s
[CV] END ....learning_rate=0.1, max_depth=7, n_estimators=50; total time=   4.6s


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[CV] END ....learning_rate=0.1, max_depth=7, n_estimators=50; total time=   4.7s
[CV] END ....learning_rate=0.1, max_depth=7, n_estimators=50; total time=   4.7s


  if is_sparse(data):
  if is_sparse(data):


[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   8.6s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   8.6s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   8.8s


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   9.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   9.1s


  if is_sparse(data):
  if is_sparse(data):


[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=200; total time=  13.5s
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=200; total time=  13.7s
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=200; total time=  13.6s


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=200; total time=  13.7s
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=200; total time=  13.7s


  if is_sparse(data):
  if is_sparse(data):


[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=200; total time=  18.2s
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=200; total time=  18.3s
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=200; total time=  18.2s
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=200; total time=  18.2s
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=200; total time=  18.3s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=  13.0s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=  13.0s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=  13.2s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=  12.8s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=  13.0s


  if is_sparse(data):


Best Parameters: {'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.05}
F1 Score: 0.995660260384377
Precision: 0.9957288509231194
Recall: 0.9955916792946687



In [69]:
model_results_df

Unnamed: 0,Model,Best Parameters,F1 Score,Precision,Recall
0,RandomForestClassifier,"{'n_estimators': 50, 'min_samples_split': 5, '...",0.996626,0.99642,0.996832
1,LogisticRegression,"{'penalty': 'l2', 'C': 100}",0.781859,0.995737,0.643615
2,XGBClassifier,"{'n_estimators': 200, 'max_depth': 7, 'learnin...",0.99566,0.995729,0.995592


**RandomForestClassifier**
* F1 Score: 0.997: This is an impressive F1 score, indicating a very high balance between precision and recall. In the context of deciding who to give a loan, this means that the model's decisions are both accurate and comprehensive
* Precision: 0.996: Almost 99.64% of the clients that the model labels as 'bad' are actually 'bad'. This high precision is critical in a financial setting as misclassifying a 'good' client as 'bad' can lead to lost business opportunities.
* Recall: 0.997: Almost 99.68% of the actual 'bad' clients are identified by the model. This is equally important because failing to identify a 'bad' client can lead to financial losses if they default on their loan.

**Logistic Regression**
* F1 Score: 0.782: The F1 score is significantly lower than the Random Forest model. This suggests that, while still decent, the balance between precision and recall isn't as high with the Logistic Regression model.
* Precision: 0.996: The precision is very high, indicating that almost 99.57% of the clients that the model labels as 'bad' are genuinely 'bad'. This is good for ensuring not to mislabel many 'good' clients.
* Recall: 0.644: This is where the Logistic Regression model falls short. A recall of 64.36% means that the model fails to identify a significant portion (around 35.64%) of 'bad' clients. From a financial perspective, this could be risky as these unidentified 'bad' clients might default on their loans.

**XGBClassifier**
* F1 Score: 0.996: The F1 score is very close to that of the Random Forest model, suggesting a high balance between precision and recall.
* Precision: 0.996: Again, a high precision, indicating a low rate of false positives.
* Recall: 0.996: A very high recall, indicating the model identifies almost all 'bad' clients.


**Implications:**
* Random Forest and XGBClassifier are both top-performing models with high precision and recall. They can be considered reliable in predicting 'bad' clients, thus minimizing potential loan defaults.
* Logistic Regression, while having high precision, lacks in recall, which could lead to potential risks in missing out on identifying clients who might default on their loans.

