In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support


In [2]:
df = pd.read_csv('dataset_after_EDA.csv')

In [3]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,...,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,MARITALSTATUS,EDUCATION,GENDER,last_prod_enq2,first_prod_enq2,Approved_Flag
0,0,0.0,0.0,0,0.0,0,0,0,4,1,...,0.0,0.0,1,0,Married,12TH,M,PL,PL,P2
1,1,0.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0,0,Single,GRADUATE,F,ConsumerLoan,ConsumerLoan,P2
2,2,0.125,0.0,0,0.0,1,0,0,0,2,...,0.0,0.0,1,0,Married,SSC,M,ConsumerLoan,others,P2


In [4]:
df.shape

(42064, 44)

# Performing Encoding to the categorical data

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 42064 non-null  int64  
 1   pct_tl_open_L6M            42064 non-null  float64
 2   pct_tl_closed_L6M          42064 non-null  float64
 3   Tot_TL_closed_L12M         42064 non-null  int64  
 4   pct_tl_closed_L12M         42064 non-null  float64
 5   Tot_Missed_Pmnt            42064 non-null  int64  
 6   CC_TL                      42064 non-null  int64  
 7   Home_TL                    42064 non-null  int64  
 8   PL_TL                      42064 non-null  int64  
 9   Secured_TL                 42064 non-null  int64  
 10  Unsecured_TL               42064 non-null  int64  
 11  Other_TL                   42064 non-null  int64  
 12  Age_Oldest_TL              42064 non-null  int64  
 13  Age_Newest_TL              42064 non-null  int

In [6]:
df['MARITALSTATUS'].unique()
df['EDUCATION'].unique()
df['GENDER'].unique()
df['last_prod_enq2'].unique()
df['first_prod_enq2'].unique()



array(['PL', 'ConsumerLoan', 'others', 'AL', 'HL', 'CC'], dtype=object)

Ordinal feature -- Education column

SSC :1

12TH :2

graduate : 3

Under Graduate = 3

post graduate = 4

other = 1

professional = 3


In [7]:
#Performing Label encoding best on the above descripiton
df.loc[df['EDUCATION'] == 'SSC',['EDUCATION']] = 1
df.loc[df['EDUCATION'] == '12TH',['EDUCATION']] = 2
df.loc[df['EDUCATION'] == 'GRADUATE',['EDUCATION']] = 3
df.loc[df['EDUCATION'] == 'UNDER GRADUATE',['EDUCATION']] = 3
df.loc[df['EDUCATION'] == 'POST-GRADUATE',['EDUCATION']] = 4
df.loc[df['EDUCATION'] == 'OTHERS',['EDUCATION']] = 1
df.loc[df['EDUCATION'] == 'PROFESSIONAL',['EDUCATION']] = 3


In [8]:
df['EDUCATION'].unique()

array([2, 3, 1, 4], dtype=object)

In [9]:
#performing one hot encoding to other column:
df=pd.get_dummies(df,columns=['MARITALSTATUS','GENDER','last_prod_enq2','first_prod_enq2'])

# Data Preprocessing

In [10]:
y = df['Approved_Flag']
X = df.drop(columns=['Approved_Flag'],axis=1)

In [11]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Random Forest

In [12]:
#Initally direclty feeding without any normailizing the data or doing any preprocessing 
rf_classifier = RandomForestClassifier(n_estimators = 200, random_state= 42)
rf_classifier.fit(X_train,y_train)
y_pred = rf_classifier.predict(X_test)


In [13]:
#finding all types of evaluation metircs
accuracy=accuracy_score(y_test,y_pred)
print(f'accuracy={accuracy}')
print()
precison,recall,fscore,_=precision_recall_fscore_support(y_test,y_pred)
for i, v in enumerate(['p1','p2','p3','p4']):
    print(f'Class {v}')
    print(f'Precision:{precison[i]}')
    print(f'recall:{recall[i]}')
    print(f'f1_score:{fscore[i]}')
    print()

accuracy=0.7648876738381077

Class p1
Precision:0.8439716312056738
recall:0.7041420118343196
f1_score:0.767741935483871

Class p2
Precision:0.7940283400809717
recall:0.9330029732408325
f1_score:0.8579239952610953

Class p3
Precision:0.4444444444444444
recall:0.20528301886792452
f1_score:0.2808466701084151

Class p4
Precision:0.7224926971762414
recall:0.7210884353741497
f1_score:0.7217898832684825



In [14]:
df['EDUCATION'] = df['EDUCATION'].astype(int)

# xgboost

In [15]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
# Perform label encoding on the target column
label_encoder = LabelEncoder()
df['Approved_Flag'] = label_encoder.fit_transform(df['Approved_Flag'])

# Separate features (X) and target (y)
y = df['Approved_Flag']
X = df.drop(['Approved_Flag'], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4)

# Train the classifier
xgb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision, recall, and F1 score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()


Accuracy: 0.78
Class p1:
Precision: 0.8243386243386244
Recall: 0.7682445759368837
F1 Score: 0.7953037263910159

Class p2:
Precision: 0.8228051391862955
Recall: 0.9139742319127849
F1 Score: 0.8659968072119447

Class p3:
Precision: 0.4692874692874693
Recall: 0.28830188679245283
F1 Score: 0.35717625058438524

Class p4:
Precision: 0.7219047619047619
Recall: 0.7366375121477162
F1 Score: 0.7291967291967293



In [16]:
#Since we can see accuracy of random_Forest and xgboost
#random_state=0.76
#xgboost=0.78

# Hyperparameter tunning in xgboost

In [17]:
# from sklearn.model_selection import GridSearchCV
# from xgboost import XGBClassifier

# # Define the XGBoost classifier
# xgb = XGBClassifier()

# # Define the parameter grid
# param_grid = {
#     'colsample_bytree': [0.1, 0.3, 0.5, 0.7, 0.9],
#     'learning_rate': [0.001, 0.01, 0.1, 1],
#     'max_depth': [3, 5, 8, 10],
#     'alpha': [1, 10, 100],
#     'n_estimators': [10, 50, 100]
# }

# # Perform GridSearchCV with 5-fold cross-validation
# grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5)
# grid_search.fit(X_train, y_train)  

# # Print the best parameters and best score
# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)


In [20]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
# Perform label encoding on the target column
label_encoder = LabelEncoder()
df['Approved_Flag'] = label_encoder.fit_transform(df['Approved_Flag'])

# Separate features (X) and target (y)
y = df['Approved_Flag']
X = df.drop(['Approved_Flag'], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize XGBoost classifier
xgb_classifier = xgb.XGBClassifier(
    objective='multi:softmax',
    colsample_bytree=0.9,
    learning_rate=1,
    max_depth=3,
    alpha=10,
    n_estimators=100,
    num_class=4)

# Train the classifier
xgb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision, recall, and F1 score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()


Accuracy: 0.78
Class p1:
Precision: 0.8361702127659575
Recall: 0.7751479289940828
F1 Score: 0.804503582395087

Class p2:
Precision: 0.8232574679943101
Recall: 0.9177403369672944
F1 Score: 0.8679351391883026

Class p3:
Precision: 0.46420047732696895
Recall: 0.29358490566037737
F1 Score: 0.3596856218215442

Class p4:
Precision: 0.744807121661721
Recall: 0.7317784256559767
F1 Score: 0.738235294117647

