In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2

In [29]:
# Read the loan dataset from the working directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer='loan_data.csv')
data.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE_x,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,100002,Defaulters,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,Auto technology,24.0,low_normal,POS other with interest,365243,565,125,25,17,0
1,100003,Repayers,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,XNA,12.0,low_normal,Cash X-Sell: low,365243,716,386,536,527,1
2,100003,Repayers,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,Furniture,6.0,middle,POS industry with interest,365243,797,647,647,639,0
3,100003,Repayers,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,Consumer electronics,12.0,middle,POS household with interest,365243,2310,1980,1980,1976,1
4,100004,Repayers,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,Connectivity,4.0,middle,POS mobile without interest,365243,784,694,724,714,0


## Label Encoding the Categorical columns

In [3]:
# Names of every object-dtype column; these are the ones label-encoded next.
object_frame = data.select_dtypes(include=['object'])
cat_cols = object_frame.columns
cat_cols

Index(['TARGET', 'NAME_CONTRACT_TYPE_x', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE_x', 'NAME_INCOME_TYPE',
       'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
       'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE',
       'NAME_CONTRACT_TYPE_y', 'NAME_CASH_LOAN_PURPOSE',
       'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON',
       'NAME_TYPE_SUITE_y', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY',
       'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE',
       'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION'],
      dtype='object')

In [4]:
# Fit a separate LabelEncoder for each categorical column and keep it in
# `encoders`, keyed by column name. Re-fitting a single shared encoder would
# retain only the last column's classes, making it impossible to encode new
# inputs or to map predictions back to their original labels.
encoders = {}

for col in cat_cols:
    en = LabelEncoder()
    data[col] = en.fit_transform(data[col])
    encoders[col] = en

## Feature Selection

In [5]:
# Candidate features are every column except the target. Absolute values are
# taken so that all inputs are non-negative before the chi2 scoring
# (sklearn's chi2 rejects negative values).
# NOTE(review): chi2 on raw, unscaled magnitudes tends to favour columns with
# large values, and SK_ID_CURR looks like an identifier - confirm both are intended.
features_all = data.drop(columns='TARGET')
X = features_all.abs()
y = data['TARGET']
selection = SelectKBest(score_func=chi2, k=10)
data_select = selection.fit_transform(X, y)

In [6]:
# Pair each candidate column with its chi-square score, then keep the
# 15 highest-scoring rows.
score_table = pd.DataFrame({"columns": X.columns, "chi-sq-value": selection.scores_})
best = score_table.sort_values(by='chi-sq-value', ascending=False).head(15)
best

Unnamed: 0,columns,chi-sq-value
62,DAYS_FIRST_DRAWING,605099300.0
9,AMT_GOODS_PRICE_x,301675200.0
7,AMT_CREDIT_x,176874500.0
17,DAYS_EMPLOYED,35544160.0
6,AMT_INCOME_TOTAL,7114127.0
43,AMT_APPLICATION,6619216.0
18,DAYS_REGISTRATION,6592912.0
63,DAYS_FIRST_DUE,3311987.0
37,DAYS_LAST_PHONE_CHANGE,2971802.0
19,DAYS_ID_PUBLISH,2758406.0


In [7]:
top_columns = best['columns'].tolist()  # Column names listed in `best`, used as the model's features

## Scaling the columns

In [8]:
from sklearn.preprocessing import StandardScaler

# Fit ONE scaler on all selected columns at once. StandardScaler standardises
# each column independently, so the resulting values match a per-column loop,
# but the fitted `std` now holds the mean/scale of every selected column and
# can be reused to transform new inputs at prediction time. Re-fitting the
# same scaler inside a loop would keep only the last column's statistics.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics influence the scaling.
std = StandardScaler()
data[top_columns] = std.fit_transform(data[top_columns])

# Checking if the target is balanced

In [9]:
# Frequency of each encoded class in the target column.
target_counts = data['TARGET'].value_counts()
target_counts

TARGET
1    1291279
0     122350
Name: count, dtype: int64

##### The target column is highly imbalanced. Hence using the SMOTE technique for oversampling

In [10]:
# Model inputs are the selected feature columns; labels are the encoded target.
X, y = data[top_columns], data['TARGET']

## SMOTE Oversampling

In [11]:
from imblearn.over_sampling import SMOTE

# Seed SMOTE so the synthetic minority samples (and every metric computed
# from them) are the same on each run; 42 matches the seed used for the
# train/test split.
# NOTE(review): oversampling happens BEFORE the train/test split, so synthetic
# points interpolated from training rows can land in the test set and inflate
# test scores. Consider splitting first and resampling only the training part.
oversample = SMOTE(random_state=42)
X, y = oversample.fit_resample(X, y)

In [12]:
from collections import Counter

# Tally how many samples each class has after resampling.
counter = Counter()
counter.update(y)
print(counter)

Counter({0: 1291279, 1: 1291279})


##### The Target is now balanced

# Train test split

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn import metrics

## Model Building

In [27]:
def classification(df, algorithm, **model_params):
    """Fit an estimator on the training split and print train/test reports.

    Parameters
    ----------
    df
        Not used by the function; kept so existing calls keep working.
        The data actually used are the notebook-level ``X_train``,
        ``y_train``, ``X_test`` and ``y_test``.
    algorithm
        Estimator class (not an instance); it is instantiated here and must
        provide ``fit`` and ``predict``.
    **model_params
        Optional keyword arguments forwarded to the estimator constructor,
        e.g. ``random_state=42`` to make stochastic models reproducible.
        With none given, the estimator is built with its defaults.

    Returns
    -------
    None
        The class name and both classification reports are printed.
    """
    model = algorithm(**model_params).fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    print(algorithm.__name__)
    # First report: training split; second report: held-out test split.
    Train_Classification_Report = metrics.classification_report(y_train, train_pred)
    print(Train_Classification_Report)
    print("\n\n")
    Test_Classification_Report = metrics.classification_report(y_test, test_pred)
    print(Test_Classification_Report)

In [29]:
# classification() prints its own reports and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" to the output.
classification(data, LogisticRegression)

LogisticRegression
              precision    recall  f1-score   support

           0       0.59      0.62      0.61   1032357
           1       0.60      0.57      0.59   1033689

    accuracy                           0.60   2066046
   macro avg       0.60      0.60      0.60   2066046
weighted avg       0.60      0.60      0.60   2066046




              precision    recall  f1-score   support

           0       0.59      0.62      0.61    258922
           1       0.60      0.57      0.59    257590

    accuracy                           0.60    516512
   macro avg       0.60      0.60      0.60    516512
weighted avg       0.60      0.60      0.60    516512

None


In [30]:
# classification() prints its own reports and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" to the output.
classification(data, DecisionTreeClassifier)

DecisionTreeClassifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1032357
           1       1.00      1.00      1.00   1033689

    accuracy                           1.00   2066046
   macro avg       1.00      1.00      1.00   2066046
weighted avg       1.00      1.00      1.00   2066046




              precision    recall  f1-score   support

           0       0.95      0.95      0.95    258922
           1       0.95      0.95      0.95    257590

    accuracy                           0.95    516512
   macro avg       0.95      0.95      0.95    516512
weighted avg       0.95      0.95      0.95    516512

None


In [35]:
# classification() prints its own reports and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" to the output.
classification(data, XGBClassifier)

XGBClassifier
              precision    recall  f1-score   support

           0       0.82      0.77      0.79   1032357
           1       0.78      0.83      0.80   1033689

    accuracy                           0.80   2066046
   macro avg       0.80      0.80      0.80   2066046
weighted avg       0.80      0.80      0.80   2066046




              precision    recall  f1-score   support

           0       0.82      0.76      0.79    258922
           1       0.78      0.83      0.80    257590

    accuracy                           0.80    516512
   macro avg       0.80      0.80      0.80    516512
weighted avg       0.80      0.80      0.80    516512

None


Based on the evaluation of the models above, the Decision Tree has the best test performance. Hence, the Decision Tree is selected as the final algorithm and saved as a pickle file.

In [15]:
# Seed the tree so that the fitted model (and the file pickled from it) is the
# same on every run; an unseeded tree can break ties between equally good
# splits differently from run to run.
# NOTE(review): the training scores are a perfect 1.00, which suggests the
# unconstrained tree memorises the training data - consider limiting its depth.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

print("Train classification report")
Train_Classification_Report = metrics.classification_report(y_train, train_pred)
print(Train_Classification_Report)
print("\n\n")
print("Test classification report")
Test_Classification_Report = metrics.classification_report(y_test, test_pred)
print(Test_Classification_Report)

Train classification report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1032357
           1       1.00      1.00      1.00   1033689

    accuracy                           1.00   2066046
   macro avg       1.00      1.00      1.00   2066046
weighted avg       1.00      1.00      1.00   2066046




Test classification report
              precision    recall  f1-score   support

           0       0.94      0.94      0.94    258922
           1       0.94      0.94      0.94    257590

    accuracy                           0.94    516512
   macro avg       0.94      0.94      0.94    516512
weighted avg       0.94      0.94      0.94    516512



# Predicting the Target

In [33]:
# A single sample with one value per entry of `top_columns`.
# NOTE(review): the values look already standardised and are assumed to be in
# the same order as `top_columns` - confirm before reusing this cell.
ip = [[0.877012,1.255301,0.947656,0.051587,0.473693,-0.526915,-0.957155,0.265246,0.316398,0.704997,0.114612,-0.038348,-0.059868,0.214962,-0.504138]]

# Pass the sample with the training column names so the features are matched
# by name rather than by position alone.
sample = pd.DataFrame(ip, columns=top_columns)
classified = model.predict(sample)

# predict() returns an array; read the single prediction explicitly instead of
# relying on the truthiness of a one-element array comparison.
if classified[0] == 1:
    print("The Client is a Repayer")
else:
    print("The Client is a Defaulter")

The Client is a Defaulter




# Saving the model as pickle file

In [31]:
# `pickle` is already imported in the first cell, so it is not re-imported here.
# NOTE(review): only the classifier is saved. The label encoding and scaling
# applied to the features are not persisted, so whoever loads this file must
# reproduce that preprocessing themselves.
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)