In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
df = pd.read_csv('training_data.csv')
df.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [3]:
df.describe()

Unnamed: 0,Id,Income,Age,Experience,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
count,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0
mean,126000.5,4997117.0,49.954071,10.084437,6.333877,11.997794,0.123
std,72746.278255,2878311.0,17.063855,6.00259,3.647053,1.399037,0.328438
min,1.0,10310.0,21.0,0.0,0.0,10.0,0.0
25%,63000.75,2503015.0,35.0,5.0,3.0,11.0,0.0
50%,126000.5,5000694.0,50.0,10.0,6.0,12.0,0.0
75%,189000.25,7477502.0,65.0,15.0,9.0,13.0,0.0
max,252000.0,9999938.0,79.0,20.0,14.0,14.0,1.0


In [4]:
df = df.drop({'Id'}, axis='columns')
df.head()

Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


### Check for Imbalance in Dataset

In [5]:
# Divide by risk_flag
df_flag_0 = df[df['Risk_Flag'] == 0]
df_flag_1 = df[df['Risk_Flag'] == 1]

In [6]:
df_flag_0.shape

(221004, 12)

In [7]:
df_flag_1.shape

(30996, 12)

### Use SMOTE to handle imbalance

In [8]:
def print_unique_col_values(data_frame):
    for column in df:
        if df[column].dtypes=='object': 
            print(f'{column} : {df[column].unique()}')

In [9]:
print_unique_col_values(df)

Married/Single : ['single' 'married']
House_Ownership : ['rented' 'norent_noown' 'owned']
Car_Ownership : ['no' 'yes']
Profession : ['Mechanical_engineer' 'Software_Developer' 'Technical_writer'
 'Civil_servant' 'Librarian' 'Economist' 'Flight_attendant' 'Architect'
 'Designer' 'Physician' 'Financial_Analyst' 'Air_traffic_controller'
 'Politician' 'Police_officer' 'Artist' 'Surveyor' 'Design_Engineer'
 'Chemical_engineer' 'Hotel_Manager' 'Dentist' 'Comedian'
 'Biomedical_Engineer' 'Graphic_Designer' 'Computer_hardware_engineer'
 'Petroleum_Engineer' 'Secretary' 'Computer_operator'
 'Chartered_Accountant' 'Technician' 'Microbiologist' 'Fashion_Designer'
 'Aviator' 'Psychologist' 'Magistrate' 'Lawyer' 'Firefighter' 'Engineer'
 'Official' 'Analyst' 'Geologist' 'Drafter' 'Statistician' 'Web_designer'
 'Consultant' 'Chef' 'Army_officer' 'Surgeon' 'Scientist' 'Civil_engineer'
 'Industrial_Engineer' 'Technology_specialist']
CITY : ['Rewa' 'Parbhani' 'Alappuzha' 'Bhubaneswar' 'Tiruchirappalli[

In [10]:
df['Married/Single'].replace({
    'single' : 0,
    'married' : 1
}, inplace=True)

In [11]:
df['Car_Ownership'].replace({
    'no' : 0,
    'yes' : 1
}, inplace=True)

In [12]:
df.head()

Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1303834,23,3,0,rented,0,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,7574516,40,10,0,rented,0,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3991815,66,4,1,rented,0,Technical_writer,Alappuzha,Kerala,4,10,0
3,6256451,41,2,0,rented,1,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5768871,47,11,0,rented,0,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [13]:
col_to_scale = ['Age', 'Experience', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS']

scaler = MinMaxScaler()

df[col_to_scale] = scaler.fit_transform(df[col_to_scale])

In [14]:
df.sample(5)

Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
28926,3984296,0.5,0.2,0,rented,1,Graphic_Designer,Jamnagar,Gujarat,0.285714,1.0,0
42981,1535929,0.603448,0.45,0,rented,0,Flight_attendant,Nagercoil,Tamil_Nadu,0.5,0.25,0
188653,2691866,0.482759,0.85,0,rented,0,Surgeon,Mira-Bhayandar,Maharashtra,0.785714,0.5,0
230879,3989805,0.637931,0.25,0,owned,0,Surgeon,Kalyan-Dombivli,Maharashtra,0.357143,1.0,0
49992,5824731,0.689655,0.35,0,rented,1,Financial_Analyst,Sirsa,Haryana,0.428571,1.0,0


In [15]:
df_new = pd.get_dummies(df, drop_first=True)
df_new.head()

Unnamed: 0,Income,Age,Experience,Married/Single,Car_Ownership,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag,House_Ownership_owned,House_Ownership_rented,...,STATE_Punjab,STATE_Rajasthan,STATE_Sikkim,STATE_Tamil_Nadu,STATE_Telangana,STATE_Tripura,STATE_Uttar_Pradesh,STATE_Uttar_Pradesh[5],STATE_Uttarakhand,STATE_West_Bengal
0,1303834,0.034483,0.15,0,0,0.214286,0.75,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,7574516,0.327586,0.5,0,0,0.642857,0.75,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3991815,0.775862,0.2,1,0,0.285714,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,6256451,0.344828,0.1,0,1,0.142857,0.5,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5768871,0.448276,0.55,0,0,0.214286,1.0,1,0,1,...,0,0,0,1,0,0,0,0,0,0


In [16]:
X = df_new.drop('Risk_Flag', axis='columns')
y = df_new.Risk_Flag

In [17]:
y.value_counts()

0    221004
1     30996
Name: Risk_Flag, dtype: int64

In [18]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_sample(X,y)

In [19]:
y_sm.value_counts()

0    221004
1    221004
Name: Risk_Flag, dtype: int64

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=15, stratify=y_sm)

In [21]:
y_test.value_counts()

1    44201
0    44201
Name: Risk_Flag, dtype: int64

In [1]:
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import confusion_matrix, classification_report

ModuleNotFoundError: No module named 'tensorflow'

### Train the model

In [22]:
model = RandomForestClassifier()

In [23]:
model.fit(X_train, y_train)

RandomForestClassifier()

In [24]:
model.score(X_test, y_test)

0.9400352933191557

### Save model

In [25]:
import joblib

In [26]:
joblib.dump(model, 'loan_prediction_joblib')

['loan_prediction_joblib']

### Import Test data, clean and run predictions using saved model

In [27]:
df_td = pd.read_csv('test_data.csv')
df_td.head()

Unnamed: 0,ID,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS
0,1,7393090,59,19,single,rented,no,Geologist,Malda,West Bengal,4,13
1,2,1215004,25,5,single,rented,no,Firefighter,Jalna,Maharashtra,5,10
2,3,8901342,50,12,single,rented,no,Lawyer,Thane,Maharashtra,9,14
3,4,1944421,49,9,married,rented,yes,Analyst,Latur,Maharashtra,3,12
4,5,13429,25,18,single,rented,yes,Comedian,Berhampore,West Bengal,13,11


In [31]:
df_td1 = df_td.drop({'ID'}, axis='columns')
df_td1.head()

Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS
0,7393090,59,19,single,rented,no,Geologist,Malda,West Bengal,4,13
1,1215004,25,5,single,rented,no,Firefighter,Jalna,Maharashtra,5,10
2,8901342,50,12,single,rented,no,Lawyer,Thane,Maharashtra,9,14
3,1944421,49,9,married,rented,yes,Analyst,Latur,Maharashtra,3,12
4,13429,25,18,single,rented,yes,Comedian,Berhampore,West Bengal,13,11


In [32]:
df_test_dummies = pd.get_dummies(df_td1, drop_first=True)
df_test_dummies.head()

Unnamed: 0,Income,Age,Experience,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Married/Single_single,House_Ownership_owned,House_Ownership_rented,Car_Ownership_yes,Profession_Analyst,...,STATE_Punjab,STATE_Rajasthan,STATE_Sikkim,STATE_Tamil Nadu,STATE_Telangana,STATE_Tripura,STATE_Uttar Pradesh,STATE_Uttar Pradesh[5],STATE_Uttarakhand,STATE_West Bengal
0,7393090,59,19,4,13,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1215004,25,5,5,10,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8901342,50,12,9,14,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1944421,49,9,3,12,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,13429,25,18,13,11,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1


##### Make predictions

In [33]:
pr = joblib.load('loan_prediction_joblib')
test_pr = pr.predict(df_test_dummies)

In [34]:
df_td['Risk_Flag'] = test_pr
df_td.head()

Unnamed: 0,ID,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,7393090,59,19,single,rented,no,Geologist,Malda,West Bengal,4,13,0
1,2,1215004,25,5,single,rented,no,Firefighter,Jalna,Maharashtra,5,10,0
2,3,8901342,50,12,single,rented,no,Lawyer,Thane,Maharashtra,9,14,0
3,4,1944421,49,9,married,rented,yes,Analyst,Latur,Maharashtra,3,12,0
4,5,13429,25,18,single,rented,yes,Comedian,Berhampore,West Bengal,13,11,0


In [35]:
df_td.to_csv('test_data_with_predicitons.csv')