In [35]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score
import matplotlib.pyplot as plt # data visualization
%matplotlib inline

In [252]:
df=pd.read_csv('Documents/Training Data.csv')

In [253]:
df.shape

(252000, 13)

In [254]:
df.columns

Index(['Id', 'Income', 'Age', 'Experience', 'Married/Single',
       'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE',
       'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS', 'Risk_Flag'],
      dtype='object')

In [255]:
df['Age'].describe()

count    252000.000000
mean         49.954071
std          17.063855
min          21.000000
25%          35.000000
50%          50.000000
75%          65.000000
max          79.000000
Name: Age, dtype: float64

In [256]:
def get_agegrp(age):
    if age<18:
        return '<18'
    elif (age>=18) & (age<=24):
        return '18-24'
    elif (age>=25) & (age<=34):
        return '25-34'
    elif (age>=35) & (age<=44):
        return '35-44'
    elif (age>=45) & (age<=54):
        return '45-54'
    elif (age>=55) & (age<=64):
        return '55-64' 
    else:
        return '>=65'

In [257]:
df.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [260]:
def get_exp(exp):
    if (exp>=2) & (exp<=5):
        return '2-5'
    elif (exp>=6) & (exp<=10):
        return '6-10'
    elif (exp>10) :
        return '>10'
    else:
        return exp

In [261]:
df.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [262]:
df['age_group']=df['Age'].apply(get_agegrp)
df['exp_group']=df['Experience'].apply(get_exp)
df['curr_job_group']=df['CURRENT_JOB_YRS'].apply(get_exp)
m_df=pd.get_dummies(df['Married/Single'])
h_df=pd.get_dummies(df['House_Ownership'])
c_df=pd.get_dummies(df['Car_Ownership'],prefix='car')
a_df=pd.get_dummies(df['age_group'],prefix='age')
e_df=pd.get_dummies(df['exp_group'],prefix='exp')
cj_df=pd.get_dummies(df['curr_job_group'],prefix='cur_job')
s_df=pd.get_dummies(df['STATE'])


In [263]:
df['Profession'].nunique()

51

In [264]:
res_df=pd.concat([df,m_df,h_df,c_df,a_df,e_df,cj_df,s_df],axis=1)

In [265]:
res_df.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,...,Punjab,Rajasthan,Sikkim,Tamil_Nadu,Telangana,Tripura,Uttar_Pradesh,Uttar_Pradesh[5],Uttarakhand,West_Bengal
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,...,0,0,0,0,0,0,0,0,0,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,...,0,0,0,0,0,0,0,0,0,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,...,0,0,0,0,0,0,0,0,0,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,...,0,0,0,0,0,0,0,0,0,0
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,...,0,0,0,1,0,0,0,0,0,0


In [266]:
res_df['STATE'].nunique()

29

In [267]:
res_df.columns

Index(['Id', 'Income', 'Age', 'Experience', 'Married/Single',
       'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE',
       'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS', 'Risk_Flag', 'age_group',
       'exp_group', 'curr_job_group', 'married', 'single', 'norent_noown',
       'owned', 'rented', 'car_no', 'car_yes', 'age_18-24', 'age_25-34',
       'age_35-44', 'age_45-54', 'age_55-64', 'age_>=65', 'exp_0', 'exp_1',
       'exp_2-5', 'exp_6-10', 'exp_>10', 'cur_job_0', 'cur_job_1',
       'cur_job_2-5', 'cur_job_6-10', 'cur_job_>10', 'Andhra_Pradesh', 'Assam',
       'Bihar', 'Chandigarh', 'Chhattisgarh', 'Delhi', 'Gujarat', 'Haryana',
       'Himachal_Pradesh', 'Jammu_and_Kashmir', 'Jharkhand', 'Karnataka',
       'Kerala', 'Madhya_Pradesh', 'Maharashtra', 'Manipur', 'Mizoram',
       'Odisha', 'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil_Nadu',
       'Telangana', 'Tripura', 'Uttar_Pradesh', 'Uttar_Pradesh[5]',
       'Uttarakhand', 'West_Bengal'],
      dtype='

In [268]:
y.value_counts(normalize=True)

0    0.877
1    0.123
Name: Risk_Flag, dtype: float64

In [269]:
sel_cols=['Id', 'Income',  'single', 'norent_noown',
       'owned', 'rented', 'car_yes', 'age_18-24', 'age_25-34',
       'age_35-44', 'age_45-54', 'age_55-64', 'age_>=65', 'exp_0', 'exp_1',
       'exp_2-5', 'exp_6-10', 'exp_>10', 'cur_job_0', 'cur_job_1',
       'cur_job_2-5', 'cur_job_6-10', 'cur_job_>10']

In [270]:
X=res_df[sel_cols]
X.set_index('Id',inplace=True)

In [271]:
y=res_df['Risk_Flag']


In [272]:
train_X,val_X,train_y,val_y=train_test_split(X,y,test_size=0.3,random_state=1)

In [273]:
##Model
rf=RandomForestClassifier(random_state=1,class_weight='balanced')
rf.fit(train_X,train_y)

RandomForestClassifier(class_weight='balanced', random_state=1)

In [274]:
rf.score(val_X,val_y)


0.8805687830687831

In [275]:
pred=rf.predict(val_X)


In [276]:
confusion_matrix(val_y,pred)


array([[58996,  7302],
       [ 1727,  7575]], dtype=int64)

In [277]:
print(classification_report(val_y,pred))


              precision    recall  f1-score   support

           0       0.97      0.89      0.93     66298
           1       0.51      0.81      0.63      9302

    accuracy                           0.88     75600
   macro avg       0.74      0.85      0.78     75600
weighted avg       0.91      0.88      0.89     75600



In [278]:
roc_auc_score(val_y,pred)


0.8521009664416056

In [279]:
imp_df=pd.DataFrame({'Columns':X.columns,'Importance':rf.feature_importances_})

In [280]:
imp_df

Unnamed: 0,Columns,Importance
0,Income,0.916177
1,single,0.009067
2,norent_noown,0.002689
3,owned,0.002539
4,rented,0.003662
5,car_yes,0.010731
6,age_18-24,0.004745
7,age_25-34,0.005632
8,age_35-44,0.004636
9,age_45-54,0.005058


### Predicting Test Data 

In [281]:
test_df=pd.read_csv('Documents/Test Data.csv')

In [282]:
test_df.shape

(28000, 12)

In [283]:
test_df.columns

Index(['ID', 'Income', 'Age', 'Experience', 'Married/Single',
       'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE',
       'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS'],
      dtype='object')

In [284]:
test_df['age_group']=test_df['Age'].apply(get_agegrp)
test_df['exp_group']=test_df['Experience'].apply(get_exp)
test_df['curr_job_group']=test_df['CURRENT_JOB_YRS'].apply(get_exp)
m_test_df=pd.get_dummies(test_df['Married/Single'])
h_test_df=pd.get_dummies(test_df['House_Ownership'])
c_test_df=pd.get_dummies(test_df['Car_Ownership'],prefix='car')
a_test_df=pd.get_dummies(test_df['age_group'],prefix='age')
e_test_df=pd.get_dummies(test_df['exp_group'],prefix='exp')
cj_test_df=pd.get_dummies(test_df['curr_job_group'],prefix='cur_job')
s_test_df=pd.get_dummies(test_df['STATE'])
test_res_df=pd.concat([test_df,m_test_df,h_test_df,c_test_df,a_test_df,e_test_df,cj_test_df,s_test_df],axis=1)

In [285]:
test_res_df.shape

(28000, 67)

In [286]:
test_res_df.head()

Unnamed: 0,ID,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,...,Punjab,Rajasthan,Sikkim,Tamil Nadu,Telangana,Tripura,Uttar Pradesh,Uttar Pradesh[5],Uttarakhand,West Bengal
0,1,7393090,59,19,single,rented,no,Geologist,Malda,West Bengal,...,0,0,0,0,0,0,0,0,0,1
1,2,1215004,25,5,single,rented,no,Firefighter,Jalna,Maharashtra,...,0,0,0,0,0,0,0,0,0,0
2,3,8901342,50,12,single,rented,no,Lawyer,Thane,Maharashtra,...,0,0,0,0,0,0,0,0,0,0
3,4,1944421,49,9,married,rented,yes,Analyst,Latur,Maharashtra,...,0,0,0,0,0,0,0,0,0,0
4,5,13429,25,18,single,rented,yes,Comedian,Berhampore,West Bengal,...,0,0,0,0,0,0,0,0,0,1


In [287]:
test_res_df.rename(columns={'ID':'Id'},inplace=True)

In [288]:
test_res_df.shape

(28000, 67)

In [290]:
test_X=test_res_df[sel_cols]
test_X.set_index('Id',inplace=True)
test_pred=rf.predict(test_X)


In [293]:
sub_df=pd.DataFrame({'id':test_df['ID'].values,'risk_flag':test_pred})

In [295]:
sub_df.to_csv('resultant_data.csv',index=False)