In [1]:
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC

In [2]:
# Read the raw training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('Training Data.csv', 'Test Data.csv'))

In [3]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,Id,income,age,experience,married,house_ownership,car_ownership,profession,city,state,current_job_years,current_house_years,risk_flag
0,1,1303835,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [4]:
# Normalise the free-text categorical columns of the test frame so that words
# are joined by underscores, the spelling the training data uses
# (e.g. 'Mechanical_engineer'). The vectorised string accessor replaces the
# row-by-row .loc loop and passes missing values through instead of failing.
for col in ('profession', 'state', 'city'):
    test[col] = test[col].str.replace(' ', '_', regex=False)

# prof:  profession -> number of training rows
# count: profession -> number of training rows with risk_flag == 1
# sort=False keeps professions in order of first appearance.
prof = train.profession.value_counts().to_dict()
count = (
    train['risk_flag'].eq(1)
    .groupby(train['profession'], sort=False)
    .sum()
    .to_dict()
)

# count_norm: profession -> share of that profession's training rows with risk_flag == 1.
# NOTE(review): the rates are computed from every training row, including rows
# that later end up in the hold-out split, so a feature built from them can
# leak the target into validation scores -- confirm whether that is acceptable.
count_norm = {job: count[job] / prof[job] for job in count}
print(count_norm)

{'Mechanical_engineer': 0.11155836687751582, 'Software_Developer': 0.1484266772214526, 'Technical_writer': 0.134167468719923, 'Civil_servant': 0.11579424427826875, 'Librarian': 0.11257562662057044, 'Economist': 0.09927837305926088, 'Flight_attendant': 0.12363494539781592, 'Architect': 0.13120034356882113, 'Designer': 0.10917790343627665, 'Physician': 0.11918751049185831, 'Financial_Analyst': 0.10315463518482679, 'Air_traffic_controller': 0.1353910244271918, 'Politician': 0.11225728155339806, 'Police_officer': 0.16405163853028798, 'Artist': 0.1226085167660975, 'Surveyor': 0.15146372507424694, 'Design_Engineer': 0.1069993656164094, 'Chemical_engineer': 0.11162343900096061, 'Hotel_Manager': 0.13538045577443028, 'Dentist': 0.109577582601422, 'Comedian': 0.11960448754516068, 'Biomedical_Engineer': 0.12755997659449972, 'Graphic_Designer': 0.11536972512582269, 'Computer_hardware_engineer': 0.12844378257632166, 'Petroleum_Engineer': 0.08510216226939099, 'Secretary': 0.13040901007705988, 'Compu

In [5]:
def add_feats(df_train, df_test, rates=None):
    """Add a ``prob`` column holding the per-profession rate for every row.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames with a ``profession`` column. Both are modified in place.
    rates : dict, optional
        Mapping ``profession -> value`` written into ``prob``. When omitted,
        the notebook-level ``count_norm`` dictionary is used.

    Returns
    -------
    tuple
        ``(df_train, df_test)`` -- the same objects that were passed in.

    Raises
    ------
    KeyError
        If a frame contains a profession that has no entry in ``rates``.
    """
    if rates is None:
        rates = count_norm

    for frame in (df_train, df_test):
        # Validate first so an unknown profession fails loudly instead of
        # silently producing NaN in the feature column.
        missing = set(frame['profession'].unique()) - set(rates)
        if missing:
            raise KeyError(
                f"no rate available for profession(s): {sorted(missing, key=str)}"
            )
        # One vectorised lookup instead of a per-row .loc assignment.
        frame['prob'] = frame['profession'].map(rates)

    return df_train, df_test
    
# Attach the ``prob`` feature to both frames and rebind train/test to the returned frames.
train, test = add_feats(train, test)

In [6]:
def clean(df_train, df_test):
    """Encode the categorical columns and standardise the model features.

    Steps, applied to both frames:

    1. The string labels in ``married``, ``house_ownership`` and
       ``car_ownership`` are replaced by small integer codes.
    2. ``profession``, ``city`` and ``state`` are label-encoded. Each encoder
       is fitted on the training categories plus any category that occurs
       only in the test frame (each such category is printed), so both
       frames share one encoding.
    3. Every feature column is standardised with a ``StandardScaler`` fitted
       on the training frame only.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing all feature columns, including ``prob``.

    Returns
    -------
    tuple
        ``(df_train, df_test)`` as new, transformed frames.
    """
    label_codes = {
        'married': {'single': 0, 'married': 1},
        'house_ownership': {'rented': 0, 'owned': 1, 'norent_noown': 2},
        'car_ownership': {'no': 0, 'yes': 1},
    }
    feature_cols = ['income', 'age', 'experience', 'married', 'house_ownership',
                    'car_ownership', 'profession', 'city', 'state', 'current_job_years',
                    'current_house_years', 'prob']

    # replace() returns a new frame, so the column assignments below act on
    # these copies rather than on the frames the caller passed in.
    for col, codes in label_codes.items():
        df_train = df_train.replace({col: codes})
        df_test = df_test.replace({col: codes})

    for col in ('profession', 'city', 'state'):
        categories = df_train[col].unique().tolist()
        for value in df_test[col].unique():
            if value not in categories:
                # Report categories that appear in the test frame only.
                print(value)
                categories.append(value)

        encoder = LabelEncoder()
        encoder.fit(categories)
        df_train[col] = encoder.transform(df_train[col])
        df_test[col] = encoder.transform(df_test[col])

    # NOTE(review): the label-encoded columns are scaled together with the
    # numeric ones, exactly as before -- confirm this is intended.
    scaler = StandardScaler()
    scaler.fit(df_train[feature_cols])
    df_train[feature_cols] = scaler.transform(df_train[feature_cols])
    df_test[feature_cols] = scaler.transform(df_test[feature_cols])

    return df_train, df_test

In [7]:
# Encode and scale both frames, then preview the transformed training data.
train, test = clean(train, test)
train.head()

Unnamed: 0,Id,income,age,experience,married,house_ownership,car_ownership,profession,city,state,current_job_years,current_house_years,risk_flag,prob
0,1,-1.283145,-1.579603,-1.180232,-0.3372,-0.276304,-0.657129,0.524374,1.007167,-0.086313,-0.914131,0.716356,0,-0.706047
1,2,0.895457,-0.583343,-0.014067,-0.3372,-0.276304,-0.657129,1.20333,0.746867,0.020384,0.731036,0.716356,0,1.569044
2,3,-0.349269,0.940347,-1.013637,2.965599,-0.276304,-0.657129,1.474912,-1.628364,-0.193011,-0.639936,-1.427981,0,0.689129
3,4,0.437526,-0.52474,-1.346827,-0.3372,-0.276304,1.521772,1.20333,-1.129457,0.340477,-1.188325,0.001577,1,1.569044
4,5,0.268128,-0.173119,0.152528,-0.3372,-0.276304,-0.657129,-0.969327,1.495228,0.873965,-0.914131,1.431135,1,-0.444657


In [8]:
# Preview the first rows of the transformed test frame.
test.head()

Unnamed: 0,id,income,age,experience,married,house_ownership,car_ownership,profession,city,state,current_job_years,current_house_years,prob
0,1,0.832425,0.530123,1.485289,-0.3372,-0.276304,-0.657129,0.049106,0.24796,1.514151,-0.639936,0.716356,1.312153
1,2,-1.314007,-1.462396,-0.847042,-0.3372,-0.276304,-0.657129,-0.086685,-0.29433,0.020384,-0.365742,-1.427981,0.789177
2,3,1.356432,0.002692,0.319123,-0.3372,-0.276304,-0.657129,0.320688,1.430153,0.020384,0.731036,1.431135,0.401989
3,4,-1.060588,-0.055912,-0.180662,2.965599,-0.276304,1.521772,-1.648283,0.139502,0.020384,-0.914131,0.001577,-0.094704
4,5,-1.731466,-1.462396,1.318694,-0.3372,-0.276304,1.521772,-0.901432,-1.292144,1.514151,1.827814,-0.713202,-0.209532


In [2]:
# Persist the preprocessed frames once and reload them from disk, so the
# modelling cells can be run from here without repeating the preprocessing.
# The files are written only when missing; delete them to force a refresh.
final_train_path = Path('final_train.csv')
final_test_path = Path('final_test.csv')
if not (final_train_path.exists() and final_test_path.exists()):
    # index=False keeps the row index from coming back as an extra unnamed column.
    train.to_csv(final_train_path, index=False)
    test.to_csv(final_test_path, index=False)
train = pd.read_csv(final_train_path)
test = pd.read_csv(final_test_path)

In [3]:
# Columns fed to every model; the prediction target is risk_flag.
feature_cols = ['income', 'age', 'experience', 'married', 'house_ownership',
                'car_ownership', 'profession', 'city', 'state', 'current_job_years',
                'current_house_years', 'prob']
x, y = train[feature_cols], train['risk_flag']

# Hold out 25% of the training rows for evaluation, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# The same feature columns taken from the test frame.
X_test = test[feature_cols]

In [4]:
# Tune the number of neighbours for k-NN with a 5-fold cross-validated grid search.
params_knn = {'n_neighbors': np.arange(1, 25)}  # candidate values 1..24
knn = KNeighborsClassifier()
knn_gs = GridSearchCV(estimator=knn, param_grid=params_knn, cv=5)
knn_gs.fit(x_train, y_train)

# Keep the best estimator found by the search and report the selected n_neighbors.
knn_best = knn_gs.best_estimator_
print(knn_gs.best_params_)

{'n_neighbors': 12}


In [6]:
# Tune the number of trees for a random forest with a 5-fold cross-validated
# grid search. A fixed random_state makes the search, and therefore the
# selected model, repeatable from run to run.
rf = RandomForestClassifier(random_state=42)
params_rf = {'n_estimators': [50, 100, 200, 500, 1000]}
rf_gs = GridSearchCV(rf, params_rf, cv=5)
rf_gs.fit(x_train, y_train)

# Keep the best estimator found by the search and report the selected n_estimators.
rf_best = rf_gs.best_estimator_
print(rf_gs.best_params_)

{'n_estimators': 100}


In [7]:
# Fit a logistic-regression baseline with default settings on the training split.
# (LogisticRegression is imported in the notebook's import cell.)
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)

LogisticRegression()

In [None]:
# Fit a support-vector classifier with default settings on the training split.
# (SVC is imported in the notebook's import cell.)
# NOTE(review): this cell has no recorded execution count, and kernel SVC
# training can be slow on large training sets -- confirm it completes here.
svc = SVC()
svc.fit(x_train, y_train)

In [None]:
# Fit a Gaussian naive Bayes classifier on the training split.
# (GaussianNB is imported in the notebook's import cell.)
gnb = GaussianNB()
gnb.fit(x_train, y_train)

In [None]:
# Mean accuracy of the logistic-regression baseline on the held-out split.
# score() needs the evaluation features and labels; calling it with no
# arguments raises a TypeError.
log_reg.score(x_test, y_test)

In [None]:
# Combine the five models in a majority-vote ("hard" voting) ensemble.
estimators = [('knn', knn_best), ('rf', rf_best), ('log_reg', log_reg), ('svc', svc), ('gaussian', gnb)]
# The voting mode must be written with plain ASCII quotes; typographic quotes
# are a syntax error.
ensemble = VotingClassifier(estimators, voting='hard')

# Fit the ensemble on the training split ...
ensemble.fit(x_train, y_train)
# ... and report its mean accuracy on the held-out split.
ensemble.score(x_test, y_test)