In [44]:
#First let's import the necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import os
from IPython.display import display, HTML

# Let pandas render up to 500 columns before it starts eliding wide frames.
pd.options.display.max_columns = 500

In [45]:
# Load the output of the earlier cleaning step from the working directory.
df = pd.read_csv(filepath_or_buffer='post_cleaning.csv')

In [46]:
# Final cleaning pass: keep only the rows whose Had_Blanks flag is not 1, then
# discard the flag column itself since it is no longer needed downstream.
df_cleaned = (
    df.loc[df['Had_Blanks'] != 1]
      .drop(labels=['Had_Blanks'], axis=1)
)

In [47]:
# Persist the cleaned frame for submission.
# NOTE(review): index=False is not passed, so the row index is written as an
# extra leading column — confirm that is what the consumer expects.
df_cleaned.to_csv(path_or_buf="for_submission.csv")

In [48]:
# Convert sex, education, marital status and the July payment status columns
# to pandas Categorical values, one column at a time.
for categorical_column in ('Sex', 'Education', 'Marital_Status', 'July_Payment_Status'):
    df_cleaned[categorical_column] = pd.Categorical(df_cleaned[categorical_column])

In [49]:
# Split the cleaned rows into train and test sets: 20% is held out for
# testing, and the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
train, test = train_test_split(
    df_cleaned,
    test_size=0.2,
    random_state=42,
)

In [50]:
# Univariate feature selector: scores every column with f_classif and keeps
# the 5 best-scoring ones.
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(score_func=f_classif, k=5)

In [51]:
# Find the most predictive features.
# The selector is fitted on every training column except the target, so the
# indices returned by get_support() are positions within that feature matrix.
train_features = train.loc[:, train.columns != 'July_Payment_Status']
selector.fit(train_features, train['July_Payment_Status'].values)
idxs_selected = selector.get_support(indices=True)
# Map the indices back through the SAME column set the selector saw. Indexing
# df_cleaned.columns (which still contains the target column) would shift every
# label that sits after the target by one position.
df_selected_features = train_features.columns[idxs_selected]

In [52]:
# Inspect the five column labels picked out by the selector. In the recorded
# output below they are all earlier monthly repayment statuses (Jan through May).
df_selected_features

Index([u'Jan_Repay_Status', u'Feb_Repay_Status', u'Mar_Repay_Status',
       u'Apr_Repay_Status', u'May_Repay_Status'],
      dtype='object')

In [53]:
# Import scikit-learn's linear models and create a logistic regression
# classifier; no arguments are passed, so the library defaults are used.
from sklearn import linear_model
lr = linear_model.LogisticRegression()

In [54]:
# Fit the classifier on the training split: every column except the July
# payment status is a predictor, and that column supplies the labels.
label_column = 'July_Payment_Status'
predictor_mask = train.columns != label_column
lr_model = lr.fit(train.loc[:, predictor_mask], train[label_column].values)

In [55]:
# Report the score of the fitted model on the training and the held-out test
# split. Each frame is sliced with a mask built from ITS OWN columns; the
# original sliced `test` with a mask derived from `train.columns`, which only
# works while both frames have exactly the same column layout.
print('training set performance is {}'.format(
    lr_model.score(train.loc[:, train.columns != 'July_Payment_Status'],
                   train['July_Payment_Status'].values)))
print('test set performance is {}'.format(
    lr_model.score(test.loc[:, test.columns != 'July_Payment_Status'],
                   test['July_Payment_Status'].values)))

training set performance is 0.798145564946
test set performance is 0.802428618313


In [56]:
# Make the fitted coefficients readable: one row per feature, with the feature
# name and a rank by absolute coefficient size (rank 1 = largest magnitude).
#
# coef_ holds one row of coefficients per fitted decision function, so its
# transpose has one row per feature. The original reshape(-1, len(coef_)) only
# equals the transpose when coef_ has a single row; with more rows it would mix
# coefficients of different features into the same table row.
export_values = pd.DataFrame(lr_model.coef_.T)
export_values['Feature'] = list(train.columns[train.columns != 'July_Payment_Status'])
# Rank on column 0 (the first coefficient row) by absolute value, largest first.
export_values['rank'] = export_values[0].abs().rank(ascending=False)

result = export_values.sort_values(by=['rank'])

In [57]:
# Display the coefficient table sorted by rank (rank 1 = largest absolute coefficient).
result

Unnamed: 0,0,Feature,rank
5,0.4187882,Jan_Repay_Status,1.0
3,-0.2363916,Marital_Status,2.0
1,-0.1549166,Sex,3.0
2,-0.1461843,Education,4.0
8,0.13094,Feb_Repay_Status,5.0
11,0.07080868,Mar_Repay_Status,6.0
17,0.03998791,May_Repay_Status,7.0
14,-0.02314763,Apr_Repay_Status,8.0
20,0.006652913,Jun_Repay_Status,9.0
4,0.001047252,Age,10.0
