In [34]:
import pandas as pd

from collections import Counter

from sklearn.preprocessing import Imputer

In [81]:
df = pd.read_csv('../data/train.csv')
df_hold = pd.read_csv('../data/test.csv')

In [82]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [83]:
df.shape

(614, 13)

In [84]:
df_hold.shape

(367, 12)

In [85]:
Counter(df['Loan_Status'])

Counter({'N': 192, 'Y': 422})

In [86]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [87]:
df = pd.get_dummies(df, columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status'], drop_first=True)

df_hold = pd.get_dummies(df_hold, columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'], drop_first=True)

## Least Viable Product: Drop the NA rows and move on

In [88]:
df.isnull().sum()

Loan_ID                     0
ApplicantIncome             0
CoapplicantIncome           0
LoanAmount                 22
Loan_Amount_Term           14
Credit_History             50
Gender_Male                 0
Married_Yes                 0
Dependents_1                0
Dependents_2                0
Dependents_3+               0
Education_Not Graduate      0
Self_Employed_Yes           0
Property_Area_Semiurban     0
Property_Area_Urban         0
Loan_Status_Y               0
dtype: int64

In [89]:
df_hold.isnull().sum()

Loan_ID                     0
ApplicantIncome             0
CoapplicantIncome           0
LoanAmount                  5
Loan_Amount_Term            6
Credit_History             29
Gender_Male                 0
Married_Yes                 0
Dependents_1                0
Dependents_2                0
Dependents_3+               0
Education_Not Graduate      0
Self_Employed_Yes           0
Property_Area_Semiurban     0
Property_Area_Urban         0
dtype: int64

In [90]:
df.dropna(inplace=True)

In [92]:
impt = Imputer(strategy='most_frequent')
impt_matrix = impt.fit_transform(df_hold[['LoanAmount', 'Loan_Amount_Term', 'Credit_History']])

In [94]:
impt_df = pd.DataFrame(impt_matrix, columns=['LoanAmount_i', 'Loan_Amount_Term_i', 'Credit_History_i'])

In [96]:
df_hold = pd.concat([df_hold, impt_df], axis=1)

In [97]:
df_hold.drop(['LoanAmount', 'Loan_Amount_Term', 'Credit_History'], axis=1, inplace=True)

In [98]:
df_hold.isnull().sum()

Loan_ID                    0
ApplicantIncome            0
CoapplicantIncome          0
Gender_Male                0
Married_Yes                0
Dependents_1               0
Dependents_2               0
Dependents_3+              0
Education_Not Graduate     0
Self_Employed_Yes          0
Property_Area_Semiurban    0
Property_Area_Urban        0
LoanAmount_i               0
Loan_Amount_Term_i         0
Credit_History_i           0
dtype: int64

In [99]:
df_hold.shape

(367, 15)

In [100]:
df.shape

(529, 16)

## X and Y ish

In [103]:
df.head()

Unnamed: 0,Loan_ID,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
1,LP001003,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0,0
2,LP001005,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1,1
3,LP001006,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1,1
4,LP001008,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1
5,LP001011,5417,4196.0,267.0,360.0,1.0,1,1,0,1,0,0,1,0,1,1


In [117]:
X = df.drop(['Loan_ID', 'Loan_Status_Y'], axis=1)
y = df['Loan_Status_Y']

X_hold = df_hold.drop('Loan_ID', axis=1)

In [120]:
X.dtypes

ApplicantIncome              int64
CoapplicantIncome          float64
LoanAmount                 float64
Loan_Amount_Term           float64
Credit_History             float64
Gender_Male                  uint8
Married_Yes                  uint8
Dependents_1                 uint8
Dependents_2                 uint8
Dependents_3+                uint8
Education_Not Graduate       uint8
Self_Employed_Yes            uint8
Property_Area_Semiurban      uint8
Property_Area_Urban          uint8
dtype: object

In [124]:
X_hold.columns = ['ApplicantIncome', 'CoapplicantIncome', 'Gender_Male', 'Married_Yes',
       'Dependents_1', 'Dependents_2', 'Dependents_3+',
       'Education_Not Graduate', 'Self_Employed_Yes',
       'Property_Area_Semiurban', 'Property_Area_Urban', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History']

In [125]:
X_hold_new = pd.DataFrame(columns=X.columns)

for i in X.columns:
    X_hold_new[i] = X_hold[i]

In [126]:
X.dtypes

ApplicantIncome              int64
CoapplicantIncome          float64
LoanAmount                 float64
Loan_Amount_Term           float64
Credit_History             float64
Gender_Male                  uint8
Married_Yes                  uint8
Dependents_1                 uint8
Dependents_2                 uint8
Dependents_3+                uint8
Education_Not Graduate       uint8
Self_Employed_Yes            uint8
Property_Area_Semiurban      uint8
Property_Area_Urban          uint8
dtype: object

In [128]:
X_hold_new.dtypes

ApplicantIncome              int64
CoapplicantIncome            int64
LoanAmount                 float64
Loan_Amount_Term           float64
Credit_History             float64
Gender_Male                  uint8
Married_Yes                  uint8
Dependents_1                 uint8
Dependents_2                 uint8
Dependents_3+                uint8
Education_Not Graduate       uint8
Self_Employed_Yes            uint8
Property_Area_Semiurban      uint8
Property_Area_Urban          uint8
dtype: object

## End of Race to Viable Product

In [253]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler

In [198]:
df_charles = pd.read_csv('../data/train_mice.csv')
df_hold_charles = pd.read_csv('../data/test_mice.csv')

In [201]:
X_charles = df_charles.drop('Loan_Status', axis=1)
y_charles = df_charles['Loan_Status']

In [203]:
print(X_charles.shape)
print(df_hold_charles.shape)
print(y_charles.shape)

(614, 13)
(367, 13)
(614,)


In [226]:
ss = StandardScaler()
Xs_charles = ss.fit_transform(X_charles)
Xs_hold_charles = ss.transform(df_hold_charles)

In [227]:
X_train, X_test, y_train, y_test = train_test_split(Xs_charles, y_charles)

In [243]:
rfc_params = {'n_estimators':[2,5,10,20,50,75,150],
             'criterion':['gini', 'entropy'],
             'max_depth':[2,5,10,20,50,None],
             'min_samples_split':[2,5,10,20]}


grid_rfc = GridSearchCV(RandomForestClassifier(), rfc_params, cv=5, scoring='accuracy')
grid_rfc.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [2, 5, 10, 20, 50, 75, 150], 'criterion': ['gini', 'entropy'], 'max_depth': [2, 5, 10, 20, 50, None], 'min_samples_split': [2, 5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [244]:
grid_rfc.best_params_

{'criterion': 'gini',
 'max_depth': None,
 'min_samples_split': 10,
 'n_estimators': 75}

In [245]:
grid_rfc.score(X_test, y_test)

0.7922077922077922

In [251]:
rfc = RandomForestClassifier(criterion='gini', max_depth=None, min_samples_split=10,
                            n_estimators=75)
rfc.fit(Xs_charles, y_charles)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=75, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [246]:
y_preds = grid_rfc.predict(X_test)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), columns=['Pred -', 'Pred +'], index=['Act -', 'Act +'])

             precision    recall  f1-score   support

        0.0       0.76      0.37      0.50        43
        1.0       0.80      0.95      0.87       111

avg / total       0.79      0.79      0.77       154



Unnamed: 0,Pred -,Pred +
Act -,16,27
Act +,5,106


In [256]:
logreg_params = {'penalty':['l1', 'l2'],
             'tol':[.0001,.001,.01],
             'C':[.01,1.0]}


logreg_rfc = GridSearchCV(LogisticRegression(), logreg_params, cv=5, scoring='accuracy')
logreg_rfc.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'tol': [0.0001, 0.001, 0.01], 'C': [0.01, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [257]:
logreg_rfc.score(X_test, y_test)

0.8181818181818182

In [258]:
y_preds = logreg_rfc.predict(Xs_hold_charles)
submission = pd.DataFrame(columns=['Loan_ID', 'Loan_Status'])
submission['Loan_Status'] = y_preds
submission['Loan_ID'] = df_hold['Loan_ID']
submission['Loan_Status'] = submission['Loan_Status'].map(lambda x: 'Y' if x ==1 else 'N')
submission.to_csv('../submissions/submission_5.csv', index=False)

# Submissions

In [232]:
y_preds = grid_rfc.predict(Xs_hold_charles)

In [233]:
y_preds.shape

(367,)

In [234]:
submission = pd.DataFrame(columns=['Loan_ID', 'Loan_Status'])

In [235]:
submission['Loan_Status'] = y_preds

In [236]:
submission['Loan_ID'] = df_hold['Loan_ID']

In [237]:
submission

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,1.0
1,LP001022,1.0
2,LP001031,1.0
3,LP001035,1.0
4,LP001051,1.0
5,LP001054,1.0
6,LP001055,1.0
7,LP001056,0.0
8,LP001059,1.0
9,LP001067,1.0


In [238]:
submission.shape

(367, 2)

In [239]:
Counter(submission['Loan_Status'])

Counter({0.0: 66, 1.0: 301})

In [240]:
submission['Loan_Status'] = submission['Loan_Status'].map(lambda x: 'Y' if x == 1 else 'N')

In [241]:
submission

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y
5,LP001054,Y
6,LP001055,Y
7,LP001056,N
8,LP001059,Y
9,LP001067,Y


In [242]:
submission.to_csv('../submissions/submission_3.csv', index=False)

In [252]:
y_preds = rfc.predict(Xs_hold_charles)
submission = pd.DataFrame(columns=['Loan_ID', 'Loan_Status'])
submission['Loan_Status'] = y_preds
submission['Loan_ID'] = df_hold['Loan_ID']
submission['Loan_Status'] = submission['Loan_Status'].map(lambda x: 'Y' if x ==1 else 'N')
submission.to_csv('../submissions/submission_4.csv', index=False)