In [1]:
# numpy and pandas imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# ML Libraries for scikit
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn import tree
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,\
                            precision_recall_curve,precision_score,recall_score,roc_auc_score,roc_curve,\
                            matthews_corrcoef, f1_score, make_scorer, auc
from scipy.stats import skew

#  Recursive Feature Elimination
from sklearn.feature_selection import RFE
        
# Python imports
from math import log, sqrt
import re
import collections
import warnings

# decision tree visualization related imports
import pydotplus
import graphviz

from IPython.display import Image

warnings.filterwarnings('ignore')

pd.options.display.max_columns = 400

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
sample= pd.read_csv('/content/drive/My Drive/DataSets/MLhack2/sample_submission.csv')
sample.head(1)

Unnamed: 0.1,Unnamed: 0,m13
0,68426,1


In [4]:
train = pd.read_csv('/content/drive/My Drive/DataSets/MLhack2/train.csv')
test = pd.read_csv('/content/drive/My Drive/DataSets/MLhack2/test.csv')
pd.set_option('display.max_rows', None)

In [5]:
train.head()

Unnamed: 0.1,Unnamed: 0,loan_id,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,origination_date,first_payment_date,loan_to_value,number_of_borrowers,debt_to_income_ratio,borrower_credit_score,loan_purpose,insurance_percent,co-borrower_credit_score,insurance_type,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13
0,81041,861482495205,Y,"Martinez, Duffy and Bird",3.375,272000,180,2012-01-01,03/2012,72,2.0,31.0,770.0,B12,0.0,786.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,51021,173139140902,Y,"Swanson, Newton and Miller",4.25,371000,360,2012-01-01,03/2012,95,1.0,45.0,727.0,C86,30.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,83866,481942560913,Y,OTHER,4.5,90000,360,2012-01-01,03/2012,69,1.0,39.0,726.0,B12,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,92311,676780245612,Y,"Turner, Baldwin and Rhodes",4.5,47000,240,2012-02-01,04/2012,75,1.0,43.0,688.0,B12,0.0,0.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,72274,433043514697,X,OTHER,4.875,177000,360,2012-01-01,03/2012,80,2.0,13.0,675.0,C86,0.0,672.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
##print(train['debt_to_income_ratio'].value_counts())
#print(test['debt_to_income_ratio'].value_counts())

In [7]:
test.head()

Unnamed: 0.1,Unnamed: 0,loan_id,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,origination_date,first_payment_date,loan_to_value,number_of_borrowers,debt_to_income_ratio,borrower_credit_score,loan_purpose,insurance_percent,co-borrower_credit_score,insurance_type,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12
0,68426,780423564300,X,OTHER,3.375,144000,180,2012-01-01,03/2012,73,2.0,20.0,790.0,A23,0.0,797.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0
1,49544,472138003270,Y,Browning-Hart,4.375,110000,360,2012-01-01,03/2012,88,1.0,45.0,779.0,C86,25.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0
2,96751,233667097068,X,Browning-Hart,4.125,243000,240,2012-02-01,04/2012,70,2.0,40.0,727.0,B12,0.0,705.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0
3,112058,117580972208,X,OTHER,2.875,189000,120,2012-03-01,05/2012,64,2.0,35.0,798.0,A23,0.0,793.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0
4,85610,440010429529,Z,OTHER,4.0,141000,360,2012-03-01,05/2012,84,2.0,31.0,819.0,A23,12.0,799.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
df = train.copy()
df_test = test.copy()

In [9]:
df['January'] = (df['origination_date']=='2012-01-01').astype('int')
df['February'] = (df['origination_date']=='2012-02-01').astype('int')
df['March'] = (df['origination_date']=='2012-03-01').astype('int')
df.drop('origination_date',axis=1,inplace=True)
#
df['fpay_1'] = (df['first_payment_date']=='02/2012').astype('int')
df['fpay_2'] = (df['first_payment_date']=='03/2012').astype('int')
df['fpay3'] = (df['first_payment_date']=='04/2012').astype('int')
df['fpay4'] = (df['first_payment_date']=='05/2012').astype('int')
df.drop('first_payment_date',axis=1,inplace=True)
#
df_1 = pd.get_dummies(df['financial_institution'])
df = pd.concat([df, df_1],axis=1)
df.drop('financial_institution',axis=1,inplace=True)
#
df['Source_X'] = (df['source']=='X').astype('int')
df['Source_Y'] = (df['source']=='Y').astype('int')
df.drop('source',axis=1,inplace=True)
#
df.drop('Unnamed: 0',axis=1,inplace=True)
df.drop('loan_id',axis=1,inplace=True)
#
df['borrowers_1'] = (df['number_of_borrowers']==1.0).astype('int')
df['borrowers_2'] = (df['number_of_borrowers']==2.0).astype('int')
df.drop('number_of_borrowers',axis=1,inplace=True)
#
df_12 = pd.get_dummies(df['loan_purpose'],prefix='pupose')
df = pd.concat([df,df_12],axis=1)
df.drop('loan_purpose',axis=1,inplace=True)
#
'''debt = pd.get_dummies(df['debt_to_income_ratio'],prefix='debitincome')
df = pd.concat([df,debt],axis=1)
df.drop('debt_to_income_ratio',axis=1,inplace=True)'''
#
scale = ['unpaid_principal_bal']
robust_scaler = RobustScaler()
df.loc[:,scale] = robust_scaler.fit_transform(df.loc[:,scale])


In [10]:
df_test['January'] = (df_test['origination_date']=='2012-01-01').astype('int')
df_test['February'] = (df_test['origination_date']=='2012-02-01').astype('int')
df_test['March'] = (df_test['origination_date']=='2012-03-01').astype('int')
df_test.drop('origination_date',axis=1,inplace=True)
#
df_test['fpay_1'] = (df_test['first_payment_date']=='02/2012').astype('int')
df_test['fpay_2'] = (df_test['first_payment_date']=='03/2012').astype('int')
df_test['fpay3'] = (df_test['first_payment_date']=='04/2012').astype('int')
df_test['fpay4'] = (df_test['first_payment_date']=='05/2012').astype('int')
df_test.drop('first_payment_date',axis=1,inplace=True)
#
df_test_1 = pd.get_dummies(df_test['financial_institution'])
df_test = pd.concat([df_test, df_test_1],axis=1)
df_test.drop('financial_institution',axis=1,inplace=True)
#
df_test['Source_X'] = (df_test['source']=='X').astype('int')
df_test['Source_Y'] = (df_test['source']=='Y').astype('int')
df_test.drop('source',axis=1,inplace=True)
#
df_test.drop('Unnamed: 0',axis=1,inplace=True)
df_test.drop('loan_id',axis=1,inplace=True)
#
df_test['borrowers_1'] = (df_test['number_of_borrowers']==1.0).astype('int')
df_test['borrowers_2'] = (df_test['number_of_borrowers']==2.0).astype('int')
df_test.drop('number_of_borrowers',axis=1,inplace=True)
#
df_test_12 = pd.get_dummies(df_test['loan_purpose'],prefix='pupose')
df_test = pd.concat([df_test,df_test_12],axis=1)
df_test.drop('loan_purpose',axis=1,inplace=True)
#
'''debt = pd.get_dummies(df_test['debt_to_income_ratio'],prefix='debitincome')
df_test = pd.concat([df_test,debt],axis=1)
df_test.drop('debt_to_income_ratio',axis=1,inplace=True)'''
#
scale = ['unpaid_principal_bal']
robust_scaler = RobustScaler()
df_test.loc[:,scale] = robust_scaler.fit_transform(df_test.loc[:,scale])

In [11]:
print(df_test.shape)
print(df.shape)

(23212, 54)
(92846, 55)


test 

In [12]:
X = df.drop(['m13'],axis=1)
Y = df['m13']
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.20,random_state=0)

In [13]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier=RandomForestClassifier(n_estimators=10).fit(X_train,y_train)
prediction=rf_classifier.predict(X_test)

In [14]:
y_pred = rf_classifier.predict(df_test)
pred=pd.DataFrame(y_pred)
datasets= pd.concat([test['Unnamed: 0'],pred],axis=1)
datasets.columns=['Unnamed: 0','m13']
datasets.to_csv('sample_submission.csv',index=False)

In [15]:
df_pred = pd.DataFrame({'Actual': y_test, 'Predicted': prediction})
print(df_pred.tail(10))

       Actual  Predicted
51652       0          0
3119        0          0
76241       0          0
20278       0          0
79937       0          0
52041       0          0
80545       0          0
89558       0          0
44263       0          0
21893       0          0


In [16]:

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[18456    15]
 [   70    29]]
0.9954227248249865
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18471
           1       0.66      0.29      0.41        99

    accuracy                           1.00     18570
   macro avg       0.83      0.65      0.70     18570
weighted avg       0.99      1.00      0.99     18570



In [17]:
model=RandomForestClassifier(n_estimators=300,criterion='entropy',
                             max_features='sqrt',min_samples_leaf=10,random_state=100).fit(X_train,y_train)
predictions=model.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(accuracy_score(y_test,predictions))
print(classification_report(y_test,predictions))

[[18462     9]
 [   69    30]]
0.9957996768982229
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18471
           1       0.77      0.30      0.43        99

    accuracy                           1.00     18570
   macro avg       0.88      0.65      0.72     18570
weighted avg       1.00      1.00      0.99     18570



In [18]:
y_pred2 = model.predict(df_test)
pred=pd.DataFrame(y_pred)
datasets= pd.concat([test['Unnamed: 0'],pred],axis=1)
datasets.columns=['Unnamed: 0','m13']
datasets.to_csv('sample_submission2.csv',index=False)

# Model3

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier()
params = {'n_estimators': [10,50,100], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2,4,6,8,16], 
              'min_samples_split': [2, 3, 5, 10, 15,20,25,30,35],
              'min_samples_leaf': [1,3,5,8,50,100]
             }
grid = GridSearchCV(clf,params)
grid.fit(X_train,y_train)
y_pred_val = grid.predict(X_val)
print ('Accuracy on validation set(RF): {}'.format(accuracy_score(y_pred_val,y_val))) #Accuracy on validation set

grid.fit(X_train,y_train)
y_pred_test = grid.predict(X_test)
print ('Accuracy on test set of training data(RF): {}'.format(accuracy_score(y_pred_test,y_test)))

KeyboardInterrupt: ignored

In [None]:
y_pred3 = grid.predict(df_test)
pred=pd.DataFrame(y_pred3)
datasets= pd.concat([test['Unnamed: 0'],pred],axis=1)
datasets.columns=['Unnamed: 0','m13']
datasets.to_csv('sample_submission3.csv',index=False)

In [None]:
datasets['m13']= datasets['m13'].fillna(1.0).astype(int)
datasets.head()

In [None]:
datasets.isnull().sum()

In [None]:
datasets.to_csv('sample_submission3.csv',index=False)

In [22]:
seed = 7
scoring = 'accuracy'
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('RF', RandomForestClassifier()))
models.append(('DT', DecisionTreeClassifier()))


# evaluate each model in turn
results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10, random_state=seed)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f" % (name, cv_results.mean())
    print(msg)

NameError: ignored

In [23]:
LR=LogisticRegression()
LR.fit(X_train,y_train)
#y_pred_val = LR.predict(X_val)
#print ('Accuracy on validation set(LR): {}'.format(accuracy_score(y_pred_val,y_val))) #Accuracy on validation set

#LR.fit(X_train,y_train)
#y_pred_test = LR.predict(X_test)
#print ('Accuracy on test set of training data(LR): {}'.format(accuracy_score(y_pred_test,y_test)))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
y_pred3 = LR.predict(df_test)
pred=pd.DataFrame(y_pred3)
datasets= pd.concat([test['Unnamed: 0'],pred],axis=1)
datasets.columns=['Unnamed: 0','m13']
datasets.to_csv('sample_submission4.csv',index=False)

In [None]:
(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

In [28]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred3 =xgb.predict(df_test)
pred=pd.DataFrame(y_pred3)
datasets= pd.concat([test['Unnamed: 0'],pred],axis=1)
datasets.columns=['Unnamed: 0','m13']
datasets.to_csv('sample_submission5.csv',index=False)

In [30]:
y_predf = xgb.predict(X_test)

In [32]:
print(confusion_matrix(y_test,y_predf))
print(accuracy_score(y_test,y_predf))
print(classification_report(y_test,y_predf))

[[18458    13]
 [   61    38]]
0.9960150780829294
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18471
           1       0.75      0.38      0.51        99

    accuracy                           1.00     18570
   macro avg       0.87      0.69      0.75     18570
weighted avg       1.00      1.00      1.00     18570



In [35]:
from xgboost import XGBClassifier
xgb = XGBClassifier(
 learning_rate =0.35,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
xgb.fit(X_train, y_train)
y_predf = xgb.predict(X_test)
print(confusion_matrix(y_test,y_predf))
print(accuracy_score(y_test,y_predf))
print(classification_report(y_test,y_predf))

[[18452    19]
 [   60    39]]
0.9957458266020464
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18471
           1       0.67      0.39      0.50        99

    accuracy                           1.00     18570
   macro avg       0.83      0.70      0.75     18570
weighted avg       1.00      1.00      1.00     18570



In [37]:
from xgboost import XGBClassifier
xgb = XGBClassifier(
 learning_rate =0.2,
 n_estimators=1200,
 max_depth=6,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
xgb.fit(X_train, y_train)
y_predf = xgb.predict(X_test)
print(confusion_matrix(y_test,y_predf))
print(accuracy_score(y_test,y_predf))
print(classification_report(y_test,y_predf))

[[18454    17]
 [   58    41]]
0.9959612277867528
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18471
           1       0.71      0.41      0.52        99

    accuracy                           1.00     18570
   macro avg       0.85      0.71      0.76     18570
weighted avg       1.00      1.00      1.00     18570



In [44]:
from xgboost import XGBClassifier
xgb = XGBClassifier(
 learning_rate =0.08,
 n_estimators=1000,
 max_depth=4,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
xgb.fit(X_train, y_train)
y_predf = xgb.predict(X_test)
print(confusion_matrix(y_test,y_predf))
print(accuracy_score(y_test,y_predf))
print(classification_report(y_test,y_predf))

[[18454    17]
 [   56    43]]
0.9960689283791061
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18471
           1       0.72      0.43      0.54        99

    accuracy                           1.00     18570
   macro avg       0.86      0.72      0.77     18570
weighted avg       1.00      1.00      1.00     18570



In [43]:
from xgboost import XGBClassifier
xgb = XGBClassifier(
 learning_rate =0.07,
 n_estimators=1000,
 max_depth=4,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
xgb.fit(X_train, y_train)
y_predf = xgb.predict(X_test)
print(confusion_matrix(y_test,y_predf))
print(accuracy_score(y_test,y_predf))
print(classification_report(y_test,y_predf))

[[18453    18]
 [   58    41]]
0.9959073774905762
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18471
           1       0.69      0.41      0.52        99

    accuracy                           1.00     18570
   macro avg       0.85      0.71      0.76     18570
weighted avg       1.00      1.00      1.00     18570



In [45]:
y_pred3 = xgb.predict(df_test)
pred=pd.DataFrame(y_pred3)
datasets= pd.concat([test['Unnamed: 0'],pred],axis=1)
datasets.columns=['Unnamed: 0','m13']
datasets.to_csv('sample_submission11.csv',index=False)