In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold   
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics

  from numpy.core.umath_tests import inner1d


In [2]:
def classification_model(model, data, predictors, outcome):
  #Fit the model:
  model.fit(data[predictors],data[outcome])
  
  #Make predictions on training set:
  predictions = model.predict(data[predictors])
  print(predictions)
  #Print accuracy
  accuracy = metrics.accuracy_score(predictions,data[outcome])
  print ("Accuracy : %s" % "{0:.3%}".format(accuracy))

  #Perform k-fold cross-validation with 10 folds
  kf = KFold(data.shape[0], n_folds=10)
  error = []
  for train, test in kf:
    # Filter training data
    train_predictors = (data[predictors].iloc[train,:])
    
    # The target we're using to train the algorithm.
    train_target = data[outcome].iloc[train]
    
    # Training the algorithm using the predictors and target.
    model.fit(train_predictors, train_target)
    
    #Record error from each cross-validation run
    error.append(model.score(data[predictors].iloc[test,:], data[outcome].iloc[test]))
 
  print ("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(error)))

  #Fit the model again so that it can be refered outside the function:
  model.fit(data[predictors],data[outcome]) 

In [27]:
dataset = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [28]:
#Pre-processing

dataset['Date.of.Birth']= pd.to_datetime(dataset['Date.of.Birth']) 

dataset['DisbursalDate']= pd.to_datetime(dataset['DisbursalDate']) 

dataset = dataset.drop('MobileNo_Avl_Flag', axis = 1)

dataset = dataset.drop(['SEC.NO.OF.ACCTS','SEC.ACTIVE.ACCTS','SEC.OVERDUE.ACCTS','SEC.CURRENT.BALANCE','SEC.SANCTIONED.AMOUNT','SEC.DISBURSED.AMOUNT'], axis = 1)

dataset['PERFORM_CNS.SCORE.DESCRIPTION'] = dataset['PERFORM_CNS.SCORE.DESCRIPTION'].map({'No Bureau History Available':0, 
'C-Very Low Risk':1, 'A-Very Low Risk':2, 'D-Very Low Risk':3, 'B-Very Low Risk': 4, 'M-Very High Risk':5, 'F-Low Risk':6, 
'K-High Risk':7, 'H-Medium Risk':8, 'E-Low Risk':9, 'I-Medium Risk':10, 'G-Low Risk':11, 
'Not Scored: Sufficient History Not Available': 12, 'J-High Risk':13, 'Not Scored: Not Enough Info available on the customer':14,
'Not Scored: No Activity seen on the customer (Inactive)':15, 'Not Scored: No Updates available in last 36 months':16, 
'L-Very High Risk':17, 'Not Scored: Only a Guarantor':18, 'Not Scored: More than 50 active Accounts found':19})

dataset['Employment.Type_random'] = dataset['Employment.Type']
random_sample_dataset = dataset['Employment.Type'].dropna().sample(dataset['Employment.Type'].isnull().sum(),random_state = 0)
random_sample_dataset.index = dataset[dataset['Employment.Type'].isnull()].index
dataset.loc[dataset['Employment.Type'].isnull(), 'Employment.Type_random'] = random_sample_dataset
dataset['Employment.Type'] = dataset['Employment.Type_random']
dataset = dataset.drop('Employment.Type_random', axis = 1)

dataset['Employment.Type'] = dataset['Employment.Type'].map({'Salaried':0, 'Self employed':1})

dataset = dataset.drop(['VoterID_flag', 'PRI.DISBURSED.AMOUNT'], axis = 1)

index = dataset[(dataset['manufacturer_id'] > 150)].index
dataset.drop(index, inplace=True)

index = dataset[(dataset['State_ID'] == 22)].index
dataset.drop(index, inplace=True)

temp = dataset.groupby('PRI.OVERDUE.ACCTS')['PRI.OVERDUE.ACCTS'].count()/np.float(len(dataset))
rare_cat = [x for x in temp.loc[temp<0.0005].index.values]
dataset['PRI.OVERDUE.ACCTS'] = np.where(dataset['PRI.OVERDUE.ACCTS'].isin(rare_cat), 6 , dataset['PRI.OVERDUE.ACCTS'])

In [29]:
#Pre-processing

test['Date.of.Birth']= pd.to_datetime(test['Date.of.Birth']) 

test['DisbursalDate']= pd.to_datetime(test['DisbursalDate']) 

test = test.drop('MobileNo_Avl_Flag', axis = 1)

test = test.drop(['SEC.NO.OF.ACCTS','SEC.ACTIVE.ACCTS','SEC.OVERDUE.ACCTS','SEC.CURRENT.BALANCE','SEC.SANCTIONED.AMOUNT','SEC.DISBURSED.AMOUNT'], axis = 1)

test['PERFORM_CNS.SCORE.DESCRIPTION'] = test['PERFORM_CNS.SCORE.DESCRIPTION'].map({'No Bureau History Available':0, 
'C-Very Low Risk':1, 'A-Very Low Risk':2, 'D-Very Low Risk':3, 'B-Very Low Risk': 4, 'M-Very High Risk':5, 'F-Low Risk':6, 
'K-High Risk':7, 'H-Medium Risk':8, 'E-Low Risk':9, 'I-Medium Risk':10, 'G-Low Risk':11, 
'Not Scored: Sufficient History Not Available': 12, 'J-High Risk':13, 'Not Scored: Not Enough Info available on the customer':14,
'Not Scored: No Activity seen on the customer (Inactive)':15, 'Not Scored: No Updates available in last 36 months':16, 
'L-Very High Risk':17, 'Not Scored: Only a Guarantor':18, 'Not Scored: More than 50 active Accounts found':19})

test['Employment.Type_random'] = test['Employment.Type']
random_sample_test = test['Employment.Type'].dropna().sample(test['Employment.Type'].isnull().sum(),random_state = 0)
random_sample_test.index = test[test['Employment.Type'].isnull()].index
test.loc[dataset['Employment.Type'].isnull(), 'Employment.Type_random'] = random_sample_test
test['Employment.Type'] = test['Employment.Type_random']
test = test.drop('Employment.Type_random', axis = 1)

test['Employment.Type'] = test['Employment.Type'].map({'Salaried':0, 'Self employed':1})

test = test.drop(['VoterID_flag', 'PRI.DISBURSED.AMOUNT'], axis = 1)

index = test[(test['manufacturer_id'] > 150)].index
test.drop(index, inplace=True)

index = test[(test['State_ID'] == 22)].index
test.drop(index, inplace=True)

temp = test.groupby('PRI.OVERDUE.ACCTS')['PRI.OVERDUE.ACCTS'].count()/np.float(len(test))
rare_cat = [x for x in temp.loc[temp<0.0005].index.values]
test['PRI.OVERDUE.ACCTS'] = np.where(test['PRI.OVERDUE.ACCTS'].isin(rare_cat), 6 , test['PRI.OVERDUE.ACCTS'])

IndexingError: (0         False
1         False
2         False
3         False
4         False
5         False
6         False
7         False
8         False
9         False
10        False
11        False
12        False
13        False
14        False
15        False
16        False
17        False
18        False
19        False
20        False
21        False
22        False
23        False
24        False
25        False
26        False
27        False
28        False
29        False
          ...  
233124    False
233125    False
233126    False
233127    False
233128    False
233129    False
233130    False
233131    False
233132    False
233133    False
233134    False
233135    False
233136    False
233137    False
233138    False
233139    False
233140    False
233141    False
233142    False
233143    False
233144    False
233145    False
233146    False
233147    False
233148    False
233149    False
233150    False
233151    False
233152    False
233153    False
Name: Employment.Type, Length: 233059, dtype: bool, 'Employment.Type_random')

In [22]:
dataset = dataset.drop(['PAN_flag','Driving_flag'], axis = 1)
index = dataset[(dataset['manufacturer_id'] > 150)].index
dataset.drop(index, inplace=True)
index = dataset[(dataset['State_ID'] == 22)].index
dataset.drop(index, inplace=True)

temp = dataset.groupby('PRI.OVERDUE.ACCTS')['PRI.OVERDUE.ACCTS'].count()/np.float(len(dataset))
rare_cat = [x for x in temp.loc[temp<0.0005].index.values]
dataset['PRI.OVERDUE.ACCTS'] = np.where(dataset['PRI.OVERDUE.ACCTS'].isin(rare_cat), 6 , dataset['PRI.OVERDUE.ACCTS'])

In [None]:
test = dataset.drop(['PAN_flag','Driving_flag'], axis = 1)
index = dataset[(dataset['manufacturer_id'] > 150)].index
dataset.drop(index, inplace=True)
index = dataset[(dataset['State_ID'] == 22)].index
dataset.drop(index, inplace=True)

temp = dataset.groupby('PRI.OVERDUE.ACCTS')['PRI.OVERDUE.ACCTS'].count()/np.float(len(dataset))
rare_cat = [x for x in temp.loc[temp<0.0005].index.values]
dataset['PRI.OVERDUE.ACCTS'] = np.where(dataset['PRI.OVERDUE.ACCTS'].isin(rare_cat), 6 , dataset['PRI.OVERDUE.ACCTS'])

In [7]:
from xgboost import XGBClassifier

In [11]:
outcome_var = 'loan_default'
model = RandomForestClassifier(n_estimators=100, max_depth=10)
predictor_var = ['disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID',
       'Employment.Type', 'State_ID', 'Employee_code_ID',
       'Aadhar_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PERFORM_CNS.SCORE.DESCRIPTION', 'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS',
       'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
       'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'AVERAGE.ACCT.AGE',
       'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES']
classification_model(model, dataset ,predictor_var,outcome_var)

Accuracy : 78.367%
Cross-Validation Score : 78.295%


In [25]:
outcome_var = 'loan_default'
predictor_var = ['disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID',
       'Employment.Type', 'State_ID', 'Employee_code_ID',
       'Aadhar_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PERFORM_CNS.SCORE.DESCRIPTION', 'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS',
       'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
       'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'AVERAGE.ACCT.AGE',
       'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES']
model = RandomForestClassifier(n_estimators=100, max_depth=10)
model.fit(dataset[predictor_var],dataset[outcome_var])
predictions = model.predict(dataset[predictor_var])
dataset['predictions'] = predictions 

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [19]:
dataset.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default,predictions
0,420825,50578,58400,89.55,67,22807,45,1441,1984-01-01,0,...,0,0,0,0,0,0,0,0,0,0
1,537409,47145,65550,73.23,67,22807,45,1502,1985-07-31,1,...,50200,1991,0,0,1,23,23,0,1,0
2,417566,53278,61360,89.63,67,22807,45,1497,1985-08-24,1,...,0,0,0,0,0,0,0,0,0,0
3,624493,57513,66113,88.48,67,22807,45,1501,1993-12-30,1,...,0,31,0,0,0,8,15,1,1,0
4,539055,52378,60300,88.39,67,22807,45,1495,1977-09-12,1,...,0,0,0,0,0,0,0,1,1,0
