In [12]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Pandas options
pd.set_option('display.max_colwidth', 1000,'display.max_rows',None, 'display.max_columns',None)

# Plot options
%matplotlib inline
mpl.style.use('ggplot')
sns.set(style = 'whitegrid')


In [50]:
loans = pd.read_csv('loans_clean.csv')

**Train, test split**

In [14]:
from sklearn.model_selection import train_test_split

In [37]:
'''
train, test = train_test_split(loans, test_size=0.2, random_state=101)

print(train.shape)
print(test.shape)

train = train[(train['dti'] <=50)  & (train['open_acc'] <= 40) 
& (train['total_acc'] <= 80) & (train['revol_util'] <= 120)]

X_train = train.drop('loan_status', axis =1)
y_train = train['loan_status']

X_test = test.drop('loan_status', axis =1)
y_test = test['loan_status']

'''

In [51]:
X = loans.drop('loan_status', axis =1)
y = loans['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101, stratify = y)

In [52]:
print(X_train.shape)

(1070035, 69)


**Remove outliers from the training set**

In [53]:
X_train = X_train[(X_train['dti'] <=50)  & (X_train['open_acc'] <= 40) 
& (X_train['total_acc'] <= 80) & (X_train['revol_util'] <= 120)]

y_train = y_train[X_train.index]

In [54]:
print(X_train.shape)
print(y_train.shape)

(1065967, 69)
(1065967,)


**Normalizing data**

In [19]:
from sklearn.preprocessing import MinMaxScaler

In [55]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Setting a baseline

In this classification problem, the rate of loans that paid off (most frequent class) can be used as a baseline to evaluate the quality of models generated. These models should outperform the baseline capabilities to be considered for future predictions.

In [56]:
baseline = loans['loan_status'].value_counts(normalize=True)[0]
baseline

0.8002547953562649

### Accessing multiple algorithms



In [24]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, accuracy_score

results =[]

**Random Forest Tree Model**

In [59]:
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
accuracy_score(y_test, predictions)

KeyboardInterrupt: 

In [58]:
param_grid = dict(
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num =10)],
# Number of features to consider at every split
max_features = ['auto', 'sqrt'],
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10,100, num=10)],
# Minimum number of samples required to split a node
#min_samples_split = [int(x) for x in np.linspace(1,10, num=10)],
# Minimum number of samples required at each leaf node
#min_samples_leaf = [int(x) for x in np.linspace(1,10, num=10)]
)

# Random search of parameters using 3 fold cross validation, search acroos 100 different combinations and use all avilable cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions =param_grid, n_iter =100, cv=3, verbose = 2, random_state = 42, n_jobs = -1 )

# Fit the random search mode
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END ..max_depth=20, max_features=sqrt, n_estimators=200; total time=16.6min
[CV] END ..max_depth=20, max_features=sqrt, n_estimators=200; total time=16.6min
[CV] END ..max_depth=20, max_features=sqrt, n_estimators=200; total time=16.6min


KeyboardInterrupt: 

In [None]:
best_random = rf_random.best_estimator_
predictions = best_random.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
results.append(('Random forest tree',accuracy))
print(results)


**XGB Classifier**