In [59]:
import wrangle as w
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
import xgboost as xgb

In [2]:
df = w.get_cws_data()

In [3]:
df.bad_resident.value_counts()

0    4808
1     191
Name: bad_resident, dtype: int64

In [4]:
train, y_train, y_val, y_test, train_scaled, val_scaled, test_scaled = w.model_prep(df)

In [5]:
train_scaled

Unnamed: 0,rent,term,monthly_inc,total_inc,age,risk_score,prop_id_Arizona,prop_id_California,prop_id_Colorado,prop_id_Georgia,...,Recommendation_A-Criminal History Meets Requirements,Recommendation_A-Meets Requirements,Recommendation_Accept,Recommendation_Accept with Conditions (Extra Deposit),Recommendation_Accept with Conditions (Guarantor),Recommendation_Accept with Conditions (Max Deposit),Recommendation_Accept with Extra Security Deposit,Recommendation_Guarantor Not Qualified,Recommendation_Qualified Guarantor,Recommendation_REJECT
2707,-0.090190,12,-0.334993,-0.378361,-0.513961,-2.308439,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4983,0.122051,12,0.057028,-0.221645,-0.155214,0.593710,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4959,-0.031824,9,0.285852,0.568771,0.382907,0.367771,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4492,-0.249372,14,-0.175281,-0.078992,-0.334587,0.737843,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
995,0.281233,13,0.187701,-0.143971,-0.155214,0.511904,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987,0.307763,5,-0.088165,-0.267677,0.024160,-0.111376,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2512,1.262851,12,0.018406,-0.599730,0.203534,0.574232,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
443,0.466944,14,-0.146243,-0.157578,-0.872708,0.021071,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4028,-0.488144,13,-0.010640,-0.006752,-0.962395,0.562546,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [6]:
dt = DecisionTreeClassifier(max_depth=12, random_state=91)

In [7]:
dt = dt.fit(train_scaled, y_train)

In [8]:
train_pred = dt.predict(train_scaled)

In [9]:
print(classification_report(y_train, train_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2692
           1       1.00      0.80      0.89       107

    accuracy                           0.99      2799
   macro avg       1.00      0.90      0.94      2799
weighted avg       0.99      0.99      0.99      2799



In [10]:
labels = sorted(y_train.unique())

In [11]:
pd.DataFrame(confusion_matrix(y_train, train_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2692,0
1,21,86


In [12]:
val_pred = dt.predict(val_scaled)

In [13]:
print(classification_report(y_val, val_pred))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      1154
           1       0.12      0.13      0.12        46

    accuracy                           0.93      1200
   macro avg       0.54      0.55      0.54      1200
weighted avg       0.93      0.93      0.93      1200



In [14]:
pd.DataFrame(confusion_matrix(y_val, val_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,1109,45
1,40,6


In [15]:
df_majority = train[train.bad_resident==0]
df_minority = train[train.bad_resident==1]
len(df_minority)

107

In [16]:
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=214,    # to match majority class
                                 random_state=91) # reproducible results

In [17]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [18]:
df_upsampled.bad_resident.value_counts()

0    2692
1     214
Name: bad_resident, dtype: int64

In [19]:
y_up = df_upsampled.bad_resident

In [20]:
df_upsampled = df_upsampled.drop(columns = 'bad_resident')

In [21]:
df_upsampled[['rent', 'monthly_inc','total_inc', 'age', 'risk_score']] = StandardScaler().fit_transform(df_upsampled[['rent', 'monthly_inc','total_inc', 'age', 'risk_score']])

In [22]:
df_upsampled

Unnamed: 0,rent,term,monthly_inc,total_inc,age,risk_score,prop_id_Arizona,prop_id_California,prop_id_Colorado,prop_id_Georgia,...,Recommendation_A-Criminal History Meets Requirements,Recommendation_A-Meets Requirements,Recommendation_Accept,Recommendation_Accept with Conditions (Extra Deposit),Recommendation_Accept with Conditions (Guarantor),Recommendation_Accept with Conditions (Max Deposit),Recommendation_Accept with Extra Security Deposit,Recommendation_Guarantor Not Qualified,Recommendation_Qualified Guarantor,Recommendation_REJECT
2707,-0.107014,12,-0.273994,-0.365509,-0.510785,-2.297009,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4983,0.101885,12,0.037528,-0.215222,-0.151858,0.601310,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4959,-0.049567,9,0.219365,0.542769,0.386531,0.375669,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4492,-0.263688,14,-0.147078,-0.078422,-0.331321,0.745253,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
995,0.258559,13,0.141369,-0.140734,-0.151858,0.519612,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,-0.524811,15,0.051373,-0.189435,-0.510785,0.029427,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
154,0.441345,12,-0.166346,-0.222465,-0.600516,0.255068,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
405,0.336896,12,-0.435525,-0.452922,-1.138906,0.667446,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
11,-0.060012,18,-0.016244,-0.009145,-0.421053,-2.297009,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0


In [23]:
rf.fit(df_upsampled, y_up)

NameError: name 'rf' is not defined

In [None]:
val_pred = rf.predict(val_scaled)

In [None]:
print(classification_report(y_val, val_pred))

In [None]:
pd.DataFrame(confusion_matrix(y_val, val_pred), index=labels, columns=labels)

In [None]:
# train, y_train, y_val, y_test, train_scaled, val_scaled, test_scaled = w.model_prep(df_upsampled)

In [None]:
dt = DecisionTreeClassifier(max_depth=8, random_state=91)

In [None]:
dt = dt.fit(train_scaled, y_train)

In [None]:
train_pred = dt.predict(train_scaled)

In [None]:
print(classification_report(y_train, train_pred))

In [None]:
pd.DataFrame(confusion_matrix(y_train, train_pred), index=labels, columns=labels)

In [None]:
val_pred = dt.predict(val_scaled)

In [None]:
print(classification_report(y_val, val_pred))

In [None]:
pd.DataFrame(confusion_matrix(y_val, val_pred), index=labels, columns=labels)

In [None]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight= 'balanced', 
                            criterion='entropy',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=91)

In [None]:
rf.fit(df_upsampled, y_up)

In [None]:
train_pred = rf.predict(train_scaled)

In [None]:
print(classification_report(y_train, train_pred))

In [None]:
val_pred = rf.predict(val_scaled)

In [None]:
print(classification_report(y_val, val_pred))

In [None]:
pd.DataFrame(confusion_matrix(y_val, val_pred), index=labels, columns=labels)

In [None]:
params = {'max_depth': [3, 4, 5, 6, 7, 8],
          'min_samples_leaf': [1, 3, 5],
         'n_estimators':[100, 250, 500],
         'class_weight': ['balanced'],
         'bootstrap': [True],
         'criterion':['gini', 'entropy'],
         'random_state': [91]}
rfG= RandomForestClassifier()
grid = GridSearchCV(rfG, params, cv=3, scoring = 'recall')
grid.fit(df_upsampled, y_up)

In [None]:
results = grid.cv_results_
test_scores = results['mean_test_score']
test_scores

In [None]:
params = results['params']

In [None]:
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score', ascending = False)

In [24]:
data = df_upsampled
label = y_up
dtrain = xgb.DMatrix(data, label=label)
dtest = xgb.DMatrix(val_scaled, label = y_val)

In [28]:
param = {'max_depth': 6, 'eta': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'auc'

In [29]:
evallist = [(dtrain,'train')]

In [30]:
num_round = 10
bst = xgb.train(param, dtrain, num_round, evallist)

[0]	train-auc:0.76590
[1]	train-auc:0.92239
[2]	train-auc:0.96375
[3]	train-auc:0.98027
[4]	train-auc:0.98851
[5]	train-auc:0.99470
[6]	train-auc:0.99620
[7]	train-auc:0.99792
[8]	train-auc:0.99918
[9]	train-auc:0.99960


<xgboost.core.Booster at 0x17ff25610>

In [None]:
ypred = bst.predict(dtest)

In [None]:
boost = xgb.XGBClassifier(n_estimators=100)

In [None]:
boost.fit(df_upsampled, y_up)

In [None]:
preds = boost.predict(val_scaled)
acc_xgb = (preds == y_val).sum().astype(float) / len(preds)*100

In [None]:
print(classification_report(y_val, preds))

In [None]:
params = bootstrap=True, 
                            class_weight= 'balanced', 
                            criterion='entropy',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=91

In [97]:
cv = xgb.XGBClassifier(n_estimators = 100, max_depth = 7, max_leaves = 3, learning_rate = .3, 
                       min_child_weight = 1, 
                       eval_metric = 'aucpr', early_stopping_rounds = 1)

In [98]:
cv = cv.fit(X = df_upsampled, y = y_up, eval_set=[(df_upsampled, y_up), (val_scaled, y_val)])

[0]	validation_0-aucpr:0.46832	validation_1-aucpr:0.08505
[1]	validation_0-aucpr:0.66163	validation_1-aucpr:0.09241


In [99]:
val_preds = cv.predict(val_scaled)

In [100]:
print(classification_report(y_val, val_preds))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1154
           1       0.27      0.07      0.11        46

    accuracy                           0.96      1200
   macro avg       0.62      0.53      0.54      1200
weighted avg       0.94      0.96      0.94      1200



In [101]:
cv_params = {'subsample': [0.8,0.9,1], 'max_delta_step': [0,1,2,4]}
fix_params = {'learning_rate': 0.2, 'n_estimators': 100, 'objective': 'binary:logistic', 'max_depth': 5, 'min_child_weight':3}

In [104]:
csv = GridSearchCV(xgb.XGBClassifier(**fix_params), cv_params, scoring = 'roc_auc', cv = 5) 
csv.fit(df_upsampled, y_up)
csv.best_params_


{'max_delta_step': 0, 'subsample': 0.8}

In [105]:
fix_params['max_delta_step'] = 0

In [106]:
fix_params['subsample'] = .8

In [111]:
clf = xgb.XGBClassifier(**fix_params, early_stopping_rounds = 1)


In [112]:
clf = clf.fit(X = df_upsampled, y = y_up, eval_set=[(df_upsampled, y_up), (val_scaled, y_val)])

[0]	validation_0-logloss:0.55543	validation_1-logloss:0.55175
[1]	validation_0-logloss:0.46213	validation_1-logloss:0.45387
[2]	validation_0-logloss:0.39477	validation_1-logloss:0.38352
[3]	validation_0-logloss:0.34408	validation_1-logloss:0.33125
[4]	validation_0-logloss:0.30538	validation_1-logloss:0.29394
[5]	validation_0-logloss:0.27434	validation_1-logloss:0.26467
[6]	validation_0-logloss:0.24993	validation_1-logloss:0.24147
[7]	validation_0-logloss:0.23146	validation_1-logloss:0.22318
[8]	validation_0-logloss:0.21615	validation_1-logloss:0.20863
[9]	validation_0-logloss:0.20167	validation_1-logloss:0.19870
[10]	validation_0-logloss:0.18997	validation_1-logloss:0.18897
[11]	validation_0-logloss:0.18199	validation_1-logloss:0.18170
[12]	validation_0-logloss:0.17552	validation_1-logloss:0.17653
[13]	validation_0-logloss:0.16721	validation_1-logloss:0.17352
[14]	validation_0-logloss:0.16263	validation_1-logloss:0.16985
[15]	validation_0-logloss:0.15833	validation_1-logloss:0.16701
[1

In [113]:
val_pred = clf.predict(val_scaled)

In [117]:
print(classification_report(y_val, val_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1154
           1       0.00      0.00      0.00        46

    accuracy                           0.96      1200
   macro avg       0.48      0.50      0.49      1200
weighted avg       0.92      0.96      0.94      1200

