In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import wrangle as w
import wrangle_test as wt
import explore as e

import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
from sklearn import metrics
from imblearn.over_sampling import SMOTENC

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
df = w.get_cws_data()
df.head(3)

Unnamed: 0,id,total_charges,amount_paid,open,charge_code,description,prop_id,charge_name,sStatus,rent,term,monthly_inc,GuarantorRequired,total_inc,Recommendation,age,risk_score,reason,bad_resident
0,3946,75.0,75.0,0,131,"Uncllctbl key, remote fees",136,"Uncllctbl key, remote fees",Current,1311,15,52500,False,666000,Accept,33,745,Insufficient period of Residence History,1
1,3962,13.66,13.66,0,112,Water 8/1/2021-9/1/2021,136,Utility-water charges,Current,1661,12,65000,False,889992,Accept with Conditions (Extra Deposit),27,758,No Credit Experience,0
2,4050,46.17,46.17,0,155,Damage charges - Clean,136,Damage charges,Current,1412,12,48000,False,576000,Accept with Conditions (Extra Deposit),33,666,Insufficient period of Residence History,1


In [3]:
df.shape

(4999, 19)

In [4]:
y_train, y_val, y_test, train_scaled, val_scaled, test_scaled,train, validate, test, df = wt.model_prep(df)

In [5]:
df.shape, train.shape, validate.shape, test.shape, train_scaled.shape, val_scaled.shape, test_scaled.shape

((4999, 25),
 (2799, 25),
 (1200, 25),
 (1000, 25),
 (2799, 24),
 (1200, 24),
 (1000, 24))

In [6]:
train_scaled.head(3)

Unnamed: 0,rent,term,monthly_inc,total_inc,age,risk_score,prop_id_Arizona,prop_id_California,prop_id_Colorado,prop_id_Georgia,prop_id_North Carolina,prop_id_Tennessee,prop_id_Texas,prop_id_Washington,Recommendation_A-Criminal History Meets Requirements,Recommendation_A-Meets Requirements,Recommendation_Accept,Recommendation_Accept with Conditions (Extra Deposit),Recommendation_Accept with Conditions (Guarantor),Recommendation_Accept with Conditions (Max Deposit),Recommendation_Accept with Extra Security Deposit,Recommendation_Guarantor Not Qualified,Recommendation_Qualified Guarantor,Recommendation_REJECT
2707,-0.09019,12,-0.334993,-0.378361,-0.513961,-2.308439,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4983,0.122051,12,0.057028,-0.221645,-0.155214,0.59371,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
4959,-0.031824,9,0.285852,0.568771,0.382907,0.367771,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0


In [7]:
train.head(3)

Unnamed: 0,rent,term,monthly_inc,total_inc,age,risk_score,bad_resident,prop_id_Arizona,prop_id_California,prop_id_Colorado,prop_id_Georgia,prop_id_North Carolina,prop_id_Tennessee,prop_id_Texas,prop_id_Washington,Recommendation_A-Criminal History Meets Requirements,Recommendation_A-Meets Requirements,Recommendation_Accept,Recommendation_Accept with Conditions (Extra Deposit),Recommendation_Accept with Conditions (Guarantor),Recommendation_Accept with Conditions (Max Deposit),Recommendation_Accept with Extra Security Deposit,Recommendation_Guarantor Not Qualified,Recommendation_Qualified Guarantor,Recommendation_REJECT
2707,1639,12,3900.0,46800.0,27,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4983,1719,12,6600.0,75684.0,31,745,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
4959,1661,9,8176.0,221364.0,37,687,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0


In [8]:
train.bad_resident.value_counts()

0    2692
1     107
Name: bad_resident, dtype: int64

In [9]:
def upsample(train):
    '''takes a train dataframe, upsamples minority and returns upsampled X_train and upsampled y_train'''
    
    
    df_majority = train[train.bad_resident ==0]
    df_minority = train[train.bad_resident ==1]
    df_minority_upsampled = resample(df_minority, replace=True, n_samples=300 , random_state=91)
    
    # get upsampled_df
    df_upsampled = pd.concat([df_minority_upsampled, df_majority])
    
    # scale df_upsampled
    df_upsampled[['rent', 'monthly_inc', 'total_inc','age', 'risk_score']] = StandardScaler().fit_transform(df_upsampled[['rent', 'monthly_inc', 'total_inc','age', 'risk_score']])
    
    # split scaled df_upsampled
    X_train_upsampled = df_upsampled.drop('bad_resident',axis=1)
    y_train_upsampled = df_upsampled['bad_resident']
    
    return X_train_upsampled, y_train_upsampled

In [10]:

def get_knn(train, X_validate, y_validate):
    ''' takes a train dataframe, X_ validate, y_validate, print KNN  confusion matrix and classifaction report on train and validate data'''
    
    # get upsampled X_train and Y_train
    X_train_upsampled, y_train_upsampled = upsample(train)
    
    # create model
    knn= KNeighborsClassifier(n_neighbors =3)

    # fit the model to train data
    knn.fit(X_train_upsampled, y_train_upsampled)
    
    # make prediction on train obeservations
    y_pred = knn.predict(X_train_upsampled)
    
    # make prediction on validate obeservations
    y_pred_val = knn.predict(X_validate)
    
    # get confusion matrix
    confusion_matrix_train = metrics.confusion_matrix(y_train_upsampled, y_pred)
    confusion_matrix_val = metrics.confusion_matrix(y_validate, y_pred_val)
    
    # print confusion matrix and classification report
    print('Confusion Matrix of upsampled Train:')
    print(confusion_matrix_train)
    print('\n Classification Report on Train:')
    print(classification_report(y_train_upsampled, y_pred))
    print('-----------------------------------------------------------')
    print('Confusion Matrix of Validate:')
    print( confusion_matrix_val)
    print('\n Classification Report on Validate: ')
    print(classification_report(y_validate, y_pred_val))
    

In [11]:
get_knn(train, val_scaled, y_val)

Confusion Matrix of upsampled Train:
[[2624   68]
 [  18  282]]

 Classification Report on Train:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      2692
           1       0.81      0.94      0.87       300

    accuracy                           0.97      2992
   macro avg       0.90      0.96      0.93      2992
weighted avg       0.97      0.97      0.97      2992

-----------------------------------------------------------
Confusion Matrix of Validate:
[[1100   54]
 [  40    6]]

 Classification Report on Validate: 
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      1154
           1       0.10      0.13      0.11        46

    accuracy                           0.92      1200
   macro avg       0.53      0.54      0.54      1200
weighted avg       0.93      0.92      0.93      1200



In [12]:
sm = SMOTENC(categorical_features=[0], random_state=91, sampling_strategy=.6)

In [13]:
X_train_upscaled, y_train_upscaled = sm.fit_resample(train_scaled, y_train)

In [14]:
X_train_upscaled.prop_id_Texas.value_counts()

0    2597
1    1710
Name: prop_id_Texas, dtype: int64

In [15]:
train_scaled.prop_id_Texas.value_counts()

1    1476
0    1323
Name: prop_id_Texas, dtype: int64