In [13]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import wrangle as w
import explore as e

import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [14]:
# Load the dataset through the project's wrangle module (imported above as `w`).
df = w.get_cws_data()
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,id,total_charges,amount_paid,open,charge_code,description,prop_id,charge_name,sStatus,rent,term,monthly_inc,GuarantorRequired,total_inc,Recommendation,age,risk_score,reason,bad_resident
0,3946,75.0,75.0,0,131,"Uncllctbl key, remote fees",136,"Uncllctbl key, remote fees",Current,1311,15,52500,False,666000,Accept,33,745,Insufficient period of Residence History,1
1,3962,13.66,13.66,0,112,Water 8/1/2021-9/1/2021,136,Utility-water charges,Current,1661,12,65000,False,889992,Accept with Conditions (Extra Deposit),27,758,No Credit Experience,0
2,4050,46.17,46.17,0,155,Damage charges - Clean,136,Damage charges,Current,1412,12,48000,False,576000,Accept with Conditions (Extra Deposit),33,666,Insufficient period of Residence History,1
3,4948,9.76,9.76,0,111,Sewer Charge,140,Utility-sewer charges,Current,1377,18,72000,False,0,A-Criminal History Meets Requirements,39,0,Meets All Property Requirements,0
4,5001,6.85,6.85,0,113,Sewer Base Charge,140,Utility-water flat base charges,Current,2199,13,0,True,54996,Accept with Conditions (Guarantor),23,752,,0


In [15]:
# Unpack the six objects returned by w.model_prep for the full df: three target
# sets and three scaled feature sets (train / validate / test, judging by the names).
# NOTE(review): these same six names are reassigned further down when model_prep is
# re-run on the downsampled frame, so their contents depend on which cell ran last.
y_train, y_val, y_test, train_scaled, val_scaled, test_scaled = w.model_prep(df)

In [16]:
# Preview the prepared training features.
# NOTE(review): in the displayed output `term` holds raw values (12, 9, 14, ...) while
# the other numeric columns look standardized -- confirm whether w.model_prep is meant
# to leave it unscaled, since a distance-based model such as KNN is sensitive to
# feature scale.
train_scaled.head()

Unnamed: 0,rent,term,monthly_inc,total_inc,age,risk_score,prop_id_Arizona,prop_id_California,prop_id_Colorado,prop_id_Georgia,prop_id_North Carolina,prop_id_Tennessee,prop_id_Texas,prop_id_Washington,Recommendation_A-Criminal History Meets Requirements,Recommendation_A-Meets Requirements,Recommendation_Accept,Recommendation_Accept with Conditions (Extra Deposit),Recommendation_Accept with Conditions (Guarantor),Recommendation_Accept with Conditions (Max Deposit),Recommendation_Accept with Extra Security Deposit,Recommendation_Guarantor Not Qualified,Recommendation_Qualified Guarantor,Recommendation_REJECT
2707,-0.09019,12,-0.334993,-0.378361,-0.513961,-2.308439,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4983,0.122051,12,0.057028,-0.221645,-0.155214,0.59371,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
4959,-0.031824,9,0.285852,0.568771,0.382907,0.367771,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
4492,-0.249372,14,-0.175281,-0.078992,-0.334587,0.737843,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
995,0.281233,13,0.187701,-0.143971,-0.155214,0.511904,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [17]:
# Count rows per target class; the output shows class 1 is a small minority of the data.
df.bad_resident.value_counts()

0    4808
1     191
Name: bad_resident, dtype: int64

In [18]:
# Separate the rows by target class so the larger class can be downsampled:
# bad_resident == 0 is the majority class, bad_resident == 1 the minority.
df_majority = df.loc[df['bad_resident'] == 0]
df_minority = df.loc[df['bad_resident'] == 1]


In [25]:
# Downsample the majority class, without replacement, to the size of the minority
# class so the two classes end up balanced. The target size is read from df_minority
# instead of being hard-coded, so the cell stays correct if the underlying data
# changes; random_state pins the draw for reproducibility.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=1,
)

In [26]:
# Stack the downsampled majority rows on top of all minority rows.
# ignore_index is not passed, so each row keeps its original index label from df.
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [28]:
# Confirm that both classes now have the same number of rows.
df_downsampled.bad_resident.value_counts()

0    191
1    191
Name: bad_resident, dtype: int64

In [35]:
# Re-run the prep on the balanced frame; this overwrites the six variables that were
# produced earlier from the full df.
# NOTE(review): downsampling happens before model_prep, so the validate set is balanced
# too (46/46 support in the validate report further down), and presumably the test set
# as well. Scores on those sets will not reflect the roughly 4% positive rate seen in
# the full data. Consider splitting first and resampling only the train portion.
y_train, y_val, y_test, train_scaled, val_scaled, test_scaled = w.model_prep(df_downsampled)

In [54]:
def get_knn(X_train, X_validate, y_train, y_validate, n_neighbors=3):
    '''
    Fit a K-Nearest Neighbors classifier and print classification reports.

    The model is fit on the train split only. A classification report is then
    printed for the train split and for the validate split so the two can be
    compared side by side.

    Parameters
    ----------
    X_train, X_validate :
        Feature sets for the train and validate splits; passed unchanged to
        KNeighborsClassifier.fit / .predict.
    y_train, y_validate :
        True labels for the corresponding splits.
    n_neighbors : int, default 3
        Number of neighbors used by the classifier. The default keeps the
        behavior of existing four-argument calls.

    Returns
    -------
    None
        Both reports are printed; nothing is returned.
    '''
    # create model
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    # fit the model to train data only
    knn.fit(X_train, y_train)

    # make predictions on train and validate observations
    y_pred = knn.predict(X_train)
    y_pred_val = knn.predict(X_validate)

    print('Classification Report on Train: ')
    # the extra '\n' argument leaves blank lines between the two reports
    print(classification_report(y_train, y_pred), '\n')

    print('Classification Report on Validate: ')
    print(classification_report(y_validate, y_pred_val))
    

In [55]:
# Fit the KNN model and print train / validate reports for the most recently prepared
# splits (the downsampled ones when the cells are run top to bottom).
get_knn(train_scaled,val_scaled, y_train, y_val)

Classification Report on Train: 
              precision    recall  f1-score   support

           0       0.80      0.77      0.79       106
           1       0.78      0.81      0.80       107

    accuracy                           0.79       213
   macro avg       0.79      0.79      0.79       213
weighted avg       0.79      0.79      0.79       213
 

Classification Report on Validate: 
              precision    recall  f1-score   support

           0       0.70      0.80      0.75        46
           1       0.77      0.65      0.71        46

    accuracy                           0.73        92
   macro avg       0.73      0.73      0.73        92
weighted avg       0.73      0.73      0.73        92

