In [19]:
import pandas as pd
import numpy as np 

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


import wrangle as w
import explore as e

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = w.get_cws_data()
df.head()

Unnamed: 0,id,total_charges,amount_paid,open,charge_code,description,prop_id,charge_name,sStatus,rent,term,monthly_inc,GuarantorRequired,total_inc,Recommendation,age,risk_score,reason,bad_resident
0,3946,75.0,75.0,0,131,"Uncllctbl key, remote fees",136,"Uncllctbl key, remote fees",Current,1311,15,52500,False,666000,Accept,33,745,Insufficient period of Residence History,1
1,3962,13.66,13.66,0,112,Water 8/1/2021-9/1/2021,136,Utility-water charges,Current,1661,12,65000,False,889992,Accept with Conditions (Extra Deposit),27,758,No Credit Experience,0
2,4050,46.17,46.17,0,155,Damage charges - Clean,136,Damage charges,Current,1412,12,48000,False,576000,Accept with Conditions (Extra Deposit),33,666,Insufficient period of Residence History,1
3,4948,9.76,9.76,0,111,Sewer Charge,140,Utility-sewer charges,Current,1377,18,72000,False,0,A-Criminal History Meets Requirements,39,0,Meets All Property Requirements,0
4,5001,6.85,6.85,0,113,Sewer Base Charge,140,Utility-water flat base charges,Current,2199,13,0,True,54996,Accept with Conditions (Guarantor),23,752,,0


In [3]:
train, validate, test, X_train, y_train, X_val, y_val, X_test, y_test = w.train_vailidate_test_split(df, 'bad_resident', 'bad_resident')

In [4]:
# Replacing any blank total income with monthly income * 12
train.total_inc = np.where(train.total_inc == 0, train.monthly_inc * 12, train.total_inc)

# Replacing any blank monthly income with total income / 12
train.monthly_inc = np.where(train.monthly_inc == 0, train.total_inc/12, train.monthly_inc)

In [5]:
#scaler = StandardScaler()
#w.scale_splits(X_train, X_val, X_test, scaler)

In [6]:
X_train = train.drop(columns=['open', 'charge_code', 'description', 'charge_name', 'sStatus','GuarantorRequired', 'reason', 'bad_resident'])

y_train = train.bad_resident
#X_val = validate.drop(columns=target)
#y_val = validate[target]
#X_test = test.drop(columns=target)
#y_test = test[target]
    

In [7]:
X_train

Unnamed: 0,id,total_charges,amount_paid,prop_id,rent,term,monthly_inc,total_inc,Recommendation,age,risk_score
2707,63851,2.00,2.00,126,1639,12,3900.0,46800,A-Criminal History Meets Requirements,27,0
4983,73919,1650.00,1650.00,103,1719,12,6600.0,75684,Accept with Conditions (Extra Deposit),31,745
4959,73653,4.00,4.00,84,1661,9,8176.0,221364,Accept with Conditions (Extra Deposit),37,687
4492,70961,0.33,0.33,80,1579,14,5000.0,101976,Accept,29,782
995,33019,1.17,1.17,139,1779,13,7500.0,90000,Accept,31,724
...,...,...,...,...,...,...,...,...,...,...,...
987,32986,17.20,0.00,65,1789,5,5600.0,67200,REJECT,33,564
2512,63297,50.00,50.00,150,2149,12,6334.0,6000,Accept,35,740
443,21611,40.00,40.00,134,1849,14,5200.0,87492,Accept with Conditions (Max Deposit),23,598
4028,68721,4.00,4.00,69,1489,13,0.0,0,Accept with Conditions (Guarantor),22,737


In [8]:
df[(df['age'] == 122) | (df['age'] == 3)]

Unnamed: 0,id,total_charges,amount_paid,open,charge_code,description,prop_id,charge_name,sStatus,rent,term,monthly_inc,GuarantorRequired,total_inc,Recommendation,age,risk_score,reason,bad_resident
4891,73103,-50.0,-50.0,0,199,Corporate Housing,67,"Waive application, trnsfr fees",Current,1889,6,10000,False,0,A-Meets Requirements,3,0,Meets All Property Requirements,0


In [9]:
def remove_outliers(data, col):
    
    q1, q3 = np.percentile(data[col], [5, 95])
    iqr = q3 - q1
    
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    
    filtered_data = [x for x in data[col] if (x >= lower_bound) and (x <= upper_bound)]
    
    filtered_age = filtered_data
    filtered_df = data.loc[data[col].isin(filtered_age)]
    
    return filtered_df, filtered_age



In [10]:
filtered_df, filtered_age = remove_outliers(df,'age')

In [11]:
filtered_df = df.loc[df['age'].isin(filtered_age)]
filtered_df 

Unnamed: 0,id,total_charges,amount_paid,open,charge_code,description,prop_id,charge_name,sStatus,rent,term,monthly_inc,GuarantorRequired,total_inc,Recommendation,age,risk_score,reason,bad_resident
0,3946,75.00,75.00,0,131,"Uncllctbl key, remote fees",136,"Uncllctbl key, remote fees",Current,1311,15,52500,False,666000,Accept,33,745,Insufficient period of Residence History,1
1,3962,13.66,13.66,0,112,Water 8/1/2021-9/1/2021,136,Utility-water charges,Current,1661,12,65000,False,889992,Accept with Conditions (Extra Deposit),27,758,No Credit Experience,0
2,4050,46.17,46.17,0,155,Damage charges - Clean,136,Damage charges,Current,1412,12,48000,False,576000,Accept with Conditions (Extra Deposit),33,666,Insufficient period of Residence History,1
3,4948,9.76,9.76,0,111,Sewer Charge,140,Utility-sewer charges,Current,1377,18,72000,False,0,A-Criminal History Meets Requirements,39,0,Meets All Property Requirements,0
4,5001,6.85,6.85,0,113,Sewer Base Charge,140,Utility-water flat base charges,Current,2199,13,0,True,54996,Accept with Conditions (Guarantor),23,752,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4994,74033,1439.37,1439.37,0,74,Rent for 29 days,85,Rent,Current,1489,12,0,True,0,Accept with Conditions (Guarantor),24,708,Insufficient Income to Support Rent,0
4995,74056,150.00,150.00,0,162,Deposit Waiver Fee (Amount subject to screenin...,152,Deposit waiver fee,Current,1859,13,10000,False,121776,Accept,32,786,Meets All Property Requirements,0
4996,74230,5.00,5.00,0,186,Convenience Fee,61,OSP resident convenience fee,Current,1439,12,4350,False,57432,Accept with Conditions (Extra Deposit),36,691,Limited period of Residence History,0
4997,74231,5.00,5.00,0,186,Convenience Fee (ray farmer),83,OSP resident convenience fee,Current,1739,12,7916,False,94992,Accept,47,770,Meets All Property Requirements,0


In [12]:
len(filtered_age) == len(df['age']) # outliers removed

True

In [13]:
y_train, y_val, y_test, train_scaled, val_scaled, test_scaled = w.model_prep(df)

In [14]:
train_scaled

Unnamed: 0,rent,term,monthly_inc,total_inc,age,risk_score,prop_id_Arizona,prop_id_California,prop_id_Colorado,prop_id_Georgia,...,Recommendation_A-Criminal History Meets Requirements,Recommendation_A-Meets Requirements,Recommendation_Accept,Recommendation_Accept with Conditions (Extra Deposit),Recommendation_Accept with Conditions (Guarantor),Recommendation_Accept with Conditions (Max Deposit),Recommendation_Accept with Extra Security Deposit,Recommendation_Guarantor Not Qualified,Recommendation_Qualified Guarantor,Recommendation_REJECT
2707,-0.090190,12,-0.334993,-0.378361,-0.513961,-2.308439,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4983,0.122051,12,0.057028,-0.221645,-0.155214,0.593710,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4959,-0.031824,9,0.285852,0.568771,0.382907,0.367771,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4492,-0.249372,14,-0.175281,-0.078992,-0.334587,0.737843,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
995,0.281233,13,0.187701,-0.143971,-0.155214,0.511904,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987,0.307763,5,-0.088165,-0.267677,0.024160,-0.111376,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2512,1.262851,12,0.018406,-0.599730,0.203534,0.574232,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
443,0.466944,14,-0.146243,-0.157578,-0.872708,0.021071,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4028,-0.488144,13,-0.010640,-0.006752,-0.962395,0.562546,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [15]:
w.col_drop(df)

Unnamed: 0,prop_id,rent,term,monthly_inc,total_inc,Recommendation,age,risk_score,bad_resident
0,136,1311,15,52500,666000,Accept,33,745,1
1,136,1661,12,65000,889992,Accept with Conditions (Extra Deposit),27,758,0
2,136,1412,12,48000,576000,Accept with Conditions (Extra Deposit),33,666,1
3,140,1377,18,72000,0,A-Criminal History Meets Requirements,39,0,0
4,140,2199,13,0,54996,Accept with Conditions (Guarantor),23,752,0
...,...,...,...,...,...,...,...,...,...
4994,85,1489,12,0,0,Accept with Conditions (Guarantor),24,708,0
4995,152,1859,13,10000,121776,Accept,32,786,0
4996,61,1439,12,4350,57432,Accept with Conditions (Extra Deposit),36,691,0
4997,83,1739,12,7916,94992,Accept,47,770,0


In [39]:
# logistic regression model with the train dataset 

logit = LogisticRegression(random_state = 91)

logit.fit(train_scaled, y_train)

y_pred = logit.predict(train_scaled)

accuracy = logit.score(train_scaled, y_train)

print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2%}')

Accuracy of Logistic Regression classifier on training set: 96.21%


In [40]:
# logistic regression model with the validate dataset

logit = LogisticRegression(random_state = 91)

logit.fit(val_scaled, y_val)

y_pred_val = logit.predict(val_scaled)

accuracy = logit.score(val_scaled, y_val)

print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2%}')

Accuracy of Logistic Regression classifier on training set: 96.17%


In [41]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      2692
           1       1.00      0.01      0.02       107

    accuracy                           0.96      2799
   macro avg       0.98      0.50      0.50      2799
weighted avg       0.96      0.96      0.94      2799



In [42]:
labels = {0: 'actual', 1: 'predicted'}

cm = confusion_matrix(y_train, y_pred)

pd.DataFrame(cm, index = labels.values(), columns = labels.values())

Unnamed: 0,actual,predicted
actual,2692,0
predicted,106,1
