In [None]:
# Import our libraries 

# Pandas and numpy for data wrangling
import pandas as pd
import numpy as np

# Seaborn / matplotlib for visualization 
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline

# Import the trees from sklearn
from sklearn import tree

# Helper function to split our data
from sklearn.model_selection import train_test_split

# Helper functions to evaluate our model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score 

# Helper function for hyper-parameter tuning.
from sklearn.model_selection import GridSearchCV

# Import our Decision Tree
from sklearn.tree import DecisionTreeClassifier 

# Import our Random Forest 
from sklearn.ensemble import RandomForestClassifier

# Library for visualizing our tree
# If you get an error, run 'conda install python-graphviz' in your terminal
import graphviz



In [None]:
df = pd.read_csv('Project Dataset/Application_Data.csv')
# ['NAME_CONTRACT_STATUS']=df['NAME_CONTRACT_STATUS'].astype(int)
df.head()

In [None]:
# check the shape of the dataset
df.shape

In [None]:
# check for null
df.isnull().sum()

In [None]:
# check how many duplicated row in the dataset
df.duplicated().sum()

In [None]:
# Bar chart showing how many applications ended in each contract status
import matplotlib
# Pass the column through `x=`: recent seaborn releases accept only `data`
# positionally, so a bare positional Series is not treated as the x variable.
sns.countplot(x=df['NAME_CONTRACT_STATUS'])
plt.xlabel("Contract Status")
plt.ylabel("Count of Contract Status")
plt.title("Distribution of Contract Status")
plt.show()

In [None]:
# Collapse the contract status into a binary label stored as strings:
#   'Approved' and 'Unused offer' -> '1' (the offer was accepted)
#   'Refused'                     -> '0'
# Canceled applications do not contribute to the research topic and are removed.
status_codes = {'Approved': '1', 'Refused': '0', 'Unused offer': '1'}
df['NAME_CONTRACT_STATUS'] = df['NAME_CONTRACT_STATUS'].replace(status_codes)
canceled_rows = df.index[df['NAME_CONTRACT_STATUS'] == 'Canceled']
df.drop(canceled_rows, inplace=True)


In [None]:
# #since we are not yet interested in the reject reason, drop it for now
# #'NAME_SELLER_INDUSTRY'
# # we are not interested in how approved loan is paid back now, drop it
# df.drop('NAME_PAYMENT_TYPE', axis=1, inplace=True)

# df.drop(['SELLERPLACE_AREA','CNT_PAYMENT','DAYS_FIRST_DRAWING','DAYS_FIRST_DUE'
#         ,'DAYS_LAST_DUE_1ST_VERSION','DAYS_LAST_DUE','DAYS_TERMINATION','NFLAG_INSURED_ON_APPROVAL'], axis=1, inplace=True)

In [None]:
# check if the Canceled loan still exist
df['NAME_CONTRACT_STATUS'].value_counts()

In [None]:
# check the shape of the dataset again, obviously, the canceled data is dropped
df.shape

In [None]:
# show the histogram again, we now only have approved abd refused application
sns.countplot(df.NAME_CONTRACT_STATUS)
plt.xlabel("Contract Status")
plt.ylabel("Count of Contract Status")
plt.title("Distribution of Contract Status")
plt.show()


In [None]:
# convert text type data into numerical data
# df = pd.get_dummies(df, columns=['NAME_CONTRACT_TYPE','WEEKDAY_APPR_PROCESS_START','FLAG_LAST_APPL_PER_CONTRACT',
#                                  'NAME_CASH_LOAN_PURPOSE','NAME_TYPE_SUITE','NAME_CLIENT_TYPE',
#                                 'NAME_GOODS_CATEGORY','NAME_PORTFOLIO','NAME_PRODUCT_TYPE',
#                                 'CHANNEL_TYPE','NAME_YIELD_GROUP','PRODUCT_COMBINATION','NFLAG_INSURED_ON_APPROVAL'], drop_first=True)

In [None]:
#Function to calculate meta-data to identify % of data is missing in each column
def meta_data(data):
    """Summarise missing values, cardinality and dtype for every column of `data`.

    Returns a DataFrame indexed by column name with the columns
    'Total' (number of nulls), 'Percent' (nulls as a percentage of rows),
    'Unique' (result of `nunique()`) and 'Data_Type' (column dtype),
    sorted by 'Percent' from highest to lowest.
    """
    null_mask = data.isnull()
    missing_total = null_mask.sum()
    missing_percent = missing_total / null_mask.count() * 100
    summary = pd.concat(
        [missing_total, missing_percent, data.nunique(), data.dtypes],
        axis=1,
        keys=['Total', 'Percent', 'Unique', 'Data_Type'],
    )
    return summary.sort_values(by="Percent", ascending=False)

In [None]:
#calculating meta-data for application_data
app_meta_data=meta_data(df)
app_meta_data.head(20)

In [None]:
#dropping columns with more than 57% missing values 
#Selected 57% because we don't want to drop EXT_SOURCE_1 which is an important variable
cols_to_keep=list(app_meta_data[(app_meta_data.Percent<57)].index)
application_data=df[cols_to_keep]
application_data.describe()

In [None]:
#deal with missing AMT_ANNUITY values
df.isnull().sum()

In [None]:
df["NAME_CONTRACT_STATUS"].value_counts()

In [None]:
# Average annuity of approved and of refused applications.
# The status column was recoded earlier ('Approved'/'Unused offer' -> '1',
# 'Refused' -> '0'), so the groups must be selected with the recoded labels;
# filtering on the raw text labels matches no rows and yields NaN means.
approved_mask = df['NAME_CONTRACT_STATUS'] == '1'
refused_mask = df['NAME_CONTRACT_STATUS'] == '0'
average_approved_AMT_ANNUITY = df.loc[approved_mask, 'AMT_ANNUITY'].mean()
average_Refused_ANNUITY = df.loc[refused_mask, 'AMT_ANNUITY'].mean()
print("the average annuity of approved applications is ", average_approved_AMT_ANNUITY)
print("the average annuity of refused applications is ", average_Refused_ANNUITY)

In [None]:
#it seems like we have to fill something into the null values
#some AMT_ANNUITY is empty, I will replace the the average AMT_ANNUITY based on its NAME_CONTRACT_STATUS

df['AMT_ANNUITY'] = np.where(((df['AMT_ANNUITY'].isnull()==True) & (df['NAME_CONTRACT_STATUS'] == '1') ), average_approved_AMT_ANNUITY,df['AMT_ANNUITY'] )
df['AMT_ANNUITY'] = np.where(((df['AMT_ANNUITY'].isnull()==True) & (df['NAME_CONTRACT_STATUS'] == '0') ), average_Refused_ANNUITY,df['AMT_ANNUITY'] )

In [None]:
df.head(30)
#AMT_ANNUITY data are all filled

In [None]:
# delete the row that miss AMT_CREDIT data
df = df.dropna( how='any',subset=['AMT_CREDIT'])
df.isnull().sum()

In [None]:
df['AMT_DOWN_PAYMENT'] = np.where(((df['AMT_DOWN_PAYMENT'].isnull()==True) ), 0,df['AMT_DOWN_PAYMENT'] )

In [None]:
df['OWN_CAR_AGE'] = np.where(((df['OWN_CAR_AGE'].isnull()==True) ), 0,df['OWN_CAR_AGE'] )

In [None]:
# Label missing occupations as 'No Specified' and keep every existing occupation.
# (The fallback must be OCCUPATION_TYPE itself; falling back to OWN_CAR_AGE would
# overwrite all known occupations with car ages.)
df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].fillna('No Specified')

In [None]:
# CNT_FAM_MEMBERS is later used as a model feature, so missing values are imputed
# with the column median. A text placeholder such as 'No Specified' would mix
# strings into the column and cannot be converted to a number when a model is fitted.
df['CNT_FAM_MEMBERS'] = df['CNT_FAM_MEMBERS'].fillna(df['CNT_FAM_MEMBERS'].median())

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
df = df.dropna( how='any',subset=['CODE_GENDER'])

In [None]:
df = df.dropna( how='any',subset=['AMT_GOODS_PRICE'])

In [None]:
df = df.dropna( how='any',subset=['CNT_PAYMENT'])

In [None]:
df.isnull().sum()

In [None]:
df['NAME_TYPE_SUITE'] = np.where(((df['NAME_TYPE_SUITE'].isnull()==True) ), 'No Specified',df['NAME_TYPE_SUITE'] )

In [None]:
# # make missing downpayments to zero
# #delete
# df['AMT_DOWN_PAYMENT'] = np.where(((df['AMT_DOWN_PAYMENT'].isnull()==True)), 0,df['AMT_DOWN_PAYMENT'])
# df.isnull().sum()

In [None]:
#client might not tell the goods' price, so just keep missing AMT_GOODS_PRICE values empty 

In [None]:
# #make missing RATE_DOWN_PAYMENT to zero since the AMT_DOWN_PAYMENT is zero
# df['AMT_DOWN_PAYMENT'] = np.where(((df['RATE_DOWN_PAYMENT'].isnull()==True)), 0,df['RATE_DOWN_PAYMENT'])
# df.isnull().sum()
# df=df.drop(['AMT_DOWN_PAYMENT','AMT_DOWN_PAYMENT'],1)

In [None]:
# How many applications are there per loan (contract) type?
# The column is passed through `x=` because recent seaborn releases accept only
# `data` positionally. Labels and title describe the plotted column
# (contract type, not contract status).
sns.countplot(x=df['NAME_CONTRACT_TYPE'])
plt.xlabel("Contract Type")
plt.ylabel("Count of Contract Type")
plt.title("Distribution of Contract Type")
plt.show()

In [None]:
# Drop rows that are missing NAME_TYPE_SUITE or AMT_GOODS_PRICE, then show the
# remaining null counts.
# NAME_TYPE_SUITE records who accompanied the client when applying for the loan.
# NOTE(review): this comment previously named NFLAG_INSURED_ON_APPROVAL (whether the
# borrower requested insurance) instead of AMT_GOODS_PRICE — confirm which column
# was intended. Earlier cells already fill NAME_TYPE_SUITE and drop rows without
# AMT_GOODS_PRICE, so this call may not remove anything.
df.dropna(subset=['NAME_TYPE_SUITE','AMT_GOODS_PRICE'],inplace=True)
df.isnull().sum()

In [None]:
# Status distribution after cleaning. The column is passed through `x=` because
# recent seaborn releases accept only `data` positionally.
sns.countplot(x=df['NAME_CONTRACT_STATUS'])
plt.xlabel("Contract Status")
plt.ylabel("Count of Contract Status")
plt.title("Distribution of Contract Status")
plt.show()


In [None]:
# get the total amount of approved and refused applications
approved=df[df.NAME_CONTRACT_STATUS=='1']
refused=df[df.NAME_CONTRACT_STATUS=='0']

In [None]:
#get the percentage
percentage_approved=(len(approved)*100)/len(df)
percentage_refused=(len(refused)*100)/len(df)
print("The Percentage of people whose loans have been Approved is:",round(percentage_approved,3),"%")
print("The Percentage of people whose loans have been Refused is:",round(percentage_refused,3),"%")

In [None]:
#build a function to disaply the numbers of value in a column sorted by refused and approved 
#applications
def plot_charts(var, label_rotation,horizontal_layout):
    """Draw two count plots of column `var`: refused loans first, approved loans second.

    Reads the notebook-level `refused` and `approved` DataFrames.

    var               -- name of the column to count
    label_rotation    -- if truthy, x tick labels are rotated by 90 degrees
    horizontal_layout -- if truthy, the panels sit side by side (15x5 figure);
                         otherwise they are stacked vertically (15x30 figure)
    """
    if horizontal_layout:
        layout = dict(ncols=2, figsize=(15,5))
    else:
        layout = dict(nrows=2, figsize=(15,30))
    fig, (ax1, ax2) = plt.subplots(**layout)

    panels = ((ax1, refused, "Refused"), (ax2, approved, "Approved"))
    for axis, frame, heading in panels:
        # Bars are ordered from the most to the least frequent category
        category_order = frame[var].value_counts().index
        chart = sns.countplot(ax=axis, x=frame[var], data=frame, order=category_order)
        axis.set_title(heading, fontsize=10)
        axis.set_xlabel('%s' %var)
        axis.set_ylabel("Count of Loans")
        if label_rotation:
            chart.set_xticklabels(chart.get_xticklabels(), rotation=90)
    plt.show()

In [None]:
# List the object-dtype (text) columns that are candidates for the charts below
df.select_dtypes('object').columns

In [None]:
plot_charts('PRODUCT_COMBINATION', label_rotation=True,horizontal_layout=True)
# Author's reading of the chart: most refused applications are plain cash requests,
# while most approved applications are for mortgage and mobile products.

In [None]:
plot_charts('NAME_YIELD_GROUP', label_rotation=True,horizontal_layout=True)
# Author's reading of the chart: low-interest-rate applications are likely to be refused.

In [None]:
plot_charts('NAME_SELLER_INDUSTRY', label_rotation=True,horizontal_layout=True)

In [None]:
plot_charts('NAME_CASH_LOAN_PURPOSE', label_rotation=True,horizontal_layout=True)

In [None]:
#sns.pairplot(df, hue='NAME_CONTRACT_STATUS');

In [None]:
# #drop interest rate column since most of them are missing
# df = df.drop('RATE_INTEREST_PRIMARY', 1)
# df = df.drop('RATE_INTEREST_PRIVILEGED', 1)
# df = df.drop('SK_ID_PREV', 1)
# Drop the SK_ID_CURR column. `columns=` is used because the positional axis
# argument (`drop(label, 1)`) is no longer accepted by pandas 2.0.
df = df.drop(columns='SK_ID_CURR')
df.isnull().sum()

In [None]:
# Drop the columns that are not part of the model's feature list, in a single call.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which is no
# longer accepted by pandas 2.0, and each label is listed once.
unused_columns = [
    'NFLAG_LAST_APPL_IN_DAY',
    'PRODUCT_COMBINATION',
    'NAME_CASH_LOAN_PURPOSE',
    'NAME_PAYMENT_TYPE',
    'REGION_POPULATION_RELATIVE',
    'DAYS_REGISTRATION',
    'AMT_ANNUITY',
    'AMT_CREDIT',
    'OCCUPATION_TYPE',
    'FLAG_WORK_PHONE',
]
df = df.drop(columns=unused_columns)

In [None]:
df.isnull().sum()

In [None]:
df['NAME_YIELD_GROUP'].value_counts()

In [None]:
df['CNT_PAYMENT'].value_counts()
x=0;
for i in df['CNT_PAYMENT']:
    if i>x:
        x=i
print(x)

In [None]:
df.head()

In [None]:
df['CODE_GENDER'] = np.where(((df['CODE_GENDER']=='M') ), '1','0')

In [None]:
df['FLAG_OWN_CAR'] = np.where(((df['FLAG_OWN_CAR']=='Y') ), '1','0')

In [None]:
df['FLAG_OWN_REALTY'] = np.where(((df['FLAG_OWN_REALTY']=='Y') ), '1','0')

In [None]:
df['NAME_INCOME_TYPE'].value_counts()

In [None]:
df['NAME_EDUCATION_TYPE'].value_counts()

In [None]:
df['NAME_FAMILY_STATUS'].value_counts()

In [None]:
df['NAME_HOUSING_TYPE'].value_counts()

In [None]:
df = pd.get_dummies(df, columns=['NAME_CONTRACT_TYPE','WEEKDAY_APPR_PROCESS_START','NAME_TYPE_SUITE',
                                'NAME_CLIENT_TYPE','NAME_GOODS_CATEGORY','NAME_SELLER_INDUSTRY','NAME_INCOME_TYPE',
                                 'NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','NAME_YIELD_GROUP'
                                ], drop_first=True)

In [None]:
for col_name in df.columns: 
    print(f"'{col_name}',")

In [None]:
# Prepare the data for model building.
# The target column NAME_CONTRACT_STATUS is deliberately NOT part of the feature
# list: including it would hand the label to the model as an input (label leakage)
# and make every evaluation score meaningless.
selected_features = ['AMT_APPLICATION',
'AMT_DOWN_PAYMENT',
'AMT_GOODS_PRICE',
'HOUR_APPR_PROCESS_START',
'CNT_PAYMENT',
'CODE_GENDER',
'FLAG_OWN_CAR',
'FLAG_OWN_REALTY',
'CNT_CHILDREN',
'AMT_INCOME_TOTAL',
'DAYS_BIRTH',
'DAYS_EMPLOYED',
'OWN_CAR_AGE',
'FLAG_MOBIL',
'FLAG_EMAIL',
'CNT_FAM_MEMBERS',
'NAME_CONTRACT_TYPE_Consumer loans',
'NAME_CONTRACT_TYPE_Revolving loans',
'WEEKDAY_APPR_PROCESS_START_MONDAY',
'WEEKDAY_APPR_PROCESS_START_SATURDAY',
'WEEKDAY_APPR_PROCESS_START_SUNDAY',
'WEEKDAY_APPR_PROCESS_START_THURSDAY',
'WEEKDAY_APPR_PROCESS_START_TUESDAY',
'WEEKDAY_APPR_PROCESS_START_WEDNESDAY',
'NAME_TYPE_SUITE_Family',
'NAME_TYPE_SUITE_Group of people',
'NAME_TYPE_SUITE_No Specified',
'NAME_TYPE_SUITE_Other_A',
'NAME_TYPE_SUITE_Other_B',
'NAME_TYPE_SUITE_Spouse, partner',
'NAME_TYPE_SUITE_Unaccompanied',
'NAME_CLIENT_TYPE_Refreshed',
'NAME_CLIENT_TYPE_Repeater',
'NAME_CLIENT_TYPE_XNA',
'NAME_GOODS_CATEGORY_Animals',
'NAME_GOODS_CATEGORY_Audio/Video',
'NAME_GOODS_CATEGORY_Auto Accessories',
'NAME_GOODS_CATEGORY_Clothing and Accessories',
'NAME_GOODS_CATEGORY_Computers',
'NAME_GOODS_CATEGORY_Construction Materials',
'NAME_GOODS_CATEGORY_Consumer Electronics',
'NAME_GOODS_CATEGORY_Direct Sales',
'NAME_GOODS_CATEGORY_Education',
'NAME_GOODS_CATEGORY_Fitness',
'NAME_GOODS_CATEGORY_Furniture',
'NAME_GOODS_CATEGORY_Gardening',
'NAME_GOODS_CATEGORY_Homewares',
'NAME_GOODS_CATEGORY_Insurance',
'NAME_GOODS_CATEGORY_Jewelry',
'NAME_GOODS_CATEGORY_Medical Supplies',
'NAME_GOODS_CATEGORY_Medicine',
'NAME_GOODS_CATEGORY_Mobile',
'NAME_GOODS_CATEGORY_Office Appliances',
'NAME_GOODS_CATEGORY_Other',
'NAME_GOODS_CATEGORY_Photo / Cinema Equipment',
'NAME_GOODS_CATEGORY_Sport and Leisure',
'NAME_GOODS_CATEGORY_Tourism',
'NAME_GOODS_CATEGORY_Vehicles',
'NAME_GOODS_CATEGORY_Weapon',
'NAME_GOODS_CATEGORY_XNA',
'NAME_SELLER_INDUSTRY_Clothing',
'NAME_SELLER_INDUSTRY_Connectivity',
'NAME_SELLER_INDUSTRY_Construction',
'NAME_SELLER_INDUSTRY_Consumer electronics',
'NAME_SELLER_INDUSTRY_Furniture',
'NAME_SELLER_INDUSTRY_Industry',
'NAME_SELLER_INDUSTRY_Jewelry',
'NAME_SELLER_INDUSTRY_MLM partners',
'NAME_SELLER_INDUSTRY_Tourism',
'NAME_SELLER_INDUSTRY_XNA',
'NAME_INCOME_TYPE_Maternity leave',
'NAME_INCOME_TYPE_Pensioner',
'NAME_INCOME_TYPE_State servant',
'NAME_INCOME_TYPE_Student',
'NAME_INCOME_TYPE_Unemployed',
'NAME_INCOME_TYPE_Working',
'NAME_EDUCATION_TYPE_Higher education',
'NAME_EDUCATION_TYPE_Incomplete higher',
'NAME_EDUCATION_TYPE_Lower secondary',
'NAME_EDUCATION_TYPE_Secondary / secondary special',
'NAME_FAMILY_STATUS_Married',
'NAME_FAMILY_STATUS_Separated',
'NAME_FAMILY_STATUS_Single / not married',
'NAME_FAMILY_STATUS_Widow',
'NAME_HOUSING_TYPE_House / apartment',
'NAME_HOUSING_TYPE_Municipal apartment',
'NAME_HOUSING_TYPE_Office apartment',
'NAME_HOUSING_TYPE_Rented apartment',
'NAME_HOUSING_TYPE_With parents',
'NAME_YIELD_GROUP_high',
'NAME_YIELD_GROUP_low_action',
'NAME_YIELD_GROUP_low_normal',
'NAME_YIELD_GROUP_middle']

# Features (X) and target (y); 20% of the rows are held out for testing with a fixed seed
X = df[selected_features]
y = df['NAME_CONTRACT_STATUS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)
print('Lenght of our Training data:', X_train.shape, '\nLength of our Testing data:', y_test.shape)

In [None]:
df['NAME_CONTRACT_STATUS'].value_counts()

In [None]:
df.info()

In [None]:
y_train.value_counts()

In [None]:
y_train.isna().sum()


In [None]:
df.head(50)

In [None]:
from sklearn.tree import DecisionTreeClassifier 

# Fit a depth-limited decision tree. The seed is fixed so that the fitted tree and
# every score below are reproducible across runs.
model = DecisionTreeClassifier(max_depth=9, random_state=45)

model.fit(X_train,y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy)

# The labels are the strings '0'/'1', so the positive class is named explicitly
precision = precision_score(y_true=y_test, y_pred=y_pred,pos_label='1')
print("Precision Score: %f" % precision)

recall = recall_score(y_true=y_test, y_pred=y_pred, pos_label='1')
print("Recall Score: %f" % recall)

f1 = f1_score(y_true=y_test, y_pred=y_pred,pos_label='1')
print('F1 Score: %f' % f1)

# Predicted probability of the approved class ('1'). The column is looked up in
# model.classes_ rather than assumed to sit at position 1.
positive_column = list(model.classes_).index('1')
y_pred_proba = model.predict_proba(X_test)[:, positive_column]

# Compute auc score
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

print('AUC Score: %f' % auc)

In [None]:
# Render the fitted tree with graphviz.
# export_graphviz expects class_names in ascending class order, and the model's
# classes are the strings '0' (refused) and '1' (approved). The display names are
# therefore derived from model.classes_ instead of being hard-coded in a fixed order.
status_names = {'0': 'Refused', '1': 'Approved'}
dot_data = tree.export_graphviz(model, out_file=None, 
                     feature_names=list(X_train.columns),
                     class_names=[status_names[label] for label in model.classes_],
                     filled=True, rounded=True,  
                     special_characters=True)  
graph = graphviz.Source(dot_data)  
graph



In [None]:

# Rank the features by the importance the fitted model assigned to them
my_dict = dict(feature_importance=model.feature_importances_, feature=selected_features)
feature_imp = pd.DataFrame(my_dict)
feature_imp = feature_imp.sort_values('feature_importance', ascending=False)
feature_imp
# 


In [None]:
# Initialize a Random Forest model with default hyper-parameters.
# The seed is fixed so that the forest and every score below are reproducible.
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=45)

model.fit(X_train, y_train)

# Now lets evaluate our model
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy)

# The labels are the strings '0'/'1', so the positive class is named explicitly
precision = precision_score(y_true=y_test, y_pred=y_pred, pos_label='1')
print("Precision Score: %f" % precision)

recall = recall_score(y_true=y_test, y_pred=y_pred, pos_label='1')
print("Recall Score: %f" % recall)

f1 = f1_score(y_true=y_test, y_pred=y_pred, pos_label='1')
print('F1 Score: %f' % f1)

# Predicted probability of the approved class ('1'). The column is looked up in
# model.classes_ rather than assumed to sit at position 1.
positive_column = list(model.classes_).index('1')
y_pred_proba = model.predict_proba(X_test)[:, positive_column]
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print('AUC Score: %f' % auc)



In [None]:
df.isnull().sum()

In [None]:

params = {
    'n_estimators' : [5, 10, 50, 100],
    'criterion' : ['gini', 'entropy'],
    'max_depth': [5, 10, 20], 
    'min_samples_split': [2, 10, 100],
    'max_features': [2, 4, 'auto']
}

grid_search_cv = GridSearchCV( 
    estimator=RandomForestClassifier(), 
    param_grid=params,
    scoring='f1', )


# Now, with one easy command, fit all combination of trees. 
grid_search_cv.fit(X_train, y_train)


# Print the best parameters it found
print(grid_search_cv.best_params_)


# This command gives you model that has the highest f1-score. 
model = grid_search_cv.best_estimator_

# Now lets evaluate our model
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_true=y_test, y_pred=y_pred, pos_label='1')
print("Precision Score: %f" % precision)

recall = recall_score(y_true=y_test, y_pred=y_pred, pos_label='1')
print("Recall Score: %f" % recall)

f1 = f1_score(y_true=y_test, y_pred=y_pred, pos_label='1')
print('F1 Score: %f' % f1)

# Calculate predicted probabilities, keep only probability for when class = 1
y_pred_proba = model.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print('AUC Score: %f' % auc)



In [None]:
# predicting the loan amount range
# will have delete the refused data, and delete the application_result column
# and treat loan_amount as our new y data
df.head(30)

In [None]:
# drop refused data
df.drop(df[df['NAME_CONTRACT_STATUS']=='Refused'].index, inplace = True)

In [None]:
# delete NAME_CONTRACT_STATUS column
df = df.drop('NAME_CONTRACT_STATUS', 1)

In [None]:
pd.set_option('display.max_columns', None)
df.head(100)

In [None]:
# AMT_CREDIT is removed from df by an earlier drop cell, so the lookup is guarded:
# the cell shows the value counts when the column exists and a short notice otherwise,
# instead of stopping a full run with a KeyError.
df['AMT_CREDIT'].value_counts() if 'AMT_CREDIT' in df.columns else 'AMT_CREDIT is not present in df'

In [None]:

# Rank the features by the importance assigned by the most recently fitted model
my_dict = dict(feature_importance=model.feature_importances_, feature=selected_features)
feature_imp = pd.DataFrame(my_dict)
feature_imp = feature_imp.sort_values('feature_importance', ascending=False)
feature_imp

In [None]:
import scipy.stats as stats

original_cols = df.columns

target_cols = ['AMT_ANNUITY',
'AMT_APPLICATION',
'AMT_GOODS_PRICE',
'HOUR_APPR_PROCESS_START',
'NFLAG_LAST_APPL_IN_DAY',
'DAYS_DECISION',
'NAME_CONTRACT_TYPE_Consumer loans',
'NAME_CONTRACT_TYPE_Revolving loans']

z_score_cols = []

# Loop through our target columns
for col in target_cols:
    # Make the new column name the same as the original but with 'z_score' added to it
    new_col_name = col + "_zscore"
    
    # Set the new column equal to the score
    df[new_col_name] = stats.stats.zscore( df[col] )
    
    # Set the z-score to its absolute value of the for easier filtering
    df[new_col_name] = abs( df[new_col_name] )
    
    # Append the new column name our our z_score_cols list for easier access for later.
    z_score_cols.append(new_col_name)


condition = df[z_score_cols] < 3
print(df.shape)

# # Say TRUE only if all of the rows are True, else return False
condition = condition.all(axis=1)

print('Before removal of outliers', df.shape)

df = df[condition]

print('After removal of outliers', df.shape)



In [None]:
df = df[original_cols]

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier with default settings and score it on the test split
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy)

# Precision, recall and F1 all treat the string label '1' as the positive class
precision, recall, f1 = (
    metric(y_true=y_test, y_pred=y_pred, pos_label='1')
    for metric in (precision_score, recall_score, f1_score)
)
print("Precision Score: %f" % precision)
print("Recall Score: %f" % recall)
print('F1 Score: %f' % f1)

# Second column of predict_proba is used as the score for the AUC
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print('AUC Score: %f' % auc)