# 1.0 Common commands

## 1.1 Mount drive

In [None]:
#Mount Google Drive at /content/drive; the data files used by this notebook are read from paths under this mount point
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1.2 Install packages

In [None]:
#Install packages
!pip install pycaret
!pip install --upgrade pycaret
!pip install pyod
!pip install --upgrade pyod

Collecting pycaret
  Downloading pycaret-2.3.6-py3-none-any.whl (301 kB)
[K     |████████████████████████████████| 301 kB 10.6 MB/s 
[?25hCollecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 42.2 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting mlflow
  Downloading mlflow-1.23.1-py3-none-any.whl (15.6 MB)
[K     |████████████████████████████████| 15.6 MB 603 kB/s 
[?25hCollecting kmodes>=0.10.1
  Downloading kmodes-0.11.1-py2.py3-none-any.whl (19 kB)
Collecting yellowbrick>=1.0.1
  Downloading yellowbrick-1.4-py3-none-any.whl (274 kB)
[K     |████████████████████████████████| 274 kB 23.5 MB/s 
Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Collecting mlxtend>=0.17.0
  Downloading mlxtend-0.19.0-py2.p



## 1.3 Import libraries

In [None]:
#Import libraries
import pandas as pd

import numpy as np

from scipy.stats import uniform
from pycaret.classification import *

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

from pyod.models.cblof import CBLOF

import pickle

  defaults = yaml.load(f)


## 1.4 Import data

In [None]:
#Function named dataframe_optimizer is defined. This will reduce space consumption by dataframes.
#Credit - https://www.kaggle.com/rinnqd/reduce-memory-usage and 
#https://www.analyticsvidhya.com/blog/2021/04/how-to-reduce-memory-usage-in-python-pandas/
def dataframe_optimizer(df, allow_float16=True, verbose=False):
  '''Downcast numeric columns to the smallest dtype that can hold their values.

  Signed-integer columns are narrowed to int8/int16/int32 and floating-point
  columns to float16/float32 whenever the observed column minimum and maximum
  fit (bounds inclusive) in the candidate dtype. All other columns (object,
  bool, unsigned, datetime, categorical, pandas extension dtypes) are left
  untouched, and a column is never converted to a wider dtype.

  The range check only protects against overflow. Narrower float dtypes also
  keep fewer significant digits, so stored values are rounded; pass
  allow_float16=False to keep at least float32 precision.

  Parameters
  ----------
  df : pandas.DataFrame
      Dataframe to optimise. It is modified in place.
  allow_float16 : bool, default True
      Whether float16 may be chosen as the target dtype of a float column.
  verbose : bool, default False
      If True, print the memory usage in MB before and after downcasting.

  Returns
  -------
  pandas.DataFrame
      The same dataframe object that was passed in.
  '''
  int_candidates = (np.int8, np.int16, np.int32)
  float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

  start_mem = np.round(df.memory_usage().sum()/1024**2, 2)
  for col in df.columns:
    col_type = df[col].dtype
    #Extension dtypes (category, nullable ints, strings, ...) are not numpy dtypes; skip them
    if not isinstance(col_type, np.dtype):
      continue
    if col_type.kind == 'i':
      candidates, limits = int_candidates, np.iinfo
    elif col_type.kind == 'f':
      candidates, limits = float_candidates, np.finfo
    else:
      #bool, unsigned, datetime, object, ... are not downcast
      continue

    #min()/max() skip NaN; for an empty or all-NaN column they are NaN, every
    #comparison below is False and the column keeps its dtype
    c_min = df[col].min()
    c_max = df[col].max()
    for candidate in candidates:
      #Candidates are ordered by size, so stop once they are no longer narrower
      if np.dtype(candidate).itemsize >= col_type.itemsize:
        break
      bounds = limits(candidate)
      if c_min >= bounds.min and c_max <= bounds.max:
        df[col] = df[col].astype(candidate)
        break
  end_mem = np.round(df.memory_usage().sum()/1024**2, 2)

  if verbose:
    print(f'Memory usage reduced from {start_mem} MB to {end_mem} MB')
  return df

All the relevant files can be accessed through the following link: https://drive.google.com/drive/folders/1evFZRwFWh4zkR9CiT46llB9PlaXFLfLA?usp=sharing

In [None]:
#Read application_train, application_test, bureau and previous_application (in that order)
#from the Drive data folder, passing each freshly read table through dataframe_optimizer
application_train, application_test, bureau, previous_application = (
    dataframe_optimizer(pd.read_csv(csv_path))
    for csv_path in (
        '/content/drive/MyDrive/AI_ML_Project/Data/application_train.csv',
        '/content/drive/MyDrive/AI_ML_Project/Data/application_test.csv',
        '/content/drive/MyDrive/AI_ML_Project/Data/bureau.csv',
        '/content/drive/MyDrive/AI_ML_Project/Data/previous_application.csv',
    )
)

In [None]:
#Print the shape of imported data, one (rows, columns) tuple per line in load order
print(
    *(table.shape for table in (application_train, application_test, bureau, previous_application)),
    sep='\n',
)

(307511, 122)
(48744, 121)
(1716428, 17)
(1670214, 37)


# 2.0 Feature Engineering and data merger

## 2.1 Create 3 new ratios from existing columns of application_train and application_test

In [None]:
#Add three ratio columns to application_train:
#  DEBT_INCOME_RATIO = AMT_ANNUITY / AMT_INCOME_TOTAL
#  LOAN_VALUE_RATIO  = AMT_CREDIT  / AMT_GOODS_PRICE
#  LOAN_INCOME_RATIO = AMT_CREDIT  / AMT_INCOME_TOTAL
application_train = application_train.assign(
    DEBT_INCOME_RATIO=lambda frame: frame['AMT_ANNUITY']/frame['AMT_INCOME_TOTAL'],
    LOAN_VALUE_RATIO=lambda frame: frame['AMT_CREDIT']/frame['AMT_GOODS_PRICE'],
    LOAN_INCOME_RATIO=lambda frame: frame['AMT_CREDIT']/frame['AMT_INCOME_TOTAL'],
)

## 2.2 Merge bureau with application_train

In [None]:
#Create a dataframe with the numerical (every non-object dtype) columns of bureau
bureau_numerical = bureau.select_dtypes(exclude=object)
#Create a dataframe with the categorical (object dtype) columns of bureau
bureau_categorical = bureau.select_dtypes(include=object)

In [None]:
#Numerical bureau features: per-client (SK_ID_CURR) median of every numeric column,
#left-joined onto application_train; clashing column names receive the _BUREAU suffix
bureau_numerical_merge = (
    bureau_numerical
    .groupby(by=['SK_ID_CURR'])
    .median()
    .reset_index()
)
application_train_bureau = pd.merge(
    application_train, bureau_numerical_merge,
    on='SK_ID_CURR', how='left', suffixes=('', '_BUREAU'),
)

#Categorical bureau features: per-client most frequent value of every object column
#('' when a client has no non-missing value), joined the same way
bureau_categorical = bureau_categorical.assign(SK_ID_CURR=bureau['SK_ID_CURR'])
bureau_categorical_merge = (
    bureau_categorical
    .groupby(by=['SK_ID_CURR'])
    .agg(lambda x: next(iter(x.value_counts().index), ''))
    .reset_index()
)
application_train_bureau = pd.merge(
    application_train_bureau, bureau_categorical_merge,
    on='SK_ID_CURR', how='left', suffixes=('', '_BUREAU'),
)

#SK_ID_BUREAU came in with the numerical aggregate; remove it from the merged frame
application_train_bureau = application_train_bureau.drop(columns=['SK_ID_BUREAU'])

#Report the shape of application and bureau data combined
print('The shape of application_train and bureau data merged: ', application_train_bureau.shape)

The shape of application_train and bureau data merged:  (307511, 140)


In [None]:
#Save the dataframes into CSV files for future use.
#They are written to the Drive data folder because that is where the prediction pipeline
#reads them back from; a bare file name would be written to the current working directory instead.
bureau_numerical_merge.to_csv('/content/drive/MyDrive/AI_ML_Project/Data/bureau_numerical_merge.csv', index = False)
bureau_categorical_merge.to_csv('/content/drive/MyDrive/AI_ML_Project/Data/bureau_categorical_merge.csv', index = False)

## 2.3 Merge previous_application with application_train_bureau

In [None]:
#Create a dataframe with the numerical (every non-object dtype) columns of previous_application
previous_application_numerical = previous_application.select_dtypes(exclude=object)
#Create a dataframe with the categorical (object dtype) columns of previous_application
previous_application_categorical = previous_application.select_dtypes(include=object)

In [None]:
#Merge numerical features from previous_application to application_train_bureau
#(per-client mean of every numeric column; clashing column names get the _PREVIOUS suffix)
previous_numerical_merge = previous_application_numerical.groupby(by=['SK_ID_CURR']).mean().reset_index()
application_train_bureau_previous = application_train_bureau.merge(previous_numerical_merge, on='SK_ID_CURR', how='left', suffixes=('', '_PREVIOUS'))

#Merge categorical features from previous_application to application_train_bureau
#(per-client most frequent value of every object column, '' when there is none).
#The client id has to come from previous_application itself: taking it from bureau would pair
#each previous_application row with the id stored at the same row position of an unrelated table.
previous_application_categorical['SK_ID_CURR'] = previous_application['SK_ID_CURR']
previous_categorical_merge = previous_application_categorical.groupby(by=['SK_ID_CURR']).agg(lambda x:x.value_counts().index[0] if len(x.value_counts()) != 0 else '').reset_index()
application_train_bureau_previous = application_train_bureau_previous.merge(previous_categorical_merge, on='SK_ID_CURR', how='left', suffixes=('', '_PREVIOUS'))

#Drop SK_ID_PREV
application_train_bureau_previous = application_train_bureau_previous.drop(columns = ['SK_ID_PREV'])

#Shape of application_train_bureau and previous_application data combined
print('The shape of application_train_bureau and previous_application data merged: ', application_train_bureau_previous.shape)

The shape of application_train_bureau and previous_application data merged:  (307511, 175)


In [None]:
#Save the dataframes into CSV files for future use.
#They are written to the Drive data folder because that is where the prediction pipeline
#reads them back from; a bare file name would be written to the current working directory instead.
previous_numerical_merge.to_csv('/content/drive/MyDrive/AI_ML_Project/Data/previous_numerical_merge.csv', index = False)
previous_categorical_merge.to_csv('/content/drive/MyDrive/AI_ML_Project/Data/previous_categorical_merge.csv', index = False)

## 2.4 Prepare train data and save it and its column names

In [None]:
#Final train data ready for preprocessing: the merged frame without the SK_ID_CURR merge key
train_data = application_train_bureau_previous.drop(columns=['SK_ID_CURR'])

In [None]:
#Save the dataframe into a CSV file for future use.
#It is written to the Drive data folder because the training sections read train_data.csv
#from there; a bare file name would be written to the current working directory instead.
train_data.to_csv('/content/drive/MyDrive/AI_ML_Project/Data/train_data.csv', index = False)

# 3.0 Training and Pipeline using Pycaret

In [None]:
#Save the list of columns in train_data. The prediction pipeline loads columns_train_data.pkl
#from the Drive data folder to check that a query provides every training column.
with open('/content/drive/MyDrive/AI_ML_Project/Data/columns_train_data.pkl', 'wb') as file:
  pickle.dump(list(train_data.columns), file)

#Keep writing the raw query (application_test) column list as well, as this cell did before.
#The with-block closes the file even if pickling raises.
with open('columns_query_data.pkl', 'wb') as file:
  pickle.dump(list(application_test.columns), file)

## 3.1 Fetch data

In [None]:
#Read the saved train_data CSV from the Drive data folder and pass it through dataframe_optimizer
train_data_full = dataframe_optimizer(pd.read_csv('/content/drive/MyDrive/AI_ML_Project/Data/train_data.csv'))

## 3.2 Train model using 100% data

In [None]:
#Column names by dtype: every non-object column except the TARGET label is numerical,
#every object column is categorical
columns_numerical = train_data_full.select_dtypes(exclude=object).columns.tolist()
columns_numerical.remove('TARGET')
columns_categorical = train_data_full.select_dtypes(include=object).columns.tolist()

In [None]:
#Set up data for feeding to model
#session_id fixes the random seed of the pycaret experiment so that repeated runs give the same results
data = setup(data=train_data_full, target="TARGET", categorical_features=columns_categorical, numeric_features=columns_numerical, train_size=0.9,
             numeric_imputation='median', normalize=True, remove_outliers=True, data_split_stratify=True, feature_selection=True, feature_selection_threshold=0.35,
             session_id=42)

Unnamed: 0,Description,Value
0,session_id,202
1,Target,TARGET
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(307511, 174)"
5,Missing Values,True
6,Numeric Features,138
7,Categorical Features,35
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
#Train a LightGBM classifier ('lightgbm' is the pycaret estimator id) on the data prepared by setup()
model = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9191,0.7646,0.0225,0.5333,0.0432,0.0369,0.0971
1,0.9187,0.7574,0.0239,0.4722,0.0455,0.038,0.092
2,0.9189,0.7671,0.0206,0.5057,0.0397,0.0335,0.0896
3,0.9188,0.7645,0.023,0.4804,0.0439,0.0367,0.0913
4,0.9195,0.7615,0.0248,0.5889,0.0477,0.0414,0.109
5,0.9197,0.7699,0.0234,0.641,0.0452,0.0397,0.1119
6,0.9193,0.7699,0.0239,0.5667,0.0459,0.0396,0.1042
7,0.9199,0.7694,0.0272,0.6444,0.0522,0.0459,0.1209
8,0.919,0.7729,0.022,0.5165,0.0423,0.0359,0.094
9,0.9192,0.7617,0.03,0.5378,0.0568,0.0487,0.1128


In [None]:
#Tune the hyperparameters of the trained model with tune_model's default settings
tuned_model = tune_model(model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9194,0.7658,0.037,0.5524,0.0694,0.0598,0.1277
1,0.9193,0.7563,0.0361,0.5347,0.0677,0.058,0.1233
2,0.9188,0.7671,0.0361,0.4873,0.0672,0.0567,0.1157
3,0.9189,0.7627,0.0347,0.5034,0.0649,0.0551,0.116
4,0.9193,0.7557,0.038,0.54,0.071,0.0609,0.1273
5,0.9194,0.7697,0.0361,0.554,0.0678,0.0584,0.1263
6,0.9187,0.7718,0.0323,0.4825,0.0606,0.051,0.1087
7,0.9195,0.7679,0.0403,0.5513,0.0751,0.0648,0.133
8,0.9188,0.7666,0.0398,0.4913,0.0737,0.0623,0.1223
9,0.9195,0.7601,0.045,0.5486,0.0832,0.0718,0.1401


In [None]:
#Save best model and store it in Google Drive for future use.
#The model name is given as a path inside the Drive data folder, which is where load_model reads it from;
#a bare name would be saved to the current working directory instead.
save_model(tuned_model, "/content/drive/MyDrive/AI_ML_Project/Data/model")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=['NAME_CONTRACT_TYPE',
                                                             'CODE_GENDER',
                                                             'FLAG_OWN_CAR',
                                                             'FLAG_OWN_REALTY',
                                                             'NAME_TYPE_SUITE',
                                                             'NAME_INCOME_TYPE',
                                                             'NAME_EDUCATION_TYPE',
                                                             'NAME_FAMILY_STATUS',
                                                             'NAME_HOUSING_TYPE',
                                                             'OCCUPATION_TYPE',
                                                             'WEEKDAY_APPR_PROCESS_START',
                                                    

Remarks:
This model could not be deployed successfully on Heroku. The model size is 172 MB. We shall try training with 50% and 25% of the total data points in case the size of the model is the issue.

## 3.3 Train model using 25% data

In [None]:
#Using train test split to extract 25% of train data with stratification.
#The whole frame (TARGET column included) is passed as the first array, so train_data keeps TARGET.
#X_test, y_train and y_test are also returned by the call but are not used by the pycaret cells that follow.
train_data, X_test, y_train, y_test = train_test_split( train_data_full, train_data_full['TARGET'], 
                                                       test_size=0.75, random_state=42, stratify=train_data_full['TARGET'])

In [None]:
#Column names by dtype: every non-object column except the TARGET label is numerical,
#every object column is categorical
columns_numerical = train_data.select_dtypes(exclude=object).columns.tolist()
columns_numerical.remove('TARGET')
columns_categorical = train_data.select_dtypes(include=object).columns.tolist()

In [None]:
#Set up data for feeding to model
#session_id fixes the random seed of the pycaret experiment so that repeated runs give the same results
data = setup(data=train_data, target="TARGET", categorical_features=columns_categorical, numeric_features=columns_numerical, train_size=0.9,
             numeric_imputation='median', normalize=True, remove_outliers=True, data_split_stratify=True, feature_selection=True, feature_selection_threshold=0.35,
             session_id=42)

Unnamed: 0,Description,Value
0,session_id,771
1,Target,TARGET
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(76877, 174)"
5,Missing Values,True
6,Numeric Features,138
7,Categorical Features,35
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
#Train a LightGBM classifier ('lightgbm' is the pycaret estimator id) on the 25% sample prepared by setup()
model = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9192,0.7522,0.0264,0.5,0.0501,0.0423,0.1006
1,0.9185,0.747,0.0132,0.3684,0.0255,0.02,0.0568
2,0.9178,0.7449,0.0132,0.3043,0.0253,0.0187,0.0486
3,0.9197,0.7539,0.0188,0.5882,0.0365,0.0316,0.0948
4,0.9191,0.7524,0.0245,0.4815,0.0466,0.0391,0.0944
5,0.9189,0.7572,0.0263,0.4828,0.0499,0.0419,0.0981
6,0.9171,0.7357,0.0188,0.303,0.0354,0.0262,0.0578
7,0.918,0.7488,0.0169,0.36,0.0323,0.0252,0.0632
8,0.9195,0.7581,0.0263,0.56,0.0503,0.0433,0.1085
9,0.9194,0.7335,0.0301,0.5161,0.0569,0.0485,0.11


In [None]:
#Tune the hyperparameters of the trained model with tune_model's default settings
tuned_model = tune_model(model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9197,0.7564,0.0132,0.6364,0.0258,0.0226,0.0835
1,0.9191,0.7562,0.0113,0.4615,0.0221,0.0183,0.0622
2,0.9198,0.7587,0.0245,0.5909,0.047,0.0409,0.1085
3,0.9189,0.7573,0.0188,0.4545,0.0362,0.0299,0.0795
4,0.9198,0.7504,0.0207,0.6111,0.0401,0.035,0.102
5,0.9198,0.765,0.0226,0.6316,0.0436,0.0382,0.1087
6,0.9188,0.7444,0.015,0.4444,0.0291,0.0239,0.0698
7,0.9195,0.7572,0.0132,0.6364,0.0258,0.0226,0.0834
8,0.9191,0.7617,0.0113,0.5,0.0221,0.0186,0.0657
9,0.92,0.741,0.0264,0.6087,0.0505,0.0441,0.1148


In [None]:
#Save best model and store it in Google Drive for future use.
#The model name is given as a path inside the Drive data folder, which is where load_model reads it from;
#a bare name would be saved to the current working directory instead.
#An earlier model saved under the same name is overwritten.
save_model(tuned_model, "/content/drive/MyDrive/AI_ML_Project/Data/model")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=['NAME_CONTRACT_TYPE',
                                                             'CODE_GENDER',
                                                             'FLAG_OWN_CAR',
                                                             'FLAG_OWN_REALTY',
                                                             'NAME_TYPE_SUITE',
                                                             'NAME_INCOME_TYPE',
                                                             'NAME_EDUCATION_TYPE',
                                                             'NAME_FAMILY_STATUS',
                                                             'NAME_HOUSING_TYPE',
                                                             'OCCUPATION_TYPE',
                                                             'WEEKDAY_APPR_PROCESS_START',
                                                    

## 3.4 Create Pipeline and predict

In [None]:
#Import saved data and pickle files
bureau_numerical_merge = dataframe_optimizer(pd.read_csv('/content/drive/MyDrive/AI_ML_Project/Data/bureau_numerical_merge.csv'))
bureau_categorical_merge = dataframe_optimizer(pd.read_csv('/content/drive/MyDrive/AI_ML_Project/Data/bureau_categorical_merge.csv'))
previous_numerical_merge = dataframe_optimizer(pd.read_csv('/content/drive/MyDrive/AI_ML_Project/Data/previous_numerical_merge.csv'))
previous_categorical_merge = dataframe_optimizer(pd.read_csv('/content/drive/MyDrive/AI_ML_Project/Data/previous_categorical_merge.csv'))

#List of train_data column names; the with-block closes the file even if unpickling raises.
#NOTE: pickle.load can execute arbitrary code, so only load pickle files created by this project.
with open('/content/drive/MyDrive/AI_ML_Project/Data/columns_train_data.pkl', 'rb') as columns_file:
  columns = pickle.load(columns_file)

#Saved pycaret pipeline + model
tuned_model = load_model('/content/drive/MyDrive/AI_ML_Project/Data/model')

Transformation Pipeline and Model Successfully Loaded


In [None]:
#Read query data point(s): here the whole application_test file, passed through dataframe_optimizer
query = dataframe_optimizer(pd.read_csv('/content/drive/MyDrive/AI_ML_Project/Data/application_test.csv'))

In [None]:
#Define a function to create a pipeline for prediction
def inference(query):
  '''Build the model features for raw application rows and return the model's predictions.

  The steps mirror the training-data preparation: add the three ratio columns,
  left-join the saved bureau and previous_application aggregates on SK_ID_CURR,
  drop the identifier columns and pass the result to the loaded pycaret model.

  Relies on names defined at notebook level: bureau_numerical_merge,
  bureau_categorical_merge, previous_numerical_merge,
  previous_categorical_merge, columns, tuned_model and pycaret's predict_model.

  Parameters
  ----------
  query : pandas.DataFrame
      Application rows. Must contain SK_ID_CURR, AMT_ANNUITY, AMT_INCOME_TOTAL,
      AMT_CREDIT and AMT_GOODS_PRICE. The frame is not modified; a copy is
      used internally.

  Returns
  -------
  pandas.DataFrame or None
      The output of predict_model for the engineered rows, or None (after
      printing a message) when a column listed in `columns`, other than
      TARGET, is absent from the engineered frame.
  '''
  #Work on a copy so that the caller's dataframe does not gain the ratio columns as a side effect
  query = query.copy()

  #Add column titled DEBT_INCOME_RATIO to query
  query['DEBT_INCOME_RATIO'] = query['AMT_ANNUITY']/query['AMT_INCOME_TOTAL']

  #Add column titled LOAN_VALUE_RATIO to query
  query['LOAN_VALUE_RATIO'] = query['AMT_CREDIT']/query['AMT_GOODS_PRICE']

  #Add column titled LOAN_INCOME_RATIO to query
  query['LOAN_INCOME_RATIO'] = query['AMT_CREDIT']/query['AMT_INCOME_TOTAL']

  #Merge numerical features from bureau to query
  query_bureau = query.merge(bureau_numerical_merge, on='SK_ID_CURR', how='left', suffixes=('', '_BUREAU'))

  #Merge categorical features from bureau to query
  query_bureau = query_bureau.merge(bureau_categorical_merge, on='SK_ID_CURR', how='left', suffixes=('', '_BUREAU'))

  #Drop SK_ID_BUREAU
  query_bureau = query_bureau.drop(columns = ['SK_ID_BUREAU'])

  #Shape of query and bureau data combined
  print('The shape of query and bureau data merged: ', query_bureau.shape)

  #Merge numerical features from previous_application to query_bureau
  query_bureau_previous = query_bureau.merge(previous_numerical_merge, on='SK_ID_CURR', how='left', suffixes=('', '_PREVIOUS'))

  #Merge categorical features from previous_application to query_bureau
  query_bureau_previous = query_bureau_previous.merge(previous_categorical_merge, on='SK_ID_CURR', how='left', suffixes=('', '_PREVIOUS'))

  #Drop SK_ID_PREV
  query_bureau_previous = query_bureau_previous.drop(columns = ['SK_ID_PREV'])

  #Shape of query_bureau and previous_application data combined (SK_ID_CURR is still included here)
  print('The shape of query_bureau and previous_application data merged: ', query_bureau_previous.shape)

  #Drop SK_ID_CURR
  query_bureau_previous = query_bureau_previous.drop(columns = ['SK_ID_CURR'])

  #Every training column except the TARGET label must be present before predicting
  missing_columns = set(columns) - {'TARGET'} - set(query_bureau_previous.columns)
  if missing_columns:
    print("Please enter values for all columns")
    print("Missing columns: ", sorted(missing_columns))
    return None

  predictions = predict_model(tuned_model, query_bureau_previous)
  return predictions

In [None]:
#Show predictions: run the pipeline on the query rows; the bare name on the last line displays the returned frame
query_prediction = inference(query)
query_prediction

The shape of query and bureau data merged:  (48744, 139)
The shape of query_bureau and previous_application data merged:  (48744, 174)


Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19
,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DEBT_INCOME_RATIO,LOAN_VALUE_RATIO,LOAN_INCOME_RATIO,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,DAYS_CREDIT_UPDATE,AMT_ANNUITY_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,CREDIT_TYPE,AMT_ANNUITY_PREVIOUS,AMT_APPLICATION,AMT_CREDIT_PREVIOUS,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE_PREVIOUS,HOUR_APPR_PROCESS_START_PREVIOUS,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,DAYS_DECISION,SELLERPLACE_AREA,CNT_PAYMENT,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,NAME_CONTRACT_TYPE_PREVIOUS,WEEKDAY_APPR_PROCESS_START_PREVIOUS,FLAG_LAST_APPL_PER_CONTRACT,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE_PREVIOUS,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,NAME_SELLER_INDUSTRY,NAME_YIELD_GROUP,PRODUCT_COMBINATION,Label,Score
0,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.018845,-19241,-2329,-5168.0,-812,,1,1,0,1,0,1,,2.0,2,2,TUESDAY,18,0,0,0,0,0,0,Kindergarten,0.752441,0.789551,0.159546,0.065979,0.058990,0.973145,,,,0.137939,0.125000,,,,0.050507,,,0.067200,0.061188,0.973145,,,,0.137939,0.125000,,,,0.052612,,,0.066589,0.058990,0.973145,,,,0.137939,0.125000,,,,0.051392,,,,block of flats,0.039215,"Stone, brick",No,0.0,0.0,0.0,0.0,-1740.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.152300,1.2640,4.213333,-857.0,0.0,-179.0,-715.0,,0.0,168345.000000,0.000,0.0,0.0,-155.0,0.0,Closed,currency 1,Consumer credit,3951.000000,24835.500000,23787.000000,2520.00,24835.500000,13.000000,1.0,0.104309,,,-1740.0,23.000000,8.000000,365243.000000,-1709.000000,-1499.000000,-1619.000000,-1612.000000,0.000000,Revolving loans,SUNDAY,Y,XAP,Approved,XNA,XAP,Family,Repeater,XNA,Cards,XNA,Credit and cash offices,XNA,XNA,Card Street,0,0.9752
1,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.035797,-18064,-4469,-9120.0,-1623,,1,1,0,1,0,0,Low-skill Laborers,2.0,2,2,FRIDAY,9,0,0,0,0,0,0,Self-employed,0.564941,0.291748,0.432861,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,0.175455,1.2376,2.250182,-137.0,0.0,122.0,-123.0,0.00,0.0,58500.000000,25321.500,0.0,0.0,-31.0,0.0,Active,currency 1,Consumer credit,4813.200195,22308.750000,20076.750000,4464.00,44617.500000,10.500000,1.0,0.108948,,,-536.0,18.000000,12.000000,365243.000000,-706.000000,-376.000000,-466.000000,-460.000000,0.000000,Cash loans,SATURDAY,Y,XAP,Refused,Cash through the bank,XAP,Unaccompanied,Repeater,XNA,Cash,XNA,Country-wide,Connectivity,middle,POS mobile with interest,0,0.9138
2,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,,Working,Higher education,Married,House / apartment,0.019104,-20038,-4458,-2176.0,-3503,5.0,1,1,0,1,0,0,Drivers,2.0,2,2,MONDAY,14,0,0,0,0,0,0,Transport: type 3,,0.699707,0.610840,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-856.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0,0.344578,1.0528,3.275378,-1835.0,0.0,-999.0,-1168.0,19305.00,0.0,391770.000000,0.000,,0.0,-882.0,0.0,Closed,currency 1,Car loan,11478.195312,130871.250000,146134.125000,3375.00,174495.000000,14.500000,1.0,0.067200,,,-837.5,82.000000,17.328125,365243.000000,-1005.666687,-515.666687,-715.666687,-710.333313,0.333252,Cash loans,THURSDAY,Y,XNA,Canceled,XNA,XAP,Unaccompanied,Repeater,XNA,XNA,XNA,Credit and cash offices,XNA,XNA,Cash,0,0.9548
3,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.026398,-13976,-1866,-2000.0,-4208,,1,1,0,1,1,0,Sales staff,4.0,2,2,WEDNESDAY,11,0,0,0,0,0,0,Business Entity Type 3,0.525879,0.509766,0.612793,0.305176,0.197388,0.997070,0.958984,0.116516,0.320068,0.275879,0.375000,0.041687,0.204224,0.240356,0.367188,0.038605,0.080017,0.310791,0.204956,0.997070,0.960938,0.117615,0.322266,0.275879,0.375000,0.041687,0.208862,0.262695,0.382812,0.03891,0.084717,0.308105,0.197388,0.997070,0.959473,0.11731,0.320068,0.275879,0.375000,0.041687,0.207764,0.244629,0.373779,0.038788,0.081726,reg oper account,block of flats,0.370117,Panel,No,0.0,0.0,0.0,0.0,-1805.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,0.155614,1.0000,5.000000,-1612.0,0.0,-896.5,-1375.0,0.00,0.0,129614.039062,0.000,0.0,0.0,-683.5,0.0,Closed,currency 1,Consumer credit,8091.584961,49207.500000,92920.500000,3750.00,82012.500000,10.796875,1.0,0.057709,,,-1124.0,1409.599976,11.335938,243054.328125,-1271.000000,121221.335938,121171.335938,121182.664062,0.000000,Consumer loans,SUNDAY,Y,XAP,Approved,Cash through the bank,XAP,Unaccompanied,Repeater,Computers,POS,XNA,Country-wide,Consumer electronics,middle,POS household with interest,0,0.9666
4,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.010033,-13040,-2191,-4000.0,-4262,16.0,1,1,1,1,0,0,,3.0,2,2,FRIDAY,5,0,0,0,0,1,1,Business Entity Type 3,0.202148,0.425781,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-821.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,0.178150,1.0000,3.475000,,,,,,,,,,,,,,,,17782.156250,267727.500000,300550.500000,8095.50,267727.500000,5.500000,1.0,0.087524,,,-466.0,13.000000,24.000000,365243.000000,-787.000000,-457.000000,-457.000000,-449.000000,0.000000,,,,,,,,,,,,,,,,,0,0.8171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,Cash loans,F,N,Y,0,121500.0,412560.0,17473.5,270000.0,Unaccompanied,Working,Secondary / secondary special,Widow,House / apartment,0.002043,-19970,-5169,-9096.0,-3399,,1,1,1,1,1,0,,1.0,3,3,WEDNESDAY,16,0,0,0,0,0,0,Other,,0.648438,0.643066,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,-684.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.143815,1.5280,3.395555,-601.0,0.0,-98.0,-603.0,11427.75,0.0,145867.500000,0.000,0.0,0.0,-99.0,0.0,Closed,currency 1,Consumer credit,14222.429688,225000.000000,254700.000000,,225000.000000,14.000000,1.0,,,,-683.0,-1.000000,24.000000,365243.000000,-653.000000,37.000000,-593.000000,-591.000000,1.000000,Cash loans,SATURDAY,Y,XNA,Approved,Cash through the bank,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Credit and cash offices,XNA,middle,Cash X-Sell: middle,0,0.9423
48740,Cash loans,F,N,N,2,157500.0,622413.0,31909.5,495000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.035797,-11186,-1149,-3016.0,-3003,,1,1,0,1,0,0,Sales staff,4.0,2,2,MONDAY,11,0,0,0,0,1,1,Trade: type 7,,0.684570,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,0.202600,1.2574,3.951828,,,,,,,,,,,,,,,,6968.891113,86871.375000,98704.125000,1200.00,86871.375000,12.250000,1.0,0.042999,,,-1552.0,99.000000,17.500000,365243.000000,-1519.750000,-1024.750000,-1024.750000,-1019.500000,0.500000,,,,,,,,,,,,,,,,,0,0.8857
48741,Cash loans,F,Y,Y,1,202500.0,315000.0,33205.5,315000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.026398,-15922,-3037,-2680.0,-1504,4.0,1,1,0,1,1,0,,3.0,2,2,WEDNESDAY,12,0,0,0,0,0,0,Business Entity Type 3,0.733398,0.632812,0.283691,0.111328,0.136353,0.995605,,,0.160034,0.137939,0.333252,,,,0.138306,,0.054199,0.113403,0.141479,0.995605,,,0.161133,0.137939,0.333252,,,,0.144043,,0.057404,0.112427,0.136353,0.995605,,,0.160034,0.137939,0.333252,,,,0.140747,,0.055389,,block of flats,0.166260,"Stone, brick",No,0.0,0.0,0.0,0.0,-838.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0,0.163978,1.0000,1.555556,-349.0,0.0,-407.5,-406.0,0.00,0.0,54000.000000,0.000,0.0,0.0,-159.0,0.0,Closed,currency 1,Consumer credit,14201.078125,141060.078125,132516.828125,8543.25,141060.078125,20.000000,1.0,0.054474,,,-461.0,146.000000,11.000000,365243.000000,-423.500000,-123.500000,182293.500000,182307.500000,0.000000,Consumer loans,SATURDAY,Y,XAP,Approved,Cash through the bank,XAP,Family,New,Consumer Electronics,POS,XNA,Country-wide,Consumer electronics,middle,POS household with interest,0,0.9841
48742,Cash loans,M,N,N,0,225000.0,450000.0,25128.0,450000.0,Family,Commercial associate,Higher education,Married,House / apartment,0.018845,-13968,-2731,-1461.0,-1364,,1,1,1,1,1,0,Managers,2.0,2,2,MONDAY,10,0,1,1,0,1,1,Self-employed,0.373047,0.445801,0.595215,0.162842,0.072327,0.989746,,,0.160034,0.068970,0.625000,,,,0.156250,,0.149048,0.166016,0.075012,0.989746,,,0.161133,0.068970,0.625000,,,,0.120422,,0.157715,0.164551,0.072327,0.989746,,,0.160034,0.068970,0.625000,,,,0.159058,,0.152100,,block of flats,0.197388,Panel,No,0.0,0.0,0.0,0.0,-2308.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0.111680,1.0000,2.000000,-1421.0,0.0,-1122.0,-1513.0,0.00,0.0,147339.000000,0.000,0.0,0.0,-1058.0,0.0,Closed,currency 1,Consumer credit,11486.215820,113758.203125,127578.601562,1500.00,142197.750000,14.000000,1.0,0.036346,,,-1284.0,22.600000,14.500000,365243.000000,-1409.000000,-929.000000,181620.500000,181622.500000,0.000000,Cash loans,SATURDAY,Y,XNA,Approved,Cash through the bank,XAP,Unaccompanied,Repeater,XNA,Cash,XNA,Credit and cash offices,XNA,XNA,Cash,0,0.9537


## 3.5 Conclusion

1. PyCaret is a very convenient tool for data pre-processing and model training. A single line of set-up code does all the pre-processing. One further line trains a model and another line tunes it.
2. The model created by PyCaret is huge. The model created using 100% of train_data was 173 MB in size. Predictions can be made in Google Colaboratory using the saved models. However, deployment on Heroku throws an error. Models were therefore also trained using 50% and 25% of the data, on the assumption that the deployment error was caused by the size of the model. The error persisted with the models trained on 50% and 25% of the data.
3. It was therefore decided to switch to scikit-learn (Sklearn) and try to deploy a model created with it instead. This is implemented in the next section (4.0).






# 4.0 Training and Pipeline using Sklearn

## 4.1 Define list of column names for use in pipeline 

In [None]:
#Read application_test
application_test = dataframe_optimizer(pd.read_csv('/content/drive/MyDrive/AI_ML_Project/Data/application_test.csv'))

#Read columns from application_test (kept in file order)
columns_input = list(application_test.columns)

#Save columns_input; the with-block closes the file even if pickling fails
with open('columns_input.pkl', 'wb') as file:
  pickle.dump(columns_input, file)

## 4.2 Data Preparation

In [None]:
#Load the prepared training data from Google Drive and pass it through dataframe_optimizer
train_data_path = '/content/drive/MyDrive/AI_ML_Project/Data/train_data.csv'
train_data = dataframe_optimizer(pd.read_csv(train_data_path))

In [None]:
#Split train_data by dtype: object columns are treated as categorical,
#all remaining columns except TARGET as numerical; TARGET itself is the label
X_train_categorical = train_data.select_dtypes(include=object)
X_train_numerical = train_data.select_dtypes(exclude=object).drop(columns=['TARGET'])
y_train = train_data['TARGET']

#Keep the column labels of both parts for rebuilding DataFrames later
columns_categorical = X_train_categorical.columns
columns_numerical = X_train_numerical.columns

## 4.3 Imputation and scaling of numerical data

In [None]:
#Fit a median imputer on the numerical columns, then fill their NaN values with it
imputer = SimpleImputer(missing_values=np.nan, strategy='median').fit(X_train_numerical)
X_train_numerical_imputed = imputer.transform(X_train_numerical)

In [None]:
#Save imputer; the with-block closes the file even if pickling fails
with open('imputer.pkl', 'wb') as file:
  pickle.dump(imputer, file)

In [None]:
#Fit a StandardScaler on the imputed numerical data and apply it
scaler = StandardScaler().fit(X_train_numerical_imputed)
X_train_numerical_imputed_scaled = scaler.transform(X_train_numerical_imputed)

#Wrap the scaled values in a DataFrame that carries the numerical column labels again
X_train_numerical_imputed_scaled_df = pd.DataFrame(X_train_numerical_imputed_scaled, columns=columns_numerical)

In [None]:
#Save scaler; the with-block closes the file even if pickling fails
with open('scaler.pkl', 'wb') as file:
  pickle.dump(scaler, file)

## 4.4 One hot encoding of categorical data

In [None]:
#Fit a constant-value imputer on the categorical columns, then replace their missing entries with the placeholder string
imputer_constant = SimpleImputer(strategy='constant', fill_value='missing_vale').fit(X_train_categorical)
X_train_categorical_imputed = imputer_constant.transform(X_train_categorical)

In [None]:
#Save imputer_constant; the with-block closes the file even if pickling fails
with open('imputer_constant.pkl', 'wb') as file:
  pickle.dump(imputer_constant, file)

In [None]:
#One hot encoding of categorical data
#handle_unknown='ignore' keeps transform from failing on categories that were not seen during fit
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train_categorical_imputed)
X_train_categorical_imputed_ohe = ohe.transform(X_train_categorical_imputed)

#Names of the encoded columns. get_feature_names is deprecated and absent from newer
#scikit-learn releases, so prefer get_feature_names_out and fall back for old versions.
if hasattr(ohe, 'get_feature_names_out'):
  columns_ohe = ohe.get_feature_names_out(input_features=columns_categorical)
else:
  columns_ohe = ohe.get_feature_names(input_features=columns_categorical)

#transform returns a sparse matrix here; convert it to a dense array for the DataFrame
X_train_categorical_imputed_ohe_df = pd.DataFrame(data = X_train_categorical_imputed_ohe.toarray(), columns = list(columns_ohe))

In [None]:
#Save ohe columns; the with-block closes the file even if pickling fails
with open('columns_ohe.pkl', 'wb') as file:
  pickle.dump(columns_ohe, file)

In [None]:
#Save ohe; the with-block closes the file even if pickling fails
with open('ohe.pkl', 'wb') as file:
  pickle.dump(ohe, file)

## 4.5 Define train data with all columns

In [None]:
#Put the scaled numerical columns and the one hot encoded columns side by side
X_train_all_columns = pd.concat(
  [X_train_numerical_imputed_scaled_df, X_train_categorical_imputed_ohe_df],
  axis='columns',
)

## 4.6 Outlier removal

In [None]:
#Fit a CBLOF outlier detector (contamination = 0.05) on X_train_all_columns
clf = CBLOF(contamination=0.05, check_estimator=False, random_state=42)
clf.fit(X_train_all_columns)

#Sign-flipped decision scores of the training rows
scores_pred = -clf.decision_function(X_train_all_columns)

#Label every row, then count the rows with a zero label (inliers) and the rows labelled 1 (outliers)
outlier_prediction = clf.predict(X_train_all_columns)
outliers = np.count_nonzero(outlier_prediction == 1)
inliers = len(outlier_prediction) - np.count_nonzero(outlier_prediction)

In [None]:
#Remove outliers
#Attach the outlier label and the target to the features so all three are filtered together
X_train_all_columns_outlier_label = X_train_all_columns.copy()
X_train_all_columns_outlier_label['outlier'] = outlier_prediction

#The target is taken from train_data instead of y_train: this cell overwrites y_train below,
#so reading y_train here would misalign the rows if the cell were run a second time
X_y_train_all_columns_outlier_label = pd.concat([X_train_all_columns_outlier_label, train_data['TARGET']], axis = 1)

#Keep only the rows that were not labelled 1 (outlier)
X_y_train_final_outlier_removed = X_y_train_all_columns_outlier_label[X_y_train_all_columns_outlier_label['outlier'] != 1]
X_train = X_y_train_final_outlier_removed.drop(columns = ['TARGET', 'outlier'])
y_train = X_y_train_final_outlier_removed['TARGET']

## 4.7 Feature Selection

In [None]:
#Fit a gradient boosting classifier on all columns; its feature importances drive the feature selection
model_feature_slection = GradientBoostingClassifier(random_state=0)
model_feature_slection.fit(X_train, y_train)

In [None]:
#Rank the columns by importance (highest first) and keep the names of the first 175
feature_importance = pd.DataFrame(
  {'importance': model_feature_slection.feature_importances_},
  index=X_train.columns,
).sort_values('importance', ascending=False)
selected_features = feature_importance.index[:175].tolist()

In [None]:
#Save selected columns; the with-block closes the file even if pickling fails
with open('selected_features.pkl', 'wb') as file:
  pickle.dump(selected_features, file)

## 4.8 Train model

In [None]:
#Fit the final gradient boosting classifier on the selected features only
model = GradientBoostingClassifier(random_state=0)
model.fit(X_train[selected_features], y_train)

In [None]:
#Save model; the with-block closes the file even if pickling fails
with open('model.pkl', 'wb') as file:
  pickle.dump(model, file)

## 4.9 Create Pipeline and predict

In [None]:
#Import saved data and pickle files
#NOTE(review): the training cells above write their .pkl files to the current working directory,
#while this cell reads them from Google Drive - confirm the files are copied there first.
data_dir = '/content/drive/MyDrive/AI_ML_Project/Data/'

def load_pickle(path):
  """Return the object stored in the pickle file at path.

  The file is closed even if unpickling fails. pickle.load can execute arbitrary
  code, so only pass files that were produced by this notebook.
  """
  with open(path, 'rb') as pickle_file:
    return pickle.load(pickle_file)

#Merge tables that inference() joins to the query on SK_ID_CURR
bureau_numerical_merge = dataframe_optimizer(pd.read_csv(data_dir + 'bureau_numerical_merge.csv'))
bureau_categorical_merge = dataframe_optimizer(pd.read_csv(data_dir + 'bureau_categorical_merge.csv'))
previous_numerical_merge = dataframe_optimizer(pd.read_csv(data_dir + 'previous_numerical_merge.csv'))
previous_categorical_merge = dataframe_optimizer(pd.read_csv(data_dir + 'previous_categorical_merge.csv'))

#Pickled column lists, preprocessing objects and model
columns_input = load_pickle(data_dir + 'columns_input.pkl')
model = load_pickle(data_dir + 'model.pkl')
imputer = load_pickle(data_dir + 'imputer.pkl')
scaler = load_pickle(data_dir + 'scaler.pkl')
imputer_constant = load_pickle(data_dir + 'imputer_constant.pkl')
ohe = load_pickle(data_dir + 'ohe.pkl')
selected_features = load_pickle(data_dir + 'selected_features.pkl')
columns_ohe = load_pickle(data_dir + 'columns_ohe.pkl')

In [None]:
#Define a function to create a pipeline for prediction
def inference(query):
  """Run the saved preprocessing steps on query and return the model's predictions.

  Relies on notebook-level objects that must be loaded before the call:
  bureau_numerical_merge, bureau_categorical_merge, previous_numerical_merge,
  previous_categorical_merge, imputer, scaler, imputer_constant, ohe,
  columns_ohe, selected_features and model.

  Parameters
  ----------
  query : pandas.DataFrame
    Application rows. Must contain SK_ID_CURR, AMT_ANNUITY, AMT_INCOME_TOTAL,
    AMT_CREDIT and AMT_GOODS_PRICE. The frame is copied, not modified.

  Returns
  -------
  The value returned by model.predict for the prepared rows.

  Raises
  ------
  KeyError
    If a column that is read or dropped below is missing.
  """
  #Add columns titled DEBT_INCOME_RATIO, LOAN_VALUE_RATIO & LOAN_INCOME_RATIO to a copy of query data
  query_with_additional_features = query.copy()
  query_with_additional_features['DEBT_INCOME_RATIO'] = query_with_additional_features['AMT_ANNUITY']/query_with_additional_features['AMT_INCOME_TOTAL']
  query_with_additional_features['LOAN_VALUE_RATIO'] = query_with_additional_features['AMT_CREDIT']/query_with_additional_features['AMT_GOODS_PRICE']
  query_with_additional_features['LOAN_INCOME_RATIO'] = query_with_additional_features['AMT_CREDIT']/query_with_additional_features['AMT_INCOME_TOTAL']

  #Merge numerical, then categorical features from bureau to query data
  query_bureau = query_with_additional_features.merge(bureau_numerical_merge, on='SK_ID_CURR', how='left', suffixes=('', '_BUREAU'))
  query_bureau = query_bureau.merge(bureau_categorical_merge, on='SK_ID_CURR', how='left', suffixes=('', '_BUREAU'))

  #Drop SK_ID_BUREAU
  query_bureau = query_bureau.drop(columns = ['SK_ID_BUREAU'])

  #Merge numerical, then categorical features from previous_application to query_bureau
  query_bureau_previous = query_bureau.merge(previous_numerical_merge, on='SK_ID_CURR', how='left', suffixes=('', '_PREVIOUS'))
  query_bureau_previous = query_bureau_previous.merge(previous_categorical_merge, on='SK_ID_CURR', how='left', suffixes=('', '_PREVIOUS'))

  #Drop SK_ID_PREV and SK_ID_CURR
  query_bureau_previous = query_bureau_previous.drop(columns = ['SK_ID_PREV'])
  query_bureau_previous = query_bureau_previous.drop(columns = ['SK_ID_CURR'])

  #Split by dtype: object columns are categorical, everything else numerical
  query_numerical = query_bureau_previous.select_dtypes(exclude=object)
  query_categorical = query_bureau_previous.select_dtypes(include=object)
  columns_numerical = query_numerical.columns

  #Numerical columns: impute, scale, then restore the column labels
  query_numerical_imputed = imputer.transform(query_numerical)
  query_numerical_imputed_scaled = scaler.transform(query_numerical_imputed)
  query_numerical_imputed_scaled_df = pd.DataFrame(data = query_numerical_imputed_scaled, columns = columns_numerical)

  #Categorical columns: impute, one hot encode, then label with the saved encoded column names
  query_categorical_imputed = imputer_constant.transform(query_categorical)
  query_categorical_imputed_ohe = ohe.transform(query_categorical_imputed)
  query_categorical_imputed_ohe_df = pd.DataFrame(data = query_categorical_imputed_ohe.toarray(), columns = list(columns_ohe))

  #Combine both parts and keep only the features the model uses
  query_data_all_features = pd.concat([query_numerical_imputed_scaled_df, query_categorical_imputed_ohe_df], axis = 1)
  query_data = query_data_all_features[selected_features]

  predictions = model.predict(query_data)
  return predictions

In [None]:
#Read query data point(s)
query = dataframe_optimizer(pd.read_csv('/content/drive/MyDrive/AI_ML_Project/Data/application_test.csv'))

columns_query = list(query.columns)

#Predict only when the query has exactly the expected columns, in the expected order
if columns_query == columns_input:
  query_prediction = inference(query)

  #Translate the predicted labels into text with an explicit mapping. This replaces
  #np.select with its implicit integer default, which depended on int-to-string promotion.
  #A label other than 0 or 1 would show up as NaN.
  label_names = {0: 'NO', 1: 'YES'}
  query_data_with_prediction = query.copy()
  query_data_with_prediction['DEFAULT TENDENCY'] = pd.Series(query_prediction).map(label_names).to_numpy()
  display(query_data_with_prediction)
else:
  print("Query columns do not match the columns of required format. Please input in the given format.")

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_
DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DEFAULT TENDENCY
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.018845,-19241,-2329,-5168.0,-812,,1,1,0,1,0,1,,2.0,2,2,TUESDAY,18,0,0,0,0,0,0,Kindergarten,0.752441,0.789551,0.159546,0.065979,0.058990,0.973145,,,,0.137939,0.125000,,,,0.050507,,,0.067200,0.061188,0.973145,,,,0.137939,0.125000,,,,0.052612,,,0.066589,0.058990,0.973145,,,,0.137939,0.125000,,,,0.051392,,,,block of flats,0.039215,"Stone, brick",No,0.0,0.0,0.0,0.0,-1740.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,NO
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.035797,-18064,-4469,-9120.0,-1623,,1,1,0,1,0,0,Low-skill Laborers,2.0,2,2,FRIDAY,9,0,0,0,0,0,0,Self-employed,0.564941,0.291748,0.432861,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,NO
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,,Working,Higher education,Married,House / apartment,0.019104,-20038,-4458,-2176.0,-3503,5.0,1,1,0,1,0,0,Drivers,2.0,2,2,MONDAY,14,0,0,0,0,0,0,Transport: type 3,,0.699707,0.610840,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-856.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0,NO
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.026398,-13976,-1866,-2000.0,-4208,,1,1,0,1,1,0,Sales staff,4.0,2,2,WEDNESDAY,11,0,0,0,0,0,0,Business Entity Type 3,0.525879,0.509766,0.612793,0.305176,0.197388,0.997070,0.958984,0.116516,0.320068,0.275879,0.375000,0.041687,0.204224,0.240356,0.367188,0.038605,0.080017,0.310791,0.204956,0.997070,0.960938,0.117615,0.322266,0.275879,0.375000,0.041687,0.208862,0.262695,0.382812,0.03891,0.084717,0.308105,0.197388,0.997070,0.959473,0.11731,0.320068,0.275879,0.375000,0.041687,0.207764,0.244629,0.373779,0.038788,0.081726,reg oper account,block of flats,0.370117,Panel,No,0.0,0.0,0.0,0.0,-1805.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,NO
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.010033,-13040,-2191,-4000.0,-4262,16.0,1,1,1,1,0,0,,3.0,2,2,FRIDAY,5,0,0,0,0,1,1,Business Entity Type 3,0.202148,0.425781,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-821.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,Cash loans,F,N,Y,0,121500.0,412560.0,17473.5,270000.0,Unaccompanied,Working,Secondary / secondary special,Widow,House / apartment,0.002043,-19970,-5169,-9096.0,-3399,,1,1,1,1,1,0,,1.0,3,3,WEDNESDAY,16,0,0,0,0,0,0,Other,,0.648438,0.643066,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,-684.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,NO
48740,456222,Cash loans,F,N,N,2,157500.0,622413.0,31909.5,495000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.035797,-11186,-1149,-3016.0,-3003,,1,1,0,1,0,0,Sales staff,4.0,2,2,MONDAY,11,0,0,0,0,1,1,Trade: type 7,,0.684570,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,NO
48741,456223,Cash loans,F,Y,Y,1,202500.0,315000.0,33205.5,315000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.026398,-15922,-3037,-2680.0,-1504,4.0,1,1,0,1,1,0,,3.0,2,2,WEDNESDAY,12,0,0,0,0,0,0,Business Entity Type 3,0.733398,0.632812,0.283691,0.111328,0.136353,0.995605,,,0.160034,0.137939,0.333252,,,,0.138306,,0.054199,0.113403,0.141479,0.995605,,,0.161133,0.137939,0.333252,,,,0.144043,,0.057404,0.112427,0.136353,0.995605,,,0.160034,0.137939,0.333252,,,,0.140747,,0.055389,,block of flats,0.166260,"Stone, brick",No,0.0,0.0,0.0,0.0,-838.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0,NO
48742,456224,Cash loans,M,N,N,0,225000.0,450000.0,25128.0,450000.0,Family,Commercial associate,Higher education,Married,House / apartment,0.018845,-13968,-2731,-1461.0,-1364,,1,1,1,1,1,0,Managers,2.0,2,2,MONDAY,10,0,1,1,0,1,1,Self-employed,0.373047,0.445801,0.595215,0.162842,0.072327,0.989746,,,0.160034,0.068970,0.625000,,,,0.156250,,0.149048,0.166016,0.075012,0.989746,,,0.161133,0.068970,0.625000,,,,0.120422,,0.157715,0.164551,0.072327,0.989746,,,0.160034,0.068970,0.625000,,,,0.159058,,0.152100,,block of flats,0.197388,Panel,No,0.0,0.0,0.0,0.0,-2308.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,NO


## 4.10 Conclusion

1. The number of lines of code increases significantly compared to PyCaret.
2. Models created are significantly smaller in size compared to models created by Pycaret.
3. Deployment on Heroku did not throw any error.
4. Deployment can be accessed from https://deployment-0.herokuapp.com/.