# A Predictive Analysis of Loan Approvals through Classification Modeling and Cloud Computing

Saba Alemayehu, Dennis Myasnyankin, and Anusia Edward

# Necessary pips 
! pip install pyathena
! pip install awswrangler 
! pip install fast_ml
! pip install smclarify
! pip install sdv
# from sdv.tabular import GaussianCopula

In [21]:
# Necessary Imports 
import boto3, os, sagemaker
import io
import pandas as pd
from pandas.core.internals import concat
import seaborn as sns
import numpy as np
import fast_ml
from fast_ml.model_development import train_valid_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.utils import resample 
from smclarify.bias import report 
import matplotlib.pyplot as plt
import pyathena as pa
from pyathena import connect
from pyathena.pandas.cursor import PandasCursor
import awswrangler as wr 
import warnings
warnings.filterwarnings("ignore")

## Data Wrangling 

In [22]:
# Athena database (db) set-up: SageMaker session plus the bucket, role and
# region used by the cells that follow
sess = sagemaker.Session()
bucket, role = sess.default_bucket(), sagemaker.get_execution_role()
region = boto3.Session().region_name
# starts out False; only set to True once the database is found
ingest_create_athena_db_passed = False

In [23]:
# name of the Athena database (db)
db_name = "SDAloans"
# s3 staging directory inside the session's default bucket
s3_sg_dir = f"s3://{bucket}/athena/staging"
# pyathena connection handed to pd.read_sql for querying
conn = connect(s3_staging_dir=s3_sg_dir, region_name=region)

In [24]:
# create the database (db = SDAloans) unless it already exists;
# the statement is echoed so the executed DDL is visible in the output
statement = f"CREATE DATABASE IF NOT EXISTS {db_name}"
print(statement)
pd.read_sql(sql=statement, con=conn)

CREATE DATABASE IF NOT EXISTS SDAloans


In [25]:
# verify the db creation by listing the databases Athena reports
statement = "SHOW DATABASES"
df_show = pd.read_sql(sql=statement, con=conn)
df_show.head(n=3)

Unnamed: 0,database_name
0,default
1,sdaloans


In [26]:
# setting directory to s3 bucket with files
# SDAloans_dir = 's3://ads508loanapproval/datasets/data1'

In [27]:
# SQL: register the 1st dataset (trans.csv) as an external Athena table (tb)
# NOTE(review): the preview output shows `date` empty for every row —
# presumably the CSV date format is not what the DATE column expects; confirm.
tb1_name = 'trans'

# drop any previous definition so the schema below is the one in effect
drop_table = f'DROP TABLE IF EXISTS {db_name}.{tb1_name}'
pd.read_sql(drop_table, conn)

# comma-delimited files under the data1/ prefix; the header row is skipped
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}.{tb1_name}(
                index int,
                trans_id int,
                account_id int,
                date date,
                type string,
                operation string,
                amount int,
                balance int,
                k_symbol string,
                bank string,
                account int
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION 's3://ads508loanapproval/datasets/data1/'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""
pd.read_sql(create_table, conn)

# show the first three rows of the new table
preview = f'SELECT * FROM {db_name}.{tb1_name} LIMIT 3'
pd.read_sql(preview, conn)

Unnamed: 0,index,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,515583,839750,2859,,VYDAJ,VYBER,3600,64639,,,
1,515584,839751,2859,,VYDAJ,VYBER,3900,26081,,,
2,515585,839754,2859,,VYDAJ,VYBER,6000,29981,,,


In [28]:
# SQL: register the 2nd dataset (trans_2.csv) as an external Athena table (tb)
tb2_name = 'trans_2'

# drop any previous definition so the schema below is the one in effect
drop_table = f'DROP TABLE IF EXISTS {db_name}.{tb2_name}'
pd.read_sql(drop_table, conn)

# comma-delimited files under the data2/ prefix; the header row is skipped
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}.{tb2_name}(
                index int,
                trans_id int,
                type string,
                operation string,
                amount2 int,
                balance int
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION 's3://ads508loanapproval/datasets/data2/'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""
pd.read_sql(create_table, conn)

# show the first three rows of the new table
preview = f'SELECT * FROM {db_name}.{tb2_name} LIMIT 3'
pd.read_sql(preview, conn)

Unnamed: 0,index,trans_id,type,operation,amount2,balance
0,0,289,Credit,Collection,0,0
1,1,290,Credit,Collection,0,0
2,2,291,Credit,Collection,0,0


In [29]:
# SQL: register the 3rd dataset (loan.csv) as an external Athena table (tb)
# NOTE(review): the preview output shows `date` empty for every row —
# presumably the CSV date format is not what the DATE column expects; confirm.
tb3_name = 'loan'

# drop any previous definition so the schema below is the one in effect
drop_table = f'DROP TABLE IF EXISTS {db_name}.{tb3_name}'
pd.read_sql(drop_table, conn)

# comma-delimited files under the data3/ prefix; the header row is skipped
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}.{tb3_name}(
                index int,
                loan_id int,
                account_id int,
                date date,
                amount int,
                duration int,
                payments float,
                status string
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION 's3://ads508loanapproval/datasets/data3/'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""
pd.read_sql(create_table, conn)

# show the first three rows of the new table
preview = f'SELECT * FROM {db_name}.{tb3_name} LIMIT 3'
pd.read_sql(preview, conn)

Unnamed: 0,index,loan_id,account_id,date,amount,duration,payments,status
0,0,4959,2,,80952,24,3373.0,A
1,1,4961,19,,30276,12,2523.0,B
2,2,4962,25,,30276,12,2523.0,A


In [30]:
# SQL: register the 4th dataset (account.csv) as an external Athena table (tb)
# NOTE(review): the preview output shows `date` empty for every row —
# presumably the CSV date format is not what the DATE column expects; confirm.
tb4_name = 'account'

# drop any previous definition so the schema below is the one in effect
drop_table = f'DROP TABLE IF EXISTS {db_name}.{tb4_name}'
pd.read_sql(drop_table, conn)

# comma-delimited files under the data4/ prefix; the header row is skipped
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}.{tb4_name}(
                index int,
                account_id int,
                district_id int,
                frequency string,
                date date
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION 's3://ads508loanapproval/datasets/data4/'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""
pd.read_sql(create_table, conn)

# show the first three rows of the new table
preview = f'SELECT * FROM {db_name}.{tb4_name} LIMIT 3'
pd.read_sql(preview, conn)

Unnamed: 0,index,account_id,district_id,frequency,date
0,0,1,18,POPLATEK MESICNE,
1,1,2,1,POPLATEK MESICNE,
2,2,3,5,POPLATEK MESICNE,


In [31]:
# re-list the databases after the tables were created; df_show is what the
# existence check on db_name reads
statement = "SHOW DATABASES"
df_show = pd.read_sql(sql=statement, con=conn)
df_show.head(n=3)

Unnamed: 0,database_name
0,default
1,sdaloans


In [32]:
# Mark the ingest step as passed when the database shows up in SHOW DATABASES.
# The comparison is case-insensitive: db_name is "SDAloans" while the listing
# reports it as "sdaloans", so an exact match would never succeed.
listed_names = {str(value).lower() for value in df_show.values.ravel()}
if db_name.lower() in listed_names:
    ingest_create_athena_db_passed = True

In [33]:
# persist the flag with IPython's %store magic (it prints "Stored ... (bool)")
%store ingest_create_athena_db_passed

Stored 'ingest_create_athena_db_passed' (bool)


# SQL: merging tb 1-4 + saving as a df
# loan (t3) is the base table; each LEFT JOIN keeps every loan row and adds:
#   t4 (account) -> frequency, matched on account_id
#   t1 (trans)   -> trans_id, amount, balance, matched on account_id
#   t2 (trans_2) -> operation, matched on trans_id
# NOTE(review): the previous query text had an unmatched ")", used the aliases
# t1/t2 inside sub-queries that never declared them, joined on keys those
# sub-queries did not select, and read `t3.account`, which the loan table does
# not define. `t3.amount` is assumed to be the intended column — confirm.
# NOTE(review): the result holds two columns named `amount` (loan amount first,
# transaction amount second); alias them if later cells must tell them apart.
merge_query = f"""
SELECT t4.frequency,
       t3.account_id,
       t3.amount,
       t3.duration,
       t3.payments,
       t3.status,
       t1.trans_id,
       t1.amount,
       t1.balance,
       t2.operation
FROM {db_name}.{tb3_name} t3
LEFT JOIN {db_name}.{tb4_name} t4
       ON t4.account_id = t3.account_id
LEFT JOIN {db_name}.{tb1_name} t1
       ON t3.account_id = t1.account_id
LEFT JOIN {db_name}.{tb2_name} t2
       ON t1.trans_id = t2.trans_id
"""
df = pd.read_sql(merge_query, conn)

## Data Preparation

# preview the first five rows of the df built from merged tb 1-4
df.head()

## Feature Engineering + Removal of Unnecessary Attributes