## Imports

In [1]:
import json
import os
import re
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import col

from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import IntegerType, FloatType, StringType,StructType, StructField

import snowflake.ml.modeling.preprocessing as snowml
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.preprocessing import KBinsDiscretizer, OrdinalEncoder, OneHotEncoder
from snowflake.ml.modeling.impute import SimpleImputer
warnings.filterwarnings("ignore")

In [2]:
# Read the Snowflake connection parameters from creds.json in the current
# working directory and open the Snowpark session used by the later cells.
my_dir = os.getcwd()
# os.getcwd() already returns an absolute path, so join onto it directly
# instead of prefixing another "/" by hand.
creds_path = os.path.join(my_dir, 'creds.json')
# The context manager closes the credentials file as soon as it is parsed.
with open(creds_path, encoding='utf-8') as creds_file:
    connection_parameters = json.load(creds_file)
session = Session.builder.configs(connection_parameters).create()

# Prepare Data 

In [4]:
# Resize the SSK_RESEARCH warehouse to LARGE before the data preparation below.
resize_warehouse_sql = 'ALTER WAREHOUSE SSK_RESEARCH SET WAREHOUSE_SIZE = "LARGE"'
session.sql(resize_warehouse_sql).collect()

[Row(status='Statement executed successfully.')]

In [6]:
# Select the database first, then the schema inside it, so the unqualified
# table names used in the following cells resolve against them.
source_database = 'ML_SNOWPARK_CI_CD'
source_schema = 'DATA_PROCESSING'
session.use_database(source_database)
session.use_schema(source_schema)

In [7]:
# Reference the two source tables as Snowpark DataFrames.
application_record_sdf = session.table('APPLICATION_RECORD')
credit_record_sdf = session.table('CREDIT_RECORD')

# Count the rows of each table (application first, then credit) and report them.
application_row_count = application_record_sdf.count()
credit_row_count = credit_record_sdf.count()
print('Application table size\t: ', application_row_count,
      '\nCredit table size\t: ', credit_row_count)

Application table size	:  438557 
Credit table size	:  1048575


In [8]:
# Pull a five-row sample of the credit records into pandas for display.
credit_preview_sdf = credit_record_sdf.limit(5)
credit_preview_sdf.to_pandas()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [9]:
# Derive one TARGET label per customer ID: 1 (high-risk) when at least one of
# the customer's records has a STATUS of '2', '3', '4' or '5', otherwise
# 0 (low-risk).
late_statuses = ['2', '3', '4', '5']
late_flag = F.iff(F.col('STATUS').in_(late_statuses), 1, 0)
target_flag = F.when(F.col('CNT_LATE') > 0, 1).otherwise(0)

credit_record_sdf = (
    credit_record_sdf
    .group_by('ID')
    .agg(F.sum(late_flag).as_("CNT_LATE"))  # number of late records per ID
    .with_column('TARGET', target_flag)
    .drop("CNT_LATE")                       # helper count is not needed afterwards
)

In [10]:
# Attach the per-customer TARGET to the application data. The inner join on ID
# keeps only applications that have a matching credit record.
joined_sdf = application_record_sdf.join(
    credit_record_sdf,
    using_columns='ID',
    join_type='inner',
)

In [11]:
# Remove duplicated rows with drop_duplicates, using ID as the de-duplication key.
dedup_key = 'ID'
joined_sdf = joined_sdf.drop_duplicates(dedup_key)

In [12]:
# Row count of the joined, de-duplicated data.
joined_row_count = joined_sdf.count()
joined_row_count

36457

In [13]:
# Restrict the data to the columns used for modeling: the numerical ones
# (which include the TARGET label) followed by the categorical ones.
cols_numerical = [
    'AMT_INCOME_TOTAL',
    'DAYS_EMPLOYED',
    'FLAG_MOBIL',
    'CNT_FAM_MEMBERS',
    'TARGET',
]
cols_categorical = [
    'CODE_GENDER',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
]
model_columns = cols_numerical + cols_categorical
joined_sdf = joined_sdf.select(model_columns)

In [14]:
# Show ten rows of the selected modeling columns as a pandas DataFrame.
joined_preview_sdf = joined_sdf.limit(10)
joined_preview_sdf.to_pandas()

Unnamed: 0,AMT_INCOME_TOTAL,DAYS_EMPLOYED,FLAG_MOBIL,CNT_FAM_MEMBERS,TARGET,CODE_GENDER,NAME_HOUSING_TYPE,OCCUPATION_TYPE
0,67500.0,-213,1,4.0,0,F,With parents,Core staff
1,90000.0,-1536,1,2.0,0,F,Rented apartment,Core staff
2,202500.0,-127,1,3.0,0,M,House / apartment,Laborers
3,112500.0,-1508,1,3.0,0,M,House / apartment,Drivers
4,135000.0,-4428,1,2.0,0,F,House / apartment,Managers
5,202500.0,-3166,1,2.0,0,F,House / apartment,Sales staff
6,157500.0,-3078,1,1.0,0,F,House / apartment,Managers
7,126000.0,-5553,1,2.0,0,M,House / apartment,
8,360000.0,-2475,1,3.0,0,F,House / apartment,
9,270000.0,-2408,1,2.0,0,F,House / apartment,


In [15]:
# One-hot encode the categorical columns. The input columns are dropped, so
# only the generated indicator columns remain alongside the other columns.
my_ohe_encoder = OneHotEncoder(
    input_cols=cols_categorical,
    output_cols=cols_categorical,
    drop_input_cols=True,
)
# Fit on the joined data, then transform that same data.
fitted_ohe_encoder = my_ohe_encoder.fit(joined_sdf)
prepared_sdf = fitted_ohe_encoder.transform(joined_sdf)

In [16]:
# Inspect the column names of the encoded DataFrame.
encoded_column_names = prepared_sdf.columns
encoded_column_names

['CODE_GENDER_F',
 'CODE_GENDER_M',
 '"NAME_HOUSING_TYPE_Co-op apartment"',
 '"NAME_HOUSING_TYPE_House / apartment"',
 '"NAME_HOUSING_TYPE_Municipal apartment"',
 '"NAME_HOUSING_TYPE_Office apartment"',
 '"NAME_HOUSING_TYPE_Rented apartment"',
 '"NAME_HOUSING_TYPE_With parents"',
 '"OCCUPATION_TYPE_Accountants"',
 '"OCCUPATION_TYPE_Cleaning staff"',
 '"OCCUPATION_TYPE_Cooking staff"',
 '"OCCUPATION_TYPE_Core staff"',
 '"OCCUPATION_TYPE_Drivers"',
 '"OCCUPATION_TYPE_HR staff"',
 '"OCCUPATION_TYPE_High skill tech staff"',
 '"OCCUPATION_TYPE_IT staff"',
 '"OCCUPATION_TYPE_Laborers"',
 '"OCCUPATION_TYPE_Low-skill Laborers"',
 '"OCCUPATION_TYPE_Managers"',
 '"OCCUPATION_TYPE_Medicine staff"',
 '"OCCUPATION_TYPE_Private service staff"',
 '"OCCUPATION_TYPE_Realty agents"',
 '"OCCUPATION_TYPE_Sales staff"',
 '"OCCUPATION_TYPE_Secretaries"',
 '"OCCUPATION_TYPE_Security staff"',
 '"OCCUPATION_TYPE_Waiters/barmen staff"',
 '"OCCUPATION_TYPE_None"',
 'AMT_INCOME_TOTAL',
 'DAYS_EMPLOYED',
 'FLAG_MO

In [17]:
# Show five rows of the encoded data as a pandas DataFrame.
prepared_preview_sdf = prepared_sdf.limit(5)
prepared_preview_sdf.to_pandas()

Unnamed: 0,CODE_GENDER_F,CODE_GENDER_M,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,...,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,OCCUPATION_TYPE_None,AMT_INCOME_TOTAL,DAYS_EMPLOYED,FLAG_MOBIL,CNT_FAM_MEMBERS,TARGET
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,112500.0,-141,1,3.0,0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,112500.0,-361,1,2.0,0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,126000.0,-157,1,3.0,0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,315000.0,-2851,1,3.0,0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,562500.0,-2836,1,2.0,0


In [18]:
# Clean the column names to make them easier to reference later: remove every
# character outside [A-Za-z0-9_] (quotes, spaces, hyphens, slashes) and
# upper-case what is left.
cols = prepared_sdf.columns
clean_cols = [re.sub(r'[^a-zA-Z0-9_]', '', old_col).upper() for old_col in cols]

# Removing characters can collapse two distinct columns onto the same name;
# stop here with a clear message instead of producing ambiguous columns.
if len(set(clean_cols)) != len(clean_cols):
    duplicate_cols = sorted({name for name in clean_cols if clean_cols.count(name) > 1})
    raise ValueError(f"Column name clean-up produced duplicate names: {duplicate_cols}")

# Rename all columns in one projection (to_df takes one new name per existing
# column, in order) rather than building a new DataFrame per renamed column.
prepared_sdf = prepared_sdf.to_df(clean_cols)

In [19]:
# Randomly split the prepared data 80% / 10% / 10% (fixed seed) and persist
# each part as a Snowflake table, replacing any existing table of that name.
snowdf_train, snowdf_test, snowdf_processed = prepared_sdf.random_split(
    [0.8, 0.1, 0.1],
    seed=99,
)

split_tables = (
    ("CREDIT_DEFAULT_TRAIN", snowdf_train),
    ("CREDIT_DEFAULT_TEST", snowdf_test),
    ("CREDIT_DEFAULT_PROCESSED", snowdf_processed),
)
for table_name, split_sdf in split_tables:
    split_sdf.write.mode("overwrite").save_as_table(table_name)

# Close Session

In [20]:
# Close the Snowpark session now that the train/test/processed tables are saved.
# NOTE(review): the SSK_RESEARCH warehouse was resized to LARGE earlier in this
# notebook and no visible cell scales it back down - confirm whether it should
# be reset before closing.
session.close()