In [1]:
# Make the project's own packages importable from this notebook.
# The directory of the path returned by find_dotenv() is appended to sys.path,
# which is what lets the later `from app.snowpark_session.session import ...` resolve.
# NOTE(review): presumably the .env file sits in the project root next to `app/` - confirm.
import os
import sys

from dotenv import find_dotenv
sys.path.append(os.path.dirname(find_dotenv()))

In [2]:
from snowflake.snowpark import functions
import datetime

from app.snowpark_session.session import snowpark_session

In [3]:
# Open a Snowpark session through the project's snowpark_session() helper.
session = snowpark_session()

# Switch the session to the HOL_DB database; the table used below is referenced
# by its bare name, so it is looked up in the session's current database/schema.
session.use_database("HOL_DB")

In [4]:
# Sanity check: show which database/schema and warehouse the session is using.
print(f"Current Database and schema: {session.get_fully_qualified_current_schema()}")
print(f"Current Warehouse: {session.get_current_warehouse()}")

Current Database and schema: "HOL_DB"."PUBLIC"
Current Warehouse: "LEIT_WH"


In [5]:
# Bind a Snowpark DataFrame to the APPLICATION_RECORD table in the current schema.
snowpark_df = session.table('APPLICATION_RECORD')

In [6]:
# Preview: fetch at most 10 rows and render them as a pandas DataFrame.
snowpark_df.limit(10).to_pandas()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
5,5008810,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
6,5008811,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
7,5008812,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,-22464,365243,1,0,0,0,,1.0
8,5008813,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,-22464,365243,1,0,0,0,,1.0
9,5008814,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,-22464,365243,1,0,0,0,,1.0


## EXERCISE: Answer the following Questions

How many variables (columns) does the dataset have?

In [12]:
# Number of columns (variables) in the table.
len(snowpark_df.columns)

18

How many records (rows) does the dataset have?

In [13]:
# Number of rows (records) in the table.
snowpark_df.count()

438557

What is the average income?

In [23]:
# Average of AMT_INCOME_TOTAL across all rows, printed as a one-row table
# whose single column is named AVG_AMT_INCOME_TOTAL.
(
    snowpark_df
    .agg(functions.avg('AMT_INCOME_TOTAL').alias('AVG_AMT_INCOME_TOTAL'))
    .show()
)

--------------------------
|"AVG_AMT_INCOME_TOTAL"  |
--------------------------
|187524.2860095039       |
--------------------------



How many people are Single?

In [25]:
# Count the rows whose NAME_FAMILY_STATUS is exactly 'Single / not married'.
(
    snowpark_df
    .where(functions.col('NAME_FAMILY_STATUS') == 'Single / not married')
    .count()
)

55271

## EXERCISE: Create / Drop Features

Create a new feature containing the years of employment. Formula: the absolute value of DAYS_EMPLOYED divided by 365 days, rounded down.

In [31]:
# Derive YEARS_OF_EMPLOYMENT = floor(|DAYS_EMPLOYED| / 365), as the exercise asks.
# NOTE(review): the previews show DAYS_EMPLOYED = 365243 on the Pensioner rows,
# which this formula turns into 1000 years - it looks like a placeholder value;
# confirm how such rows should be treated before relying on this feature.
snowpark_df = snowpark_df.with_column(
    'YEARS_OF_EMPLOYMENT',
    functions.floor(functions.abs(functions.col('DAYS_EMPLOYED')) / 365),
)

# Show the source column next to the derived one for the first 10 rows.
snowpark_df.select('ID', 'DAYS_EMPLOYED', 'YEARS_OF_EMPLOYMENT').limit(10).to_pandas()

Unnamed: 0,ID,DAYS_EMPLOYED,YEARS_OF_EMPLOYMENT
0,5008804,-4542,12
1,5008805,-4542,12
2,5008806,-1134,3
3,5008808,-3051,8
4,5008809,-3051,8
5,5008810,-3051,8
6,5008811,-3051,8
7,5008812,365243,1000
8,5008813,365243,1000
9,5008814,365243,1000


Drop the variable DAYS_EMPLOYED

In [32]:
# Remove the raw DAYS_EMPLOYED column; YEARS_OF_EMPLOYMENT has already been derived from it.
# Because `snowpark_df` is rebound here, re-running the feature-creation cell after
# this one will no longer find DAYS_EMPLOYED - re-create the DataFrame from the table first.
snowpark_df = snowpark_df.drop('DAYS_EMPLOYED')

In [33]:
# Preview up to 20 rows to check that DAYS_EMPLOYED is gone and YEARS_OF_EMPLOYMENT is present.
snowpark_df.limit(20).to_pandas()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,YEARS_OF_EMPLOYMENT
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,1,1,0,0,,2.0,12
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,1,1,0,0,,2.0,12
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,1,0,0,0,Security staff,2.0,3
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,1,0,1,1,Sales staff,1.0,8
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,1,0,1,1,Sales staff,1.0,8
5,5008810,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,1,0,1,1,Sales staff,1.0,8
6,5008811,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,1,0,1,1,Sales staff,1.0,8
7,5008812,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,-22464,1,0,0,0,,1.0,1000
8,5008813,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,-22464,1,0,0,0,,1.0,1000
9,5008814,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,-22464,1,0,0,0,,1.0,1000


In [34]:
# Close the Snowpark session now that the exercises are finished.
session.close()