In [3]:
!pip install snowflake-snowpark-python==0.10.0

[notice] A new release of pip available: 22.2.2 -> 22.3
[notice] To update, run: pip install --upgrade pip


In [4]:
import os
from pprint import pprint
import re
import datetime

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report



# Snowflake Connection

In [5]:
from snowflake.snowpark import (
    Column,
    DataFrame,
    Session,
    Window
)

import snowflake.snowpark
from snowflake.snowpark import functions as f
from snowflake.snowpark.types import IntegerType, StringType, StructType, DateType, StructField, MapType

  warn_incompatible_dep(


In [6]:
connection_parameters = {
"account": "gi02106.eu-west-2.aws",
"user": "pujaverma",
"password": "Itzme#123",
"role": "accountadmin",
"warehouse": "workshopwh",
"database": "workshopdb",
"schema": "workshopsch",
}

In [7]:
# Open the Snowpark session that the data-preparation cells below work with.
_session_builder = Session.builder.configs(connection_parameters)
snowflake_conn_session = _session_builder.create()

In [8]:
# Sanity check: print the warehouse, database and schema the session resolved to.
context_query = "select current_warehouse(), current_database(), current_schema()"
snowflake_conn_session.sql(context_query).show()

---------------------------------------------------------------------
|"CURRENT_WAREHOUSE()"  |"CURRENT_DATABASE()"  |"CURRENT_SCHEMA()"  |
---------------------------------------------------------------------
|WORKSHOPWH             |WORKSHOPDB            |WORKSHOPSCH         |
---------------------------------------------------------------------



In [9]:
# Handle on the source table; the cells below inspect and transform it.
SOURCE_TABLE = "Snowpark_dataset"
snow_df = snowflake_conn_session.table(SOURCE_TABLE)

In [10]:
# Display the column names and Snowpark types of the source table.
table_schema = snow_df.schema
table_schema.fields

[StructField('ID', LongType(), nullable=True),
 StructField('CHECKING_STATUS', StringType(), nullable=True),
 StructField('DURATION', LongType(), nullable=True),
 StructField('CREDIT_HISTORY', StringType(), nullable=True),
 StructField('PURPOSE', StringType(), nullable=True),
 StructField('CREDIT_AMOUNT', LongType(), nullable=True),
 StructField('SAVINGS_STATUS', StringType(), nullable=True),
 StructField('EMPLOYMENT', StringType(), nullable=True),
 StructField('INSTALLMENT_COMMITMENT', LongType(), nullable=True),
 StructField('PERSONAL_STATUS', StringType(), nullable=True),
 StructField('THER_PARTIES', StringType(), nullable=True),
 StructField('RESIDENCE_SINCE', LongType(), nullable=True),
 StructField('PROPERTY_MAGNITUDE', StringType(), nullable=True),
 StructField('AGE', StringType(), nullable=True),
 StructField('OTHER_PAYMENT_PLANS', StringType(), nullable=True),
 StructField('HOUSING', StringType(), nullable=True),
 StructField('EXISTING_CREDITS', LongType(), nullable=True),
 St

In [11]:
# Peek at the first rows only; the table is wide, so keep the preview short.
PREVIEW_ROWS = 2
snow_df.show(PREVIEW_ROWS)

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ID"  |"CHECKING_STATUS"  |"DURATION"  |"CREDIT_HISTORY"                |"PURPOSE"  |"CREDIT_AMOUNT"  |"SAVINGS_STATUS"  |"EMPLOYMENT"  |"INSTALLMENT_COMMITMENT"  |"PERSONAL_STATUS"   |"THER_PARTIES"  |"RESIDENCE_SINCE"  |"PROPERTY_MAGNITUDE"  |"AGE"  |"OTHER_PAYMENT_PLANS"  |"HOUSING"  |"EXISTING_CREDITS"  |"JOB"    |"NUM_DEPENDENTS"  |"OWN_TELEPHONE"  |"FOREIGN_WORKER"  |"CLASS"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
# Encode the label as an integer TARGET (1 when CLASS is 'good', else 0),
# then drop the original CLASS column.
is_good = f.col('CLASS') == 'good'
target_expr = f.when(is_good, 1).otherwise(0)
snow_df = snow_df.with_column('TARGET', target_expr).drop(f.col('CLASS'))

snow_df.show(2)

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ID"  |"CHECKING_STATUS"  |"DURATION"  |"CREDIT_HISTORY"                |"PURPOSE"  |"CREDIT_AMOUNT"  |"SAVINGS_STATUS"  |"EMPLOYMENT"  |"INSTALLMENT_COMMITMENT"  |"PERSONAL_STATUS"   |"THER_PARTIES"  |"RESIDENCE_SINCE"  |"PROPERTY_MAGNITUDE"  |"AGE"  |"OTHER_PAYMENT_PLANS"  |"HOUSING"  |"EXISTING_CREDITS"  |"JOB"    |"NUM_DEPENDENTS"  |"OWN_TELEPHONE"  |"FOREIGN_WORKER"  |"TARGET"  |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [13]:
# Seeded 80/20 split into a training part and an inference part.
snow_df_train, snow_df_inf = snow_df.random_split([0.8, 0.2], seed=1234)

# Persist each part as a non-temporary table, overwriting any previous run.
for _table_name, _split_df in (
    ('training_table', snow_df_train),
    ('inference_table', snow_df_inf),
):
    _split_df.write.save_as_table(_table_name, mode="overwrite", create_temp_table=False)

In [None]:
# Session used for the stored-procedure work, plus the packages declared on it.
session = Session.builder.configs(connection_parameters).create()
sproc_packages = [
    'snowflake-snowpark-python',
    'scikit-learn',
    'pandas',
    'numpy',
    'joblib',
    'cachetools',
]
session.add_packages(*sproc_packages)

def save_file(session, model, path):
    """Serialize ``model`` with joblib and upload the bytes to ``path``.

    Args:
        session: Object exposing ``_conn._cursor.upload_stream(stream, path)``
            (here: a Snowpark session).
        model: Any object ``joblib.dump`` can serialize.
        path: Destination passed to ``upload_stream`` (e.g. a stage path).

    Returns:
        The string ``"successfully created file: "`` followed by ``path``.
    """
    # Imported here because the notebook never imports these at module level;
    # a local import also keeps the function self-contained.
    import io
    import joblib

    input_stream = io.BytesIO()
    joblib.dump(model, input_stream)
    # joblib leaves the cursor at end-of-buffer; rewind so the uploader reads
    # the full payload (harmless if the uploader rewinds on its own).
    input_stream.seek(0)
    session._conn._cursor.upload_stream(input_stream, path)
    return "successfully created file: " + path

def train_model(session: snowflake.snowpark.Session) -> str:
    """Train a random-forest classifier on ``training_table`` and upload it.

    Reads ``training_table`` through ``session`` into pandas, fits a
    preprocessing + ``RandomForestClassifier`` pipeline on 80% of the rows,
    uploads the fitted pipeline to ``@SNOWFLAKESTAGE/credit_g_model.joblib``
    via ``save_file``, and evaluates it on the held-out 20%.

    Args:
        session: Snowpark session used to read the table and upload the model.

    Returns:
        The text report from ``classification_report`` for the held-out
        split. The annotation is ``str`` because a report string, not a
        number, is what the function returns.
    """
    df_train_pd = session.table('training_table').to_pandas()

    X = df_train_pd.drop('TARGET', axis=1)
    y = df_train_pd['TARGET']

    # Build the feature lists from X (label already removed) so TARGET can
    # never be handed to the ColumnTransformer as an input column.
    numerical_cols = list(X.select_dtypes(['float64', 'int64']).columns)
    categorical_cols = list(X.select_dtypes(['object']).columns)
    # NOTE(review): columns with other dtypes (e.g. narrower integers) match
    # neither list and are dropped by the ColumnTransformer -- confirm the
    # dtypes to_pandas() yields. ID, if numeric, is used as a feature.

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

    # Numeric features: median imputation, then standardization.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    # Categorical features: fill gaps with 'missing', then one-hot encode;
    # categories unseen during fit are ignored at predict time.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('one_hot', OneHotEncoder(handle_unknown='ignore'))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)])

    rf_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier',  RandomForestClassifier(class_weight='balanced',
                                                                 random_state=0))])

    rf_clf = rf_pipe.fit(X_train, y_train)

    # Persist the whole fitted pipeline (preprocessing + classifier).
    save_file(session, rf_clf, "@SNOWFLAKESTAGE/credit_g_model.joblib")

    y_pred = rf_clf.predict(X_test)
    return classification_report(y_test, y_pred)

# Register the function object itself; calling train_model(session) here would
# run the training locally and pass its return value to sproc instead.
# The return type is given explicitly (the procedure returns the report text),
# and the session is passed explicitly because this notebook opens more than
# one session.
train_model_sp = f.sproc(
    train_model,
    return_type=StringType(),
    input_types=[],
    replace=True,
    session=session,
)

# Run the stored procedure; the cell output is the classification report.
train_model_sp(session=session)

In [1]:
#!pip install cachetools

Collecting cachetools
  Downloading cachetools-5.2.0-py3-none-any.whl (9.3 kB)
Installing collected packages: cachetools
Successfully installed cachetools-5.2.0
[notice] A new release of pip available: 22.2.2 -> 22.3
[notice] To update, run: pip install --upgrade pip
