## Notebook Index
1. [Feature Store ](https://app.snowflake.com/sfpscogs/rpegu_aiml/#/notebooks/ML_MODELS.DS.%2201_FeatureStore_Creation%22)
2. [Feature Reduction ](https://app.snowflake.com/sfpscogs/rpegu_aiml/#/notebooks/ML_MODELS.DS.%2202_Feature_Reduction%22)
3. Model Training 👈
4. [Model Inference & Scheduling](https://app.snowflake.com/sfpscogs/rpegu_aiml/#/notebooks/ML_MODELS.DS.%2204_Batch_Inferencing%22)


## Notebook Overview

In this notebook, we will train machine learning models on a reduced dataset. The reduction process involves removing features that are either highly correlated or exhibit very low variance (less than 0.1).

We will then perform the following preprocessing steps:
* Convert categorical variables into numerical format using one-hot encoding
* Apply Min-Max scaling to rescale numerical features to the [0, 1] range

After preprocessing, we will experiment with several modeling techniques, including:
* XGBoost
* LightGBM
* Random Forest

Model tuning will be performed using grid search (`GridSearchCV`), and all training will be executed using Snowflake ML.


In [None]:
# Import python packages
import streamlit as st
import pandas as pd

# Snowpark ML
import snowflake.snowpark.functions as F
from snowflake.ml.modeling.pipeline import Pipeline 
from snowflake.ml.modeling.preprocessing import MinMaxScaler , OneHotEncoder
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.lightgbm import LGBMClassifier

from snowflake.ml.modeling.ensemble import RandomForestClassifier
from snowflake.ml.modeling.model_selection import GridSearchCV, RandomizedSearchCV
from snowflake.ml.registry import Registry
from snowflake.ml._internal.utils import identifier

# snowpark ML metrics
from snowflake.ml.modeling.metrics import accuracy_score,f1_score,precision_score,roc_auc_score,roc_curve,recall_score

from snowflake.snowpark.types import DecimalType, DoubleType, StringType

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark.version import VERSION
import joblib
# Grab the currently active Snowpark session; every cell below talks to
# Snowflake through this object.
session = get_active_session()


In [None]:

# --- Databases -------------------------------------------------------------
input_database = 'ML_MODELS'
working_database = 'ML_MODELS'

# --- Schemas ---------------------------------------------------------------
input_schema = 'DS'
working_schema = 'DS'
fs_schema = 'FEATURE_STORE'
model_registry_schema = 'ML_REGISTRY'

# --- Stage that holds serialized model artifacts ---------------------------
stage_name = 'MODEL_OBJECT'
stage = "@" + ".".join((working_database, working_schema, stage_name))

# --- Warehouses ------------------------------------------------------------
warehouse = 'DS_W'
snowpark_opt_warehouse = 'SNOWPARK_OPT_WH'

# Switch the session to the standard warehouse and the data-science role
# before running any query.
session.use_warehouse(warehouse)
session.use_role('FR_SCIENTIST')

# One round trip returns both the connected user and the server version.
snowflake_environment = session.sql('SELECT current_user(), current_version()').collect()
snowpark_version = VERSION

# Summary of the session the rest of the notebook will run in.
print('\nConnection Established with the following parameters:')
print(f'User                        : {snowflake_environment[0][0]}')
print(f'Role                        : {session.get_current_role()}')
print(f'stage                        : {stage}')

print(f'Database                    : {session.get_current_database()}')
print(f'Schema                      : {session.get_current_schema()}')
print(f'Warehouse                   : {session.get_current_warehouse()}')
print(f'Snowflake version           : {snowflake_environment[0][1]}')
print(f'Snowpark for Python version : {snowpark_version[0]}.{snowpark_version[1]}.{snowpark_version[2]}')

In [None]:
from snowflake.ml import dataset

# Load version "V_1" of the Snowflake Dataset "Reduced_Dataset" and expose it
# as a Snowpark DataFrame.
ds = dataset.load_dataset(session, "Reduced_Dataset", "V_1")
df = ds.read.to_snowpark_dataframe()

# Identifier / label / reference-month columns: never used as model features.
excluded = ['MEMBER_ID', 'TARGET', 'REF_MMYY']

# Categorical features: every string-typed column that is not excluded.
string_columns = [field.name for field in df.schema.fields if isinstance(field.datatype, StringType)]
cat_cols = [col for col in string_columns if col not in excluded]

# Numerical features: everything that is neither excluded nor categorical.
# The explicit CAT_1..CAT_5 list is kept so those columns stay out of the
# scaler even if they are not stored as strings; string columns detected from
# the schema are excluded as well, so a newly added categorical column can
# never end up in the Min-Max scaler by accident.
known_categoricals = ['CAT_1', 'CAT_2', 'CAT_3', 'CAT_4', 'CAT_5']
non_numeric = set(excluded) | set(known_categoricals) | set(cat_cols)
num_cols = [col for col in df.columns if col not in non_numeric]

def fix_values(columnn):
    """Build a column expression that normalises the values of `columnn`.

    Every run of characters other than ASCII letters and digits is replaced
    by a single underscore, and the result is upper-cased.
    """
    sanitized = F.regexp_replace(F.col(columnn), '[^a-zA-Z0-9]+', '_')
    return F.upper(sanitized)


# Normalise each categorical column in place (same column name, cleaned values).
for cat_name in cat_cols:
    df = df.with_column(cat_name, fix_values(cat_name))


In [None]:
#df.select('CAT_1','CAT_2','CAT_3','CAT_4','CAT_5')

In [None]:
-- Internal stage for serialized model artifacts. The name is fully qualified
-- so it always matches the @ML_MODELS.DS.MODEL_OBJECT location used by the
-- Python cells, whatever the session's current database/schema is.
CREATE STAGE IF NOT EXISTS ML_MODELS.DS.MODEL_OBJECT ENCRYPTION = (TYPE = 'SNOWFLAKE_SSE');

In [None]:
# Preview the dataset after the categorical values have been normalised.
df.show()

In [None]:

# Two-step preprocessing pipeline (not fitted here):
#   1. "OHE" - one-hot encode the categorical columns; the raw inputs are
#      dropped and categories unseen at fit time are ignored at transform time.
#   2. "MMS" - Min-Max scale the numerical columns in place, clipping
#      transformed values to the scaler's range.
preprocessing_pipeline = Pipeline(
    steps=[
        (
            "OHE",
            OneHotEncoder(
                input_cols=cat_cols,
                output_cols=cat_cols,
                drop_input_cols=True,
                drop="first",
                handle_unknown="ignore",
            ),
        ),
        (
            "MMS",
            MinMaxScaler(
                clip=True,
                input_cols=num_cols,
                output_cols=num_cols,
            ),
        ),
    ]
)

# Serialize the pipeline definition locally, then upload it to the stage.
joblib.dump(preprocessing_pipeline, 'preprocessing_pipeline.joblib')

# overwrite=True: without it, a file with the same name that is already on
# the stage is kept, so re-running this cell after changing the pipeline
# would silently leave the stale version in place.
# auto_compress=False keeps the plain ".joblib" name that the download cell
# expects.
session.file.put(
    'preprocessing_pipeline.joblib',
    stage,
    auto_compress=False,
    overwrite=True,
)

In [None]:
# Fetch the serialized preprocessing pipeline back from the stage.
#
# The file was uploaded uncompressed, so it is downloaded as
# preprocessing_pipeline.joblib into the local /tmp directory and then
# deserialized with joblib.
PIPELINE_FILE = '/tmp/preprocessing_pipeline.joblib'
session.file.get(f'{stage}/preprocessing_pipeline.joblib', '/tmp')

preprocessing_pipeline = joblib.load(PIPELINE_FILE)

## Training and Validation Strategy
For model training, we will use data where **REF_MMYY = '042025'**, and for validation, we will use data with **REF_MMYY = '052025'**. The training dataset will be further split into training and test sets using an 80/20 random split to evaluate model performance.

In [None]:
# Label column and the reference month (MMYYYY) whose rows are used for training.
label_col = 'TARGET'
train_yrmo = '042025'  # this can be made a variable / notebook parameter

# Keep only the training month; REF_MMYY is dropped once it has served as filter.
is_training_month = F.col("REF_MMYY") == train_yrmo
df = df.filter(is_training_month).drop("REF_MMYY")

# 80/20 random split into train and test sets; the fixed seed keeps it repeatable.
train_df, test_df = df.random_split(weights=[0.80, 0.20], seed=62)

def get_features(df, label_col: str):
    """Return the feature column names of `df`, in their original order.

    A column counts as a feature unless it is 'MEMBER_ID', 'REF_MMYY' or the
    label column given by `label_col`.
    """
    non_features = ('MEMBER_ID', 'REF_MMYY', label_col)
    features = []
    for name in df.columns:
        if name in non_features:
            continue
        features.append(name)
    return features



In [None]:
# Fitting and transforming run on the Snowpark-optimized warehouse.
session.use_warehouse(snowpark_opt_warehouse)

# Feature count before any encoding / scaling is applied.
FEATURE_COLS = get_features(train_df, label_col)
print(f"Total featurs before preprocessing {len(FEATURE_COLS )} ")

# Fit the preprocessing pipeline on the training split only, then apply the
# fitted pipeline to both splits so the test set is transformed with the
# parameters learned from the training data.
train_df = preprocessing_pipeline.fit(train_df).transform(train_df)
test_df = preprocessing_pipeline.transform(test_df)

# Report the feature count AFTER preprocessing. The ID and label columns are
# excluded via get_features so the number is comparable with the one printed
# above (the previous message said "before" and counted every column).
print(f"Total features after preprocessing {len(get_features(train_df, label_col))} ")
print(f"Total rows : {train_df.count()}")



In [None]:
# Preview the training split after the preprocessing pipeline was applied.
train_df.show()

## XGBOOST CLASSIFIER

In [None]:
# Columns for the XGBoost run: model inputs (taken from the preprocessed
# training frame), the prediction output column and the label column.
FEATURE_COLS = get_features(
    train_df,
    label_col,
)
OUTPUT_COLUMNS = "PREDICTED_TARGET"
label_col = 'TARGET'


In [None]:
# Baseline XGBoost classifier with default hyperparameters.
XGB_Classifier = XGBClassifier(
    input_cols=FEATURE_COLS,
    label_cols=label_col,
    output_cols=OUTPUT_COLUMNS,
)

# Train on the preprocessed training split.
XGB_Classifier.fit(train_df)

# Accuracy on the training data itself; compare it with the test metrics
# computed in the next cell to spot overfitting.
predict_on_training_data = XGB_Classifier.predict(train_df)

training_accuracy = accuracy_score(
    df=predict_on_training_data,
    y_true_col_names=["TARGET"],
    y_pred_col_names=["PREDICTED_TARGET"],
)
# Previously this value was computed and then discarded; report it.
print(f"Training accuracy for the xgboost model : {training_accuracy}")

# Predictions on the held-out test split, evaluated in the next cell.
result = XGB_Classifier.predict(test_df)


In [None]:
from snowflake.ml.modeling.metrics import confusion_matrix

# Scalar scorers, all called with the same arguments on the test predictions.
_scorers = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1_score": f1_score,
}

# Test-set metrics for the XGBoost model, keyed by metric name.
metrics = {
    metric_name: scorer(
        df=result,
        y_true_col_names="TARGET",
        y_pred_col_names="PREDICTED_TARGET",
    )
    for metric_name, scorer in _scorers.items()
}

# The confusion matrix is stored as a plain nested list.
metrics["confusion_matrix"] = confusion_matrix(
    df=result,
    y_true_col_name="TARGET",
    y_pred_col_name="PREDICTED_TARGET",
).tolist()

print(f" The Score for the xgboost model :\n {metrics}")


In [None]:
# A small sample of feature rows, handed to the registry so it can derive the
# model's input schema.
X = train_df.select(FEATURE_COLS).limit(100)

# Location of the model registry.
db = working_database
schema = model_registry_schema

# Open the registry in that database / schema.
reg = Registry(
    session=session,
    database_name=db,
    schema_name=schema,
)

model_name = "ML_XGBOOST_MODEL"
version_name = "v1"

# Log the baseline XGBoost model together with its test-set metrics.
mv = reg.log_model(
    model=XGB_Classifier,
    model_name=model_name,
    version_name=version_name,
    sample_input_data=X,  # provides the feature schema
    metrics=metrics,
)

# Attach a human-readable description to the logged version.
mv.comment = (
    "This is the first iteration of our ml poc  model. \n"
    "It is used for demo purposes and it is simple xgboost model."
)

# Confirm the version was added.
reg.get_model(model_name).show_versions()

## RANDOM FOREST 

In [None]:
# Columns for the random forest run: model inputs, prediction output column
# and label column.
FEATURE_COLS = get_features(train_df, label_col)
OUTPUT_COLUMNS = "PREDICTED_TARGET"
label_col = 'TARGET'

# Baseline random forest classifier with default hyperparameters.
RandomForest = RandomForestClassifier(
    input_cols=FEATURE_COLS,
    label_cols=label_col,
    output_cols=OUTPUT_COLUMNS,
)

# Train on the preprocessed training split.
RandomForest.fit(train_df)

# Accuracy on the training data itself; compare it with the test metrics
# below to spot overfitting.
predict_on_training_data = RandomForest.predict(train_df)

training_accuracy = accuracy_score(
    df=predict_on_training_data,
    y_true_col_names=["TARGET"],
    y_pred_col_names=["PREDICTED_TARGET"],
)
# Previously this value was computed and then discarded; report it.
print(f"Training accuracy for the random forest model : {training_accuracy}")

# Predictions on the held-out test split.
result = RandomForest.predict(test_df)

# Test-set metrics for the random forest model.
metrics = {
    "accuracy": accuracy_score(
        df=result,
        y_true_col_names="TARGET",
        y_pred_col_names="PREDICTED_TARGET",
    ),
    "precision": precision_score(
        df=result,
        y_true_col_names="TARGET",
        y_pred_col_names="PREDICTED_TARGET",
    ),
    "recall": recall_score(
        df=result,
        y_true_col_names="TARGET",
        y_pred_col_names="PREDICTED_TARGET",
    ),
    "f1_score": f1_score(
        df=result,
        y_true_col_names="TARGET",
        y_pred_col_names="PREDICTED_TARGET",
    ),
    # Stored as a plain nested list.
    "confusion_matrix": confusion_matrix(
        df=result,
        y_true_col_name="TARGET",
        y_pred_col_name="PREDICTED_TARGET",
    ).tolist(),
}

# The message used to say "xgboost" although these are the random forest scores.
print(f" The Score for the random forest model :\n {metrics}")

# Register the model.
model_name = "ML_RANDOMFOREST_MODEL"
version_name = "v1"

# Log the random forest model together with its test-set metrics.
mv = reg.log_model(
    model_name=model_name,
    version_name=version_name,
    model=RandomForest,
    metrics=metrics,
    sample_input_data=X,  # provides the feature schema
)

# Attach a human-readable description to the logged version.
mv.comment = (
    "This is the first iteration of poc  random forest model. \n"
    "It is used for demo purposes and it is simple random_forest model."
)

# Confirm the version was added.
reg.get_model(model_name).show_versions()


## HYPERPARAMETER TUNING

We will perform a **grid search** using **Snowflake ML** to find the optimal hyperparameters for our model. Grid search systematically tests combinations of hyperparameter values to identify the configuration that yields the best model performance.

This process will be executed within Snowflake using its built-in machine learning capabilities, ensuring that all computation stays close to the data and leverages the scalability of the Snowflake platform.


In [None]:
# Columns for the grid search run: model inputs, prediction output column and
# label column.
FEATURE_COLS = get_features(train_df, label_col)
OUTPUT_COLUMNS = "PREDICTED_TARGET"
label_col = 'TARGET'

# Hyperparameter grid: 3 x 3 x 2 x 2 = 36 candidate configurations.
parameters = {
    "n_estimators": [100, 300, 500],
    "learning_rate": [0.1, 0.3, 0.5],
    "max_depth": [3, 4],
    "min_child_weight": [3, 4],
}

# Number of cross-validation folds per candidate.
n_folds = 5

# Base estimator whose hyperparameters are searched.
estimator = XGBClassifier()

GridSearch_clf = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    cv=n_folds,
    input_cols=FEATURE_COLS,
    label_cols=label_col,
    output_cols=OUTPUT_COLUMNS,
)

# Run the search on the training split, then score the held-out test split.
GridSearch_clf.fit(train_df)
result = GridSearch_clf.predict(test_df)

# Show the best estimator found by the search.
print(GridSearch_clf.to_sklearn().best_estimator_)


In [None]:
# Scalar scorers, all called with the same arguments on the test predictions.
_scorers = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1_score": f1_score,
}

# Test-set metrics for the grid search predictions, keyed by metric name.
metrics = {
    metric_name: scorer(
        df=result,
        y_true_col_names="TARGET",
        y_pred_col_names="PREDICTED_TARGET",
    )
    for metric_name, scorer in _scorers.items()
}

# The confusion matrix is stored as a plain nested list.
metrics["confusion_matrix"] = confusion_matrix(
    df=result,
    y_true_col_name="TARGET",
    y_pred_col_name="PREDICTED_TARGET",
).tolist()


In [None]:


# Log the optimal model found by GridSearchCV.
model_name = "ML_GRIDSEARCH_POC_MODEL"
version_name = "v2"

# The model to register is the best estimator found by the search. The
# previous code passed `XGB_Classifier` (the untuned baseline) here, so the
# registry entry paired the grid-search metrics with the wrong model.
best_model = GridSearch_clf.to_sklearn().best_estimator_

mv = reg.log_model(
    model_name=model_name,
    version_name=version_name,
    model=best_model,
    metrics=metrics,  # evaluation metrics of the grid-search predictions
    sample_input_data=X,  # provides the feature schema
)

# Attach a human-readable description. Adjacent string literals are used
# instead of a backslash continuation, which embedded the next line's
# indentation (a long run of spaces) in the stored comment.
mv.comment = (
    "This is the first iteration of our POC model "
    "where we performed hyperparameter optimization with Grid Search."
)