In [None]:
!pip install fosforml

## Sklearn Models Registration

### session creation

In [None]:
from fosforml.model_manager.snowflakesession import get_session
session = get_session()

### modeling

#### Multiclass classification

In [None]:
# Import the estimator from scikit-learn's public package; `sklearn.ensemble._forest`
# is a private module and its path is not part of the supported API.
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd

# Dataset used by the multiclass example cells that follow.
m_dataset = load_iris()


In [None]:

# Fixed seed so the split and the forest are reproducible after a kernel restart.
RANDOM_STATE = 42

# Build identifier-style column names: spaces become underscores, parentheses are
# dropped, and the result is upper-cased (e.g. "SEPAL_LENGTH_CM").
feature_names = m_dataset.feature_names
u_features = [
    name.replace(" ", "_").replace("(", "").replace(")", "")
    for name in feature_names
]
m_df = pd.DataFrame(m_dataset.data, columns=u_features)
m_df.columns = m_df.columns.str.upper()

# The label is appended last so that `iloc[:, :-1]` below selects exactly the features.
m_df["Target"] = m_dataset.target

x_train, x_test, y_train, y_test = train_test_split(
    m_df.iloc[:, :-1],
    m_df["Target"],
    random_state=RANDOM_STATE,
)
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(x_train, y_train)

# Predictions for the held-out rows, wrapped in a one-column frame named "Predicted".
y_pred = pd.DataFrame(model.predict(x_test), columns=["Predicted"])

In [None]:
x_train

In [None]:
from fosforml import register_model
register_model(
  model_obj=model,
  session=session,
  x_train=x_train,
  y_train=y_train,
  x_test=x_test,
  y_test=y_test,
  y_pred=y_pred,
  dataset_name="RandomMulticlassMOdelForTesting",
  dataset_source="Snowflake",
  name="Mahesh_sklearn_28Aug",
  description="This is a test sklearn model",
  flavour="sklearn",
  model_type="classification",
  conda_dependencies=["scikit-learn==1.3.2"]
)

In [None]:
x_test_df = session.create_dataframe(x_test)

In [None]:
from fosforml.model_manager import DatasetManager

# `x_train_df` is not defined by any preceding cell (only `x_test_df` is), so build the
# Snowpark DataFrame for the training features here before uploading it.
x_train_df = session.create_dataframe(x_train)

dataset_manager = DatasetManager(model_name="MODEL_5AABC4FE_FC3D_4121_9E11_52C83C9FECC9_FDC_RANDOMMULTICLASSMODELFORTESTING", version_name="v2", session=session)
dataset_manager.upload_datasets(session=session, datasets={"x_train": x_train_df})

#### Custom Model using score_func

In [None]:
from snowflake.ml.model import custom_model
import pandas as pd

In [None]:
class CustomTestModel(custom_model.CustomModel):
    """Custom model whose ``predict`` delegates to a scoring function held in its context.

    ``predict`` looks up two entries in the context by name: ``'model'`` and
    ``'feature_preproc'``; the latter is called as ``feature_preproc(model, input_data)``.
    """

    def __init__(self, context: custom_model.ModelContext) -> None:
        # The object handed in is a ModelContext (this notebook builds one with
        # `custom_model.ModelContext(...)`), not another CustomModel.
        super().__init__(context)

    @custom_model.inference_api
    def predict(self, input_data: pd.DataFrame) -> pd.DataFrame:
        """Return whatever the context's ``'feature_preproc'`` entry yields for ``input_data``."""
        score_func = self.context.model_ref('feature_preproc')
        model = self.context.model_ref('model')
        return score_func(
            model,
            input_data
        )

In [None]:
def feature_preproc(model, input_data):
    """Call ``model.predict`` on ``input_data`` and wrap the result in a DataFrame.

    Returns:
        A pandas DataFrame with a single column named ``"Predicted"``.
    """
    predictions = model.predict(input_data)
    result = pd.DataFrame(predictions, columns=["Predicted"])
    return result

mc = custom_model.ModelContext(
    models={
        'model': model,
        'feature_preproc': feature_preproc
    }
)

In [None]:
custom_test_model = CustomTestModel(mc)

In [None]:
custom_model_predicted = custom_test_model.predict(x_test)

In [None]:
custom_model_predicted.head()

In [None]:
# custom_model_predicted

In [None]:
# NOTE(review): earlier in this notebook `snowflakesession` is used as a module
# (`from fosforml.model_manager.snowflakesession import get_session`), so calling it
# directly here looks wrong — confirm the intended class/function name.
from fosforml.model_manager import snowflakesession
session_instance = snowflakesession()
# NOTE(review): connection parameters may contain credentials — avoid leaving them
# rendered in saved cell output.
params = session_instance.connection_params

In [None]:
params

In [None]:
from snowflake.ml.registry import Registry

In [None]:
# Registry bound to the current session; no database/schema override is passed.
# To target a specific location, add `database_name=` / `schema_name=` here.
model_reg = Registry(session=session)

In [None]:
m = model_reg.get_model('MODEL_8D3FB8A6_A886_4236_9B35_73EB1303C4BF_FDC_MAHESH_SKLEARN_28AUG')

In [None]:
mv = m.version('V1')

In [None]:
from snowflake.snowpark.session import Session
from snowflake.ml.registry.registry import Registry
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from snowflake.snowpark.functions import col, when
from snowflake.snowpark.types import StringType, DateType, BooleanType
import re

In [75]:
def get_valid_dataset_name(s):
    """Return ``s`` as an identifier, wrapped in double quotes when it needs them.

    A name is quoted when it starts with a digit or ``$``, contains a lowercase
    letter or whitespace, or contains any character other than word characters
    and ``$``. Names that need no quoting are returned unchanged.

    Args:
        s: Raw dataset/table name.

    Returns:
        The name, double-quoted if required. Double quotes inside the name are
        escaped by doubling them, and a name that is already wrapped in double
        quotes is returned as-is so it is not quoted twice.
    """
    # Already a quoted identifier: wrapping it again would yield `""name""`.
    if len(s) >= 2 and s.startswith('"') and s.endswith('"'):
        return s
    pattern = r'^[\d$]|[a-z\s]|[^\w$]'
    if re.search(pattern, s):
        # Inside a quoted identifier a literal double quote is written as two.
        return '"' + s.replace('"', '""') + '"'
    return s

In [73]:
def append_prediction_cols(original_df, predict_df, cols):
    """Copy the named columns from ``predict_df`` onto ``original_df``.

    ``original_df`` is modified in place and also returned. Values are assigned
    column by column, so pandas aligns them on the two frames' indexes.

    Args:
        original_df: pandas DataFrame that receives the columns.
        predict_df: pandas DataFrame that provides the columns.
        cols: Iterable of column names to copy.

    Returns:
        ``original_df`` (the same object) with the extra columns set.
    """
    for name in cols:
        source_column = predict_df[name]
        original_df[name] = source_column
    return original_df

In [66]:
# Score the inference table with the registered model version, then copy only the
# newly produced prediction columns onto a pandas copy of the input table.
original_data = session.table('Iris_inference_dataset')
remote_prediction = mv.run(original_data, function_name="predict")
original_data_df = original_data.to_pandas()
remote_prediction_df = remote_prediction.to_pandas()
# Columns present in the prediction output but absent from the input are the model outputs.
predict_columns = [col for col in remote_prediction_df.columns if col not in original_data_df.columns]
print(predict_columns)
# NOTE(review): the two frames come from separate to_pandas() calls and are combined by
# pandas index — confirm both results are returned in the same row order.
original_data_df = append_prediction_cols(original_data_df, remote_prediction_df, predict_columns)

['output_feature_0']


In [67]:
type(original_data_df)

pandas.core.frame.DataFrame

In [69]:
metrics = mv.show_metrics()
if metrics['model_type'].upper() == 'CLASSIFICATION':
    proba_prediction = mv.run(original_data, function_name="predict_proba")
    proba_prediction_df = proba_prediction.to_pandas()
    proba_columns = [col for col in proba_prediction_df.columns if col not in original_data_df.columns]
    print(proba_columns)
    # original_data_df = append_prediction_cols(original_data_df, proba_prediction_df, proba_columns)

['output_feature_1', 'output_feature_2']


In [70]:
original_data_df = append_prediction_cols(original_data_df, proba_prediction_df, proba_columns)

In [63]:
original_data_df['output_feature_1'] = proba_prediction_df['output_feature_1']

In [None]:
remote_prediction = remote_prediction.to_pandas()

In [None]:
# `remote_prediction` was rebound to a pandas DataFrame by the preceding cell, so use
# pandas column selection; Snowpark's `.select(col(...)).to_pandas()` no longer applies.
proba_col = remote_prediction[['output_feature_0']]

In [None]:
proba_col

In [72]:
def batch_prediction(session, model_id, version_id, input_table, filter_cond='', output_table=''):
    """Run a registered model version over a table and persist the scored rows.

    The scored data is written to ``("PREDICTION_" + output_table).upper()`` for
    every model type. Probability columns from ``predict_proba`` are appended
    only when the model's metrics report ``model_type`` "classification".

    Args:
        session: Snowpark session used for the registry lookup, table reads and the write.
        model_id: Name of the model in the registry.
        version_id: Name of the model version to run.
        input_table: Name of the table to score (normalised with
            ``get_valid_dataset_name``). Only the "EXPERIMENT" path reads it; see
            the review note in the other branch.
        filter_cond: Accepted but not used by this function.
        output_table: Suffix of the destination table name.

    Returns:
        ``output_table`` when the metrics' ``source`` is "EXPERIMENT"; otherwise
        the pandas DataFrame holding the input rows plus the prediction columns.
    """
    reg = Registry(session=session)
    m = reg.get_model(model_id)
    mv = m.version(version_id)
    metrics = mv.show_metrics()
    source = metrics['source']
    input_table = get_valid_dataset_name(input_table)

    table_prefix = "PREDICTION_"
    new_table_name = (table_prefix + output_table).upper()

    if source.upper() == 'EXPERIMENT':
        target_column = metrics['dataset_details'][0].get('target_column')
        original_data = session.table(input_table)
        data = apply_data_cleansing(original_data)
        numerical_features, le_column_features, oh_column_features = get_feature_columns(data, target_column)
        data = create_and_run_preprocessing(data, numerical_features, le_column_features, oh_column_features)
        remote_prediction = mv.run(data, function_name="predict")
        # Attach the predictions to the original (un-preprocessed) rows.
        prediction_column = remote_prediction.select(col('PREDICTIONS')).to_pandas()
        original_data = original_data.to_pandas()
        original_data['PREDICTIONS'] = prediction_column['PREDICTIONS']

        if metrics['model_type'].upper() == 'CLASSIFICATION':
            proba_prediction = mv.run(data, function_name="predict_proba")
            proba_prediction = proba_prediction.to_pandas()
            proba_columns = [name for name in proba_prediction.columns if name not in data.columns]
            original_data = append_prediction_cols(original_data, proba_prediction, proba_columns)

        # Persist and return for every model type; previously this sat inside the
        # classification branch, so other models wrote nothing and returned None.
        session.write_pandas(original_data, new_table_name, auto_create_table=True, overwrite=True)
        return output_table

    else:
        # NOTE(review): this path reads a hard-coded table and ignores `input_table`.
        # Switching to `input_table` changes case-sensitivity, because
        # get_valid_dataset_name quotes mixed-case names — confirm how the table
        # was created before replacing the literal.
        original_data = session.table('Iris_inference_dataset')
        remote_prediction = mv.run(original_data, function_name="predict")
        original_data_df = original_data.to_pandas()
        remote_prediction_df = remote_prediction.to_pandas()
        print(remote_prediction_df.head())
        # Columns that exist only in the prediction output are the model outputs.
        predict_columns = [name for name in remote_prediction_df.columns if name not in original_data_df.columns]
        original_data_df = append_prediction_cols(original_data_df, remote_prediction_df, predict_columns)

        if metrics['model_type'].upper() == 'CLASSIFICATION':
            proba_prediction = mv.run(original_data, function_name="predict_proba")
            proba_prediction_df = proba_prediction.to_pandas()
            proba_columns = [name for name in proba_prediction_df.columns if name not in original_data_df.columns]
            original_data_df = append_prediction_cols(original_data_df, proba_prediction_df, proba_columns)

        # Same fix as above: write and return regardless of model type.
        session.write_pandas(original_data_df, new_table_name, auto_create_table=True, overwrite=True)
        return original_data_df

In [76]:
op = batch_prediction(session, 'MODEL_8D3FB8A6_A886_4236_9B35_73EB1303C4BF_FDC_MAHESH_SKLEARN_28AUG', 'V1', 'Iris_inference_dataset', '', '28282828')


'28282828'

In [None]:
remote_prediction.head()

In [None]:
remote_prediction = mv.run(x_test, function_name="predict_proba")

In [None]:
remote_prediction.head()

In [78]:
original_data = session.table('Iris_inference_dataset')
remote_prediction_1 = mv.run(original_data, function_name="predict")

In [None]:
x_test_df.write.save_as_table("Iris_inference_dataset", mode="overwrite")

In [79]:
remote_prediction_1.show()

----------------------------------------------------------------------------------------------------
|"SEPAL_LENGTH_CM"  |"SEPAL_WIDTH_CM"  |"PETAL_LENGTH_CM"  |"PETAL_WIDTH_CM"  |"output_feature_0"  |
----------------------------------------------------------------------------------------------------
|6.2                |2.2               |4.5                |1.5               |1                   |
|7.2                |3.6               |6.1                |2.5               |2                   |
|6.8                |3.0               |5.5                |2.1               |2                   |
|5.4                |3.4               |1.7                |0.2               |0                   |
|6.0                |2.9               |4.5                |1.5               |1                   |
|6.0                |3.4               |4.5                |1.6               |1                   |
|5.8                |2.7               |5.1                |1.9               |2           

In [None]:
custom_test_model.context.model_refs.keys()

In [None]:
!python -c "import platform;print(platform.python_version())"

In [None]:
model_reg.log_model(
       model=custom_test_model,
       model_name="custom_test_model_feature_pre",
       version_name="custom_test_model_v1",
       comment="Testing score function with custom_test_model",
       conda_dependencies=['scikit-learn==1.3.2'],
       metrics={},
       sample_input_data=m_df.drop('Target',axis=1),
       python_version="3.10.13"
)

In [None]:
from snowflake.ml.modeling.metrics import (confusion_matrix,
                                        accuracy_score,
                                        f1_score, recall_score,
                                        precision_score,
                                        log_loss ,
                                        roc_auc_score ,
                                        roc_curve)

In [None]:
# NOTE(review): `self` is not defined at notebook scope in the visible cells, so this
# call raises NameError as written — it looks copied from a class method. Replace
# `self.sf_df`, `self.true_cn` and `self.pred_cn` with the actual DataFrame and its
# label / score column names before running.
roc_auc_score(df=self.sf_df,y_true_col_names=self.true_cn, y_score_col_names=self.pred_cn)

In [None]:
from fosforml import register_model
register_model(
  model_obj=custom_test_model,
  session=session,
  x_train=x_train,
  y_train=y_train,
  x_test=x_test,
  y_test=y_test,
  y_pred=y_pred,
  dataset_name="Snowflake_dataset",
  dataset_source="Snowflake",
  name="SklearnMulitClassModel",
  description="This is a test sklearn model",
  flavour="sklearn",
  model_type="classification",
  conda_dependencies=["scikit-learn==1.3.2"]
)

#### Binary Classification

In [None]:
# Public import path; `sklearn.ensemble._forest` is a private module.
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd

# Fixed seed so the split and the forest are reproducible after a kernel restart.
RANDOM_STATE = 42

m_dataset = load_breast_cancer()
feature_names = m_dataset.feature_names
# Replace spaces so the feature names can be used as column identifiers.
u_features = [feature.replace(" ", "_") for feature in feature_names]
m_df = pd.DataFrame(m_dataset.data, columns=u_features)
# The label is appended last so that `iloc[:, :-1]` below selects exactly the features.
m_df["Target"] = m_dataset.target

x_train, x_test, y_train, y_test = train_test_split(
    m_df.iloc[:, :-1],
    m_df["Target"],
    random_state=RANDOM_STATE,
)
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(x_train, y_train)

# Predictions for the held-out rows, wrapped in a one-column frame named "Predicted".
y_pred = pd.DataFrame(model.predict(x_test), columns=["Predicted"])

In [None]:
session

In [None]:
from fosforml import register_model
register_model(
  model_obj=model,
  session=session,
  x_train=x_train,
  y_train=y_train,
  x_test=x_test,
  y_test=y_test,
  y_pred=y_pred,
  dataset_name="Binary_Model",
  dataset_source = "SNOWFLAKE_STAGE",
  name="SklearnBinaryModel",
  description="This is a test sklearn model",
  flavour="sklearn",
  model_type="classification",
  conda_dependencies=["scikit-learn==1.3.2"]
)

#### Regression

In [None]:
# Public import path; `sklearn.ensemble._forest` is a private module.
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import pandas as pd

# Fixed seed so the split and the forest are reproducible after a kernel restart.
RANDOM_STATE = 42

m_dataset = load_diabetes()
feature_names = m_dataset.feature_names
# Replace spaces so the feature names can be used as column identifiers.
u_features = [feature.replace(" ", "_") for feature in feature_names]

m_df = pd.DataFrame(m_dataset.data, columns=u_features)
# The target is appended last so that `iloc[:, :-1]` below selects exactly the features.
m_df["Target"] = m_dataset.target

x_train, x_test, y_train, y_test = train_test_split(
    m_df.iloc[:, :-1],
    m_df["Target"],
    random_state=RANDOM_STATE,
)
model = RandomForestRegressor(random_state=RANDOM_STATE)
model.fit(x_train, y_train)

# Predictions for the held-out rows, wrapped in a one-column frame named "Predicted".
y_pred = pd.DataFrame(model.predict(x_test), columns=["Predicted"])

In [None]:
from fosforml import register_model
register_model(
  name="SklearnReyhtyjhgressionModel",
  model_obj=model,
  session=session,
  x_train=x_train,
  y_train=y_train,
  x_test=x_test,
  y_test=y_test,
  y_pred=y_pred,
  dataset_name="RegressionModel",
  dataset_source = "SNOWFLAKE_STAGE",
  description="This is a test sklearn model",
  flavour="sklearn",
  model_type="regression",
  conda_dependencies=["scikit-learn==1.3.2"]
)