# Package Installation for Snowflake Connection and Data Processing

In [None]:
!pip install  dask[complete]  snowflake  snowflake-connector-python snowflake-snowpark-python snowflake-snowpark-python[pandas] seaborn matplotlib numpy pandas scikit-learn  fosforml plotly

# Import necessary libraries

In [1]:
# Importing general libraries
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
import seaborn as sns  # For data visualization
from datetime import datetime  # For date and time manipulation
import matplotlib.pyplot as plt  # For plotting graphs

# Importing Snowflake session management
from snowflake.snowpark.session import Session  # For Snowflake integration

# Importing advanced plotting libraries
from plotly.subplots import make_subplots  # For creating subplots in Plotly
import plotly.graph_objects as go  # For creating interactive visualizations with Plotly
import plotly.express as px  # For simplified plotting using Plotly

# Setting Pandas display option to show more columns in the output
pd.set_option('display.max_columns', 100)

# Importing libraries for machine learning and feature selection
from sklearn.ensemble import RandomForestClassifier  # For Random Forest model
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, precision_recall_curve, auc  # For model evaluation metrics
from sklearn.feature_selection import SelectKBest, f_classif  # For feature selection
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder  # For encoding categorical data and scaling
from sklearn.impute import SimpleImputer  # For handling missing values
from sklearn.compose import ColumnTransformer  # For applying different preprocessing steps to different columns
from sklearn.pipeline import Pipeline  # For creating machine learning pipelines
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV  # For splitting data and performing grid search for hyperparameter tuning

# Importing additional libraries for visualization and matplotlib handling
import matplotlib.pyplot as plt  # For plotting static visualizations

# Setting up environment for plotting using Seaborn and Matplotlib
sns.set(style="whitegrid")  # Setting the style for Seaborn plots


# This section connects to Snowflake using fosforml's Snowflake session manager, retrieves data from a specified Snowflake table, and loads the data into a Pandas DataFrame for further processing and modeling.

In [2]:

# Importing the get_session function from fosforml's Snowflake session manager
from fosforml.model_manager.snowflakesession import get_session

# Establishing a Snowflake session for executing queries and performing operations
my_session = get_session()

# Define the name of the Snowflake table to query
table_name = 'ORDER_DATA_TRAINING'

# Execute a SQL query to select all records from the specified table in Snowflake
df_sample = my_session.sql("select * from {}".format(table_name)).to_pandas()

# Filtering and Preparing the Training Dataset for Returned Status Analysis

In [3]:
# Import Dask for handling larger datasets efficiently
import dask.dataframe as dd

#  Dask DataFrame allows for efficient handling of large datasets and delayed execution
df_sample_dask = dd.from_pandas(df_sample, npartitions=4)  # Adjust npartitions based on data size and memory

#  We use Dask to filter the records based on the 'RETURNED_STATUS' column
df_train_dask = df_sample_dask[df_sample_dask['RETURNED_STATUS'].isin(['CANCELLED', 'RETURNED', 'DELIVERED', 'IN PROCESS'])]

#  Dask performs lazy computation, so we need to explicitly call .compute() to trigger the actual computation
df_train = df_train_dask.compute()

#  Display the count of each 'RETURNED_STATUS' to verify the filtering step
print(df_train['RETURNED_STATUS'].value_counts())


RETURNED_STATUS
CANCELLED     10000
DELIVERED     10000
IN PROCESS    10000
RETURNED      10000
Name: count, dtype: int64[pyarrow]


# Creating a Binary Target Variable for Classification

In [4]:
# If the 'RETURNED_STATUS' is 'CANCELLED' or 'RETURNED', assign 1, else assign 0
df_train['TARGET'] = df_train['RETURNED_STATUS'].apply(lambda x: 1 if x in ['CANCELLED', 'RETURNED'] else 0)


# This will give us an understanding of how many records are marked as 1 (CANCELLED/RETURNED) or 0 (DELIVERED/IN PROCESS)
target_counts = df_train['TARGET'].value_counts()

# Display the count of each target class
print(target_counts)


TARGET
1    20000
0    20000
Name: count, dtype: int64


In [5]:
df_train

Unnamed: 0,DIVISION_CODE,DIVISION_NAME,BRAND_CODE,BRAND_NAME,CLASS_CODE,CLASS_NAME,SELLING_CHANNEL,CHAIN,WEB_ORDER_NUMBER,OMS_ORDER_NUMBER,OMS_LINE_ITEM_ID,OMS_TICKET_ID,SKU_ID,QUANTITY,UNIT_PRICE,CURRENT_STATUS,CURRENT_STATUS_DESCRIPTION,TRANSACTION_DATE,SHIP_FROM_WAREHOUSE_CODE,SHIP_FROM_WAREHOUSE_DESCRIPTION,ORDER_DATE,READY_TO_PRINT_DATE,PRINT_TICKET_DATE,VERIFIED_SHIPPED_DATE,BACK_ORDERED_DATE,ORDER_AGE,GIFT_ARTICLE_FLAG,CARRIER_NAME,CARRIER_TRACKING_NUMBER,DROPSHIP_FLAG,ORDER_STATUS,ORDER_CREATION_DATE,ORDER_CONFIRMATION_DATE,WM_ORDER_ID,WM_ORDER_LINE_ID,WM_ORDER_STATUS,WM_PICKING_START_TIME,WM_PICKING_END_TIME,WM_PICKING_AGE,WM_PACKING_START_TIME,WM_PACKING_END_TIME,WM_PACKING_AGE,WM_CREATED_DATE,WM_UPDATED_DATE,WM_SHIPPED_DATE,WM_ORDER_AGE,STORE_ID,STORE_NAME,SHIP_METHOD_CODE,SHIP_METHOD_NAME,SHIP_METHOD_SERVICE,SHIPMENT_SLA,NEW_ORDER_DATE,RETURN_REASON,RETURN_FLAG,RECORD_DATE,RECORD_TIME,GROSS_SALES,RETURNED_STATUS,SHIPPING_DELAY,TARGET
0,34,Ladies Shoes,25611,VEJA,55,Sneakers,Online,Chain1,WC100004243065,52054403,52054403*1,52054403-1,301222652013,1,145.0,Cancelled,Cancelled,2024-03-15,6,STORES,2024-04-03,2024-03-15 13:00:00,NaT,NaT,,0,false,,,false,,NaT,NaT,,,,NaT,NaT,,NaT,NaT,,NaT,NaT,2024-02-03,,,,PAC3,PacMan Ground,GROUND,,2023-08-03,,,,,145,CANCELLED,False,1
1,44,Men's,12569,Givenchy,1,Shirts/Tops,Others,Chain1,STA000000000557916,52102974,52102974*1,52102974-1,301234392709,1,850.0,Cancelled,Cancelled,2024-03-22,5,Oakbrook,2024-07-11,2024-03-18 22:12:20,NaT,NaT,22-03-2024,4,false,FedEx,,false,,NaT,NaT,,,,NaT,NaT,,NaT,NaT,,NaT,NaT,2024-05-11,,,,ON,FedEx - Standard Overnight,OVERNIGHT,,2023-11-11,,,,,850,CANCELLED,False,1
2,53,Beauty,11778,Valentino,313,Gifts & Sets,Others,Chain1,STA000000000562029,52150523,52150523*2,52150523-1,301243441894,1,180.0,Cancelled,Cancelled,2024-03-22,3,Tampa Bay,2024-08-27,2024-03-22 13:00:00,NaT,NaT,,0,false,FedEx SmartPost,,false,,1900-01-01,1900-01-01,,******,Not in MAO,NaT,NaT,,NaT,NaT,,1900-01-01,1900-01-01,1900-01-01,-9999.0,,,FXPOS,FedEx SmartPost,GROUND,,2023-02-18,,,,,180,CANCELLED,False,1
3,81,Women's Designer RTW,11807,Etro,1,Shirts/Tops,Store_POS,Chain1,SP0020015704953032424,52180398,52180398*1,52180398-1,301234272278,1,436.0,Cancelled,Cancelled,2024-03-25,5,Oakbrook,2024-01-29,2024-03-24 12:21:21,NaT,NaT,,1,false,,,false,,NaT,NaT,,,,NaT,NaT,,NaT,NaT,,NaT,NaT,2023-11-29,,,,,,GROUND,,2023-05-29,,,,,436,CANCELLED,False,1
4,11,Women's Apparel,29475,HELSI,136,Jumpsuit,Online,Chain1,WC100004182213,51956423,51956423*1,51956423-1,301237267196,1,421.0,Cancelled,Cancelled,2024-03-07,5,Oakbrook,2023-03-30,2024-03-07 05:25:34,NaT,NaT,,0,false,,,false,,NaT,NaT,,,,NaT,NaT,,NaT,NaT,,NaT,NaT,2023-01-30,,,,,,GROUND,,2022-07-30,,,,,421,CANCELLED,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,34,Ladies Shoes,11135,Christian Louboutin,54,Sandals,Online,Chain1,WC100004207804,52001186,52001186*2,52001186-1,301233262416,-1,1195.0,Ready to Ship,Returned,2024-03-28,5,Oakbrook,2024-01-19,2024-03-12 12:26:05,2024-03-12 13:16:47,NaT,,18,false,FedEx SmartPost,******08155361239845,false,,NaT,NaT,,,,NaT,NaT,,NaT,NaT,,NaT,NaT,2023-11-19,,6247,,FXPOS,FedEx SmartPost,GROUND,,2023-05-19,Size Issues,,,,-1195,RETURNED,False,1
39996,11,Women's Apparel,13657,Misook,124,Dress,Online,Chain1,WC100004227187,52030192,52030192*5,52030192-2,301228013924,-1,348.0,Ready to Ship,Returned,2024-03-28,5,Oakbrook,2024-03-08,NaT,NaT,NaT,,15,false,FedEx,******059997,true,,NaT,NaT,,,,NaT,NaT,,NaT,NaT,,NaT,NaT,2024-01-08,,6247,,FEDXH,FedEx Home Delivery,GROUND,,2022-08-15,Quality Concerns,,,,-348,RETURNED,False,1
39997,44,Men's,12185,Golden Goose,3,Jackets,Others,Chain1,STA000000000563867,52171833,52171833*2,52171833-1,301220155158,-1,545.0,Ready to Ship,Returned,2024-03-29,4,Denver,2024-09-10,2024-03-23 17:02:28,2024-03-25 12:44:01,NaT,,6,false,FedEx,******934595,false,,NaT,NaT,,,,NaT,NaT,,NaT,NaT,,NaT,NaT,2024-07-10,,6088,,FEDXH,FedEx Home Delivery,GROUND,,2023-04-21,Size Issues,,,,-545,RETURNED,False,1
39998,14,Contemporary Apparel,10005,R13,6,Denim,Online,Chain2,WC200001278395,51920021,51920021*3,51920021-2,301242613322,-1,545.0,Ready to Ship,Returned,2024-04-01,4,Denver,2024-04-01,2024-03-09 12:08:26,2024-03-09 12:46:42,NaT,,29,false,FedEx,******679260,false,,NaT,NaT,,,,NaT,NaT,,NaT,NaT,,NaT,NaT,2024-02-01,,6088,,FEDXH,FedEx Home Delivery,GROUND,,2022-08-07,Size Issues,,,,-545,RETURNED,False,1


In [6]:
df_train['SHIPPING_DELAY']

0        False
1        False
2        False
3        False
4        False
         ...  
39995    False
39996    False
39997    False
39998    False
39999    False
Name: SHIPPING_DELAY, Length: 40000, dtype: bool

# Preparing Features and Target for Model Training

In [7]:
# The 'TARGET' column has already been created where 'CANCELLED' and 'RETURNED' are assigned 1, and the rest are 0
# Drop 'RETURNED_STATUS' and 'TARGET' from the feature set (X_train) as they are not input features for the model
X_train = df_train.drop(columns=['RETURNED_STATUS', 'TARGET'])

# The target variable (y_train) is the 'TARGET' column we created earlier
y_train = df_train['TARGET']

# Identify the numerical and categorical columns for separate preprocessing
numerical_cols = ['UNIT_PRICE', 'WM_PICKING_AGE', 'WM_PACKING_AGE', 'WM_ORDER_AGE', 'STORE_ID',]  # Numerical features
categorical_cols = [ 'SHIPMENT_SLA', 'DIVISION_CODE', 'DIVISION_NAME', 'BRAND_CODE', 'BRAND_NAME', 'CLASS_CODE', 'CLASS_NAME', 'SELLING_CHANNEL', 'CHAIN', 'WEB_ORDER_NUMBER', 'OMS_LINE_ITEM_ID', 'OMS_TICKET_ID', 'SKU_ID', 'CURRENT_STATUS', 'CURRENT_STATUS_DESCRIPTION', 'SHIP_FROM_WAREHOUSE_DESCRIPTION',  'CARRIER_NAME', 'CARRIER_TRACKING_NUMBER', 'DROPSHIP_FLAG', 'ORDER_STATUS', 'WM_ORDER_ID', 'WM_ORDER_LINE_ID', 'WM_ORDER_STATUS', 'STORE_NAME', 'SHIP_METHOD_CODE', 'SHIP_METHOD_NAME', 'SHIP_METHOD_SERVICE',  'RETURN_REASON', 'RETURN_FLAG']  # Categorical features

# Display the list of numerical and categorical columns for verification
print("Numerical Columns:", numerical_cols)
print("\n\nCategorical Columns:", categorical_cols)


Numerical Columns: ['UNIT_PRICE', 'WM_PICKING_AGE', 'WM_PACKING_AGE', 'WM_ORDER_AGE', 'STORE_ID']


Categorical Columns: ['SHIPMENT_SLA', 'DIVISION_CODE', 'DIVISION_NAME', 'BRAND_CODE', 'BRAND_NAME', 'CLASS_CODE', 'CLASS_NAME', 'SELLING_CHANNEL', 'CHAIN', 'WEB_ORDER_NUMBER', 'OMS_LINE_ITEM_ID', 'OMS_TICKET_ID', 'SKU_ID', 'CURRENT_STATUS', 'CURRENT_STATUS_DESCRIPTION', 'SHIP_FROM_WAREHOUSE_DESCRIPTION', 'CARRIER_NAME', 'CARRIER_TRACKING_NUMBER', 'DROPSHIP_FLAG', 'ORDER_STATUS', 'WM_ORDER_ID', 'WM_ORDER_LINE_ID', 'WM_ORDER_STATUS', 'STORE_NAME', 'SHIP_METHOD_CODE', 'SHIP_METHOD_NAME', 'SHIP_METHOD_SERVICE', 'RETURN_REASON', 'RETURN_FLAG']


In [8]:
X_train[categorical_cols] = X_train[categorical_cols].fillna(X_train[categorical_cols].mode().iloc[0])

In [9]:
df_train[categorical_cols]

Unnamed: 0,SHIPMENT_SLA,DIVISION_CODE,DIVISION_NAME,BRAND_CODE,BRAND_NAME,CLASS_CODE,CLASS_NAME,SELLING_CHANNEL,CHAIN,WEB_ORDER_NUMBER,OMS_LINE_ITEM_ID,OMS_TICKET_ID,SKU_ID,CURRENT_STATUS,CURRENT_STATUS_DESCRIPTION,SHIP_FROM_WAREHOUSE_DESCRIPTION,CARRIER_NAME,CARRIER_TRACKING_NUMBER,DROPSHIP_FLAG,ORDER_STATUS,WM_ORDER_ID,WM_ORDER_LINE_ID,WM_ORDER_STATUS,STORE_NAME,SHIP_METHOD_CODE,SHIP_METHOD_NAME,SHIP_METHOD_SERVICE,RETURN_REASON,RETURN_FLAG
0,,34,Ladies Shoes,25611,VEJA,55,Sneakers,Online,Chain1,WC100004243065,52054403*1,52054403-1,301222652013,Cancelled,Cancelled,STORES,,,false,,,,,,PAC3,PacMan Ground,GROUND,,
1,,44,Men's,12569,Givenchy,1,Shirts/Tops,Others,Chain1,STA000000000557916,52102974*1,52102974-1,301234392709,Cancelled,Cancelled,Oakbrook,FedEx,,false,,,,,,ON,FedEx - Standard Overnight,OVERNIGHT,,
2,,53,Beauty,11778,Valentino,313,Gifts & Sets,Others,Chain1,STA000000000562029,52150523*2,52150523-1,301243441894,Cancelled,Cancelled,Tampa Bay,FedEx SmartPost,,false,,,******,Not in MAO,,FXPOS,FedEx SmartPost,GROUND,,
3,,81,Women's Designer RTW,11807,Etro,1,Shirts/Tops,Store_POS,Chain1,SP0020015704953032424,52180398*1,52180398-1,301234272278,Cancelled,Cancelled,Oakbrook,,,false,,,,,,,,GROUND,,
4,,11,Women's Apparel,29475,HELSI,136,Jumpsuit,Online,Chain1,WC100004182213,51956423*1,51956423-1,301237267196,Cancelled,Cancelled,Oakbrook,,,false,,,,,,,,GROUND,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,,34,Ladies Shoes,11135,Christian Louboutin,54,Sandals,Online,Chain1,WC100004207804,52001186*2,52001186-1,301233262416,Ready to Ship,Returned,Oakbrook,FedEx SmartPost,******08155361239845,false,,,,,,FXPOS,FedEx SmartPost,GROUND,Size Issues,
39996,,11,Women's Apparel,13657,Misook,124,Dress,Online,Chain1,WC100004227187,52030192*5,52030192-2,301228013924,Ready to Ship,Returned,Oakbrook,FedEx,******059997,true,,,,,,FEDXH,FedEx Home Delivery,GROUND,Quality Concerns,
39997,,44,Men's,12185,Golden Goose,3,Jackets,Others,Chain1,STA000000000563867,52171833*2,52171833-1,301220155158,Ready to Ship,Returned,Denver,FedEx,******934595,false,,,,,,FEDXH,FedEx Home Delivery,GROUND,Size Issues,
39998,,14,Contemporary Apparel,10005,R13,6,Denim,Online,Chain2,WC200001278395,51920021*3,51920021-2,301242613322,Ready to Ship,Returned,Denver,FedEx,******679260,false,,,,,,FEDXH,FedEx Home Delivery,GROUND,Size Issues,


In [10]:
# Count NaN values in each column
df_sample[categorical_cols].isna().sum()

SHIPMENT_SLA                       40000
DIVISION_CODE                         82
DIVISION_NAME                         82
BRAND_CODE                            82
BRAND_NAME                            82
CLASS_CODE                            82
CLASS_NAME                            82
SELLING_CHANNEL                        0
CHAIN                                  0
WEB_ORDER_NUMBER                       0
OMS_LINE_ITEM_ID                       0
OMS_TICKET_ID                       1246
SKU_ID                                82
CURRENT_STATUS                         0
CURRENT_STATUS_DESCRIPTION             0
SHIP_FROM_WAREHOUSE_DESCRIPTION        0
CARRIER_NAME                        6439
CARRIER_TRACKING_NUMBER            20158
DROPSHIP_FLAG                        141
ORDER_STATUS                       29543
WM_ORDER_ID                        32875
WM_ORDER_LINE_ID                   29543
WM_ORDER_STATUS                    29543
STORE_NAME                             0
SHIP_METHOD_CODE

# Preprocessing Data and Building a Machine Learning Pipeline with Random Forest Classifier

In [11]:

#  Define a pipeline for processing numerical columns
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing numerical values with the mean
    ('scaler', StandardScaler())  # Scale the numerical features to have zero mean and unit variance
])

#  Define a pipeline for processing categorical columns
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing categorical values with the most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # One-hot encode categorical features, ignoring unknown categories
])

#  Combine both pipelines into a ColumnTransformer
# The ColumnTransformer applies the numerical and categorical transformations to their respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),  # Apply numerical pipeline to numerical columns
        ('cat', categorical_pipeline, categorical_cols)  # Apply categorical pipeline to categorical columns
    ]
)

# The full pipeline consists of the preprocessing step followed by the RandomForest classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # First, apply preprocessing
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced'))  # Then, train the RandomForest model
])

pipeline

#  Hyperparameter Tuning with GridSearchCV for Random Forest Classifier

In [12]:
# This grid contains possible values for several hyperparameters of the Random Forest model.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],  # Number of trees in the forest
    'classifier__max_depth': [10, 20, None],  # Maximum depth of the trees
    'classifier__min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'classifier__min_samples_leaf': [1, 2, 4],  # Minimum samples required at each leaf node
    'classifier__max_features': ['sqrt', 'log2']  # Number of features to consider when looking for the best split
}

#  GridSearchCV will evaluate different combinations of the hyperparameters to find the best-performing model.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2, scoring='f1_weighted')

# This split is necessary to evaluate model performance on unseen data (the test set).
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

#  Fit the model on the training data using GridSearchCV to find the optimal hyperparameters.
grid_search.fit(X_train_split, y_train_split)

# Step 11: Display the best hyperparameters found by GridSearchCV
print("Best Hyperparameters:", grid_search.best_params_)


Fitting 3 folds for each of 162 candidates, totalling 486 fits
[CV] END classifier__max_depth=10, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.0s
[CV] END classifier__max_depth=10, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=200; total time=   0.0s
[CV] END classifier__max_depth=10, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=200; total time=   0.0s
[CV] END classifier__max_depth=10, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=200; total time=   0.0s
[CV] END classifier__max_depth=10, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=300; total time=   0.1s
[CV] END classifier__max_depth=10, cl

ValueError: 
All the 486 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
486 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/pipeline.py", line 652, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/pipeline.py", line 586, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/opt/conda/lib/python3.9/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/pipeline.py", line 1540, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/opt/conda/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 1000, in fit_transform
    result = self._call_func_on_transformers(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 909, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/utils/parallel.py", line 77, in __call__
    return super().__call__(iterable_with_config)
  File "/opt/conda/lib/python3.9/site-packages/joblib/parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
  File "/opt/conda/lib/python3.9/site-packages/joblib/parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/utils/parallel.py", line 139, in __call__
    return self.function(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/pipeline.py", line 1540, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/opt/conda/lib/python3.9/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/pipeline.py", line 716, in fit_transform
    Xt = self._fit(X, y, routed_params)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/pipeline.py", line 586, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/opt/conda/lib/python3.9/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/pipeline.py", line 1540, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/opt/conda/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/impute/_base.py", line 434, in fit
    X = self._validate_input(X, in_fit=True)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/impute/_base.py", line 344, in _validate_input
    X = validate_data(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/utils/validation.py", line 2944, in validate_data
    out = check_array(X, input_name="X", **check_params)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/utils/_array_api.py", line 832, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/opt/conda/lib/python3.9/site-packages/pandas/core/generic.py", line 2150, in __array__
    arr = np.asarray(values, dtype=dtype)
TypeError: float() argument must be a string or a number, not 'NAType'


In [13]:
X_train_split.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32000 entries, 14307 to 15795
Data columns (total 59 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   DIVISION_CODE                    32000 non-null  string        
 1   DIVISION_NAME                    32000 non-null  string        
 2   BRAND_CODE                       32000 non-null  string        
 3   BRAND_NAME                       32000 non-null  string        
 4   CLASS_CODE                       32000 non-null  string        
 5   CLASS_NAME                       32000 non-null  string        
 6   SELLING_CHANNEL                  32000 non-null  string        
 7   CHAIN                            32000 non-null  string        
 8   WEB_ORDER_NUMBER                 32000 non-null  string        
 9   OMS_ORDER_NUMBER                 32000 non-null  string        
 10  OMS_LINE_ITEM_ID                 32000 non-null  string    

#  Evaluating the Best Model on the Test Data

In [14]:
# The best_pipeline contains the model with the best hyperparameters found by GridSearchCV
best_pipeline = grid_search.best_estimator_

#  Predict the labels (TARGET) for the test data and evaluate model performance
y_pred_split = best_pipeline.predict(X_test_split)

#  The classification report provides key metrics (precision, recall, F1-score) for both classes (CANCELLED/RETURNED and DELIVERED/IN PROCESS)
print("Classification Report on CANCELLED/RETURNED records:")
print(classification_report(y_test_split, y_pred_split))

#  Display the confusion matrix for further evaluation
conf_matrix = confusion_matrix(y_test_split, y_pred_split)
print("Confusion Matrix:")
print(conf_matrix)


AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [15]:
grid_search.best

#  Making Predictions on DELIVERED and IN PROCESS Orders

In [None]:
# We only want to predict whether DELIVERED or IN PROCESS orders will be CANCELLED or RETURNED
df_predict = df_sample[df_sample['RETURNED_STATUS'].isin(['DELIVERED', 'IN PROCESS'])]

#  Drop the 'RETURNED_STATUS' column, as it's not needed for making predictions
X_predict = df_predict.drop(columns=['RETURNED_STATUS'])

#  Use the best pipeline (with preprocessing and the trained model) to predict on the filtered dataset
predictions = best_pipeline.predict(X_predict)

#  The 'PREDICTED_CANCELLED_RETURNED' column contains the predicted labels (1 for CANCELLED/RETURNED, 0 for not)
df_predict['PREDICTED_CANCELLED_RETURNED'] = predictions

#  Display the original 'RETURNED_STATUS' along with the new 'PREDICTED_CANCELLED_RETURNED' column
df_predict[['RETURNED_STATUS', 'PREDICTED_CANCELLED_RETURNED']]

In [None]:
df_predict[['RETURNED_STATUS', 'PREDICTED_CANCELLED_RETURNED']].value_counts()

In [None]:
# Step 11: Get the best model
best_pipeline = grid_search.best_estimator_

# Step 12: Evaluate the best model on the test split
y_pred_split = best_pipeline.predict(X_test_split)
print("Classification Report on CANCELLED/RETURNED/DELIVERED/IN PROCESS records:")
print(classification_report(y_test_split, y_pred_split))

# Step 13: Plot Confusion Matrix
conf_matrix = confusion_matrix(y_test_split, y_pred_split)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Step 14: Classification Report (Precision, Recall, F1-Score) as a bar plot
report = classification_report(y_test_split, y_pred_split, output_dict=True)
report_df = pd.DataFrame(report).transpose()

plt.figure(figsize=(10, 6))
report_df[['precision', 'recall', 'f1-score']].iloc[:-3].plot(kind='bar')
plt.title('Precision, Recall, F1-Score by Class')
plt.xlabel('Class')
plt.ylabel('Score')
plt.xticks(rotation=0)
plt.ylim(0, 1)
plt.show()

# Step 15: ROC Curve and AUC (for binary classification tasks)
if len(best_pipeline.classes_) == 2:  # Check if binary classification
    # Predict probabilities for the positive class
    y_prob = best_pipeline.predict_proba(X_test_split)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test_split, y_prob)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()

# Step 16: Precision-Recall Curve
precision, recall, thresholds_pr = precision_recall_curve(y_test_split, y_prob)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'AUC = {auc(recall, precision):.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.show()

In [None]:
df['PREDICTED_CANCELLED_RETURNED'] =  grid_search.best_estimator_.predict(df[df_sample.columns.tolist()])

In [None]:
df[['RETURNED_STATUS', 'PREDICTED_CANCELLED_RETURNED']].value_counts()

In [None]:
df.drop('PREDICTED_RETURN_STATUS', axis=1, inplace=True)

In [None]:

# Importing the get_session function from fosforml's Snowflake session manager
from fosforml.model_manager.snowflakesession import get_session

# Establishing a Snowflake session for executing queries and performing operations
my_session = get_session()

# Define the name of the Snowflake table to query
table_name = 'ORDER_DATA_3009'

# Execute a SQL query to select all records from the specified table in Snowflake
df_sample = my_session.sql("select * from {}".format(table_name)).to_pandas()

df_sample

In [None]:
df_sample[categorical_cols]

In [None]:
df_sample.drop('PREDICTED_CANCELLED_RETURNED', axis=1, inplace=True)

In [None]:
df_sample.info()

In [None]:
# Convert the Pandas DataFrame (cust_df) into a Snowflake DataFrame
training_datadf = my_session.createDataFrame(df_sample)

# Write the Snowflake DataFrame to a Snowflake table named 'casino_customers'
# The 'overwrite' mode ensures that the table is replaced if it already exists
training_datadf.write.mode("overwrite").save_as_table("ORDER_DATA_FINAL")
