# Package Installation for Snowflake Connection and Data Processing

In [None]:
!pip install  dask[complete]  snowflake  snowflake-connector-python snowflake-snowpark-python snowflake-snowpark-python[pandas] seaborn matplotlib numpy pandas scikit-learn  fosforml plotly

# Import necessary libraries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from plotly import express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Let wide frames show up to 100 columns before pandas truncates the display
pd.options.display.max_columns = 100

# This section connects to Snowflake using **fosforml's Snowflake session manager**, retrieves data from a specified Snowflake table, and loads the data into a Pandas DataFrame for further processing and modeling.


In [2]:
# fosforml's session manager exposes get_session(), which returns the
# Snowflake session used for every query in this notebook
from fosforml.model_manager.snowflakesession import get_session

# Open the Snowflake session
my_session = get_session()

# Source table to read; later cells expect its RETURNED_STATUS column to hold
# values such as DELIVERED, IN PROCESS, CANCELLED and RETURNED
table_name = 'ORDER_DATA_ML'

# Build the query, run it through the session, and materialise the full
# result set as a Pandas DataFrame
select_all_query = f"SELECT * FROM {table_name}"
snowflake_result = my_session.sql(select_all_query)
df = snowflake_result.to_pandas()

# Preview the first rows to confirm the load worked
df.head()


Unnamed: 0,DIVISION_CODE,DIVISION_NAME,BRAND_CODE,BRAND_NAME,CLASS_CODE,CLASS_NAME,SELLING_CHANNEL,CHAIN,WEB_ORDER_NUMBER,OMS_ORDER_NUMBER,OMS_LINE_ITEM_ID,OMS_TICKET_ID,SKU_ID,QUANTITY,UNIT_PRICE,CURRENT_STATUS,CURRENT_STATUS_DESCRIPTION,TRANSACTION_DATE,SHIP_FROM_WAREHOUSE_CODE,SHIP_FROM_WAREHOUSE_DESCRIPTION,ORDER_DATE,READY_TO_PRINT_DATE,PRINT_TICKET_DATE,VERIFIED_SHIPPED_DATE,BACK_ORDERED_DATE,ORDER_AGE,GIFT_ARTICLE_FLAG,CARRIER_NAME,CARRIER_TRACKING_NUMBER,DROPSHIP_FLAG,ORDER_STATUS,ORDER_CREATION_DATE,ORDER_CONFIRMATION_DATE,WM_ORDER_ID,WM_ORDER_LINE_ID,WM_ORDER_STATUS,WM_PICKING_START_TIME,WM_PICKING_END_TIME,WM_PICKING_AGE,WM_PACKING_START_TIME,WM_PACKING_END_TIME,WM_PACKING_AGE,WM_CREATED_DATE,WM_UPDATED_DATE,WM_SHIPPED_DATE,WM_ORDER_AGE,STORE_ID,STORE_NAME,SHIP_METHOD_CODE,SHIP_METHOD_NAME,SHIP_METHOD_SERVICE,SHIPMENT_SLA,NEW_ORDER_DATE,RETURN_REASON,RETURN_FLAG,RECORD_DATE,RECORD_TIME,GROSS_SALES,RETURNED_STATUS
0,81,Women's Designer RTW,24422,Khaite,6,Denim,Online,Chain1,WC100004182263,51956479,51956479*2,51956479-2,301236905235,1,408.0,VS,Shipped,07-03-2024,2,Northpark,2024-07-10,2024-03-07 06:02:18,2024-03-07 06:16:22,2024-03-07 18:27:01,,0,False,FedEx SmartPost,******08155361113886,False,Fulfilled,2024-03-07 06:16:44.510,2024-03-07 06:16:44.553,51956479-21,******8157259087989,Shipped,2024-03-07 07:45:18.127,2024-03-07 12:45:20.378,0.0,2024-03-07 12:44:43.385,2024-03-07 18:02:34.563,0.0,2024-03-07 06:50:15.747,2024-03-07 18:20:37.326,2024-03-07 18:19:07.075,0.0,,,FXPOS,FedEx SmartPost,GROUND,,2023-07-15,,Fully Returned,2024-10-01,04:23:00.597000,408,DELIVERED
1,18,Intimate Apparel,12291,Hanro,106,Slips,Store_POS,Chain1,SP0230011004272030324,51915266,51915266*1,51915266-1,301101027338,1,95.0,VS,Shipped,05-03-2024,2,Northpark,2023-05-05,2024-03-03 15:00:51,2024-03-03 16:16:40,2024-03-05 14:39:23,,2,False,FedEx,******392510,False,Fulfilled,2024-03-03 16:17:34.041,2024-03-03 16:17:34.065,51915266-11,******2776707735174,Shipped,2024-03-03 17:15:28.243,2024-03-04 11:37:34.277,1.0,2024-03-04 11:37:33.706,2024-03-05 14:18:02.183,1.0,2024-03-03 16:51:17.681,2024-03-05 14:35:33.418,2024-03-05 14:34:15.753,2.0,,,FEDXH,FedEx Home Delivery,GROUND,,2022-09-05,,,,,95,DELIVERED
2,53,Beauty,14510,ROC,995,GWP's,Online,Chain1,WC100004139570,51884603,51884603*4,51884603-1,301240888869,1,0.0,VS,Shipped,04-03-2024,2,Northpark,2024-05-27,2024-03-02 18:34:43,2024-03-02 19:15:09,2024-03-04 07:41:19,,2,True,FedEx SmartPost,******08155361015975,False,Fulfilled,2024-03-02 19:15:59.198,2024-03-02 19:15:59.241,51884603-11,******6416082188687,Shipped,2024-03-03 04:47:47.651,2024-03-04 06:23:12.281,1.0,2024-03-04 06:22:25.940,2024-03-04 07:24:41.313,0.0,2024-03-02 19:50:41.624,2024-03-04 07:35:35.015,2024-03-04 07:34:04.119,2.0,,,FXPOS,FedEx SmartPost,GROUND,,2023-06-18,,,,,0,DELIVERED
3,18,Intimate Apparel,12291,Hanro,1,Shirts/Tops,Online,Chain1,WC100004176779,51947005,51947005*1,51947005-1,301224599477,1,240.0,VS,Shipped,07-03-2024,2,Northpark,2024-10-25,2024-03-06 11:13:06,2024-03-06 11:17:00,2024-03-07 07:24:39,,1,False,FedEx SmartPost,******08155361094611,False,Fulfilled,2024-03-06 11:17:54.549,2024-03-06 11:17:54.649,51947005-11,******5231432220836,Shipped,2024-03-06 12:15:22.319,2024-03-06 18:33:34.663,0.0,2024-03-06 18:32:11.086,2024-03-06 22:36:20.716,0.0,2024-03-06 11:52:03.166,2024-03-07 07:20:22.051,2024-03-06 22:49:12.199,0.0,,,FXPOS,FedEx SmartPost,GROUND,,2022-09-14,,Fully Returned,2024-10-01,04:23:00.597000,240,DELIVERED
4,53,Beauty,11796,Estee Lauder,995,GWP's,Online,Chain1,WC100004190974,51971715,51971715*2,51971715-2,301238247418,1,0.0,VS,Shipped,09-03-2024,2,Northpark,2024-08-25,2024-03-08 11:25:43,2024-03-08 12:17:08,2024-03-09 11:10:42,,1,True,FedEx SmartPost,******08155361151543,False,Fulfilled,2024-03-08 12:18:06.080,2024-03-08 12:18:06.117,51971715-21,******8981902098719,Shipped,2024-03-08 13:45:31.626,2024-03-08 21:23:51.280,0.0,2024-03-08 21:23:18.936,2024-03-09 10:49:00.467,1.0,2024-03-08 12:51:38.198,2024-03-09 11:05:21.268,2024-03-09 11:04:05.578,1.0,,,FXPOS,FedEx SmartPost,GROUND,,2023-06-14,,,,,0,DELIVERED


In [None]:
# Row count per RETURNED_STATUS value in the loaded data
returned_status_counts = df['RETURNED_STATUS'].value_counts()
returned_status_counts

In [None]:
# Statuses kept for modelling; rows with any other RETURNED_STATUS are dropped.
target_statuses = ['CANCELLED', 'RETURNED', 'DELIVERED', 'IN PROCESS']
df_filtered = df[df['RETURNED_STATUS'].isin(target_statuses)]

# Draw 10,000 rows per status from the *filtered* frame. Sampling from `df`
# here would silently discard the filter above and let unwanted statuses
# back into the training sample.
# replace=True allows a status with fewer than 10,000 rows to be oversampled.
# random_state fixes the draw so a Restart & Run All reproduces the same sample.
df_sample = df_filtered.groupby('RETURNED_STATUS').sample(n=10000, replace=True, random_state=42)
df_sample


# Dictionary to map status codes to full forms

In [None]:
# Lookup from CURRENT_STATUS code to its readable description.
# NOTE(review): the original notes tagged CX, BO, PT, CB and RD with
# "cancelled / training columns" — meaning of that tag to be confirmed.
status_map = dict(
    VS='Verified Shipped',
    RT='Ready to Ship',
    CX='Cancelled',
    RX='Return to Sender',
    BO='Backordered',
    PT='Print Ticket',
    DS='Drop Shipped',
    RN='Returned',
    RP='Replacement',
    CB='Chargeback',
    RD='Ready to Drop',
)

# Swap each known code for its description; values that are not keys of the
# map pass through unchanged
current_status_labels = df_sample['CURRENT_STATUS'].replace(status_map)
df_sample['CURRENT_STATUS'] = current_status_labels

# Distribution of the expanded status labels
df_sample['CURRENT_STATUS'].value_counts()

# Converting Date Columns to Datetime Format with Error Handling

In [None]:
# Columns that hold date or timestamp values and must become datetime64
date_columns = ['ORDER_DATE', 'TRANSACTION_DATE', 'VERIFIED_SHIPPED_DATE', 'READY_TO_PRINT_DATE', 
                'PRINT_TICKET_DATE', 'WM_PICKING_START_TIME', 'WM_PICKING_END_TIME', 'WM_SHIPPED_DATE']

# TRANSACTION_DATE is stored day-first (DD-MM-YYYY): the preview shows
# '07-03-2024' on a row whose shipped timestamp is 2024-03-07. Parsed with the
# default month-first reading it would become 3 July, and any day above 12
# would be coerced to NaT. The remaining columns are year-first and keep the
# default behaviour.
dayfirst_columns = {'TRANSACTION_DATE'}

# Convert each column; errors='coerce' turns unparseable values into NaT
# instead of raising
for col in date_columns:
    df_sample[col] = pd.to_datetime(
        df_sample[col],
        errors='coerce',
        dayfirst=col in dayfirst_columns,
    )

#  Converting Quantity and Unit Price to Numeric Format

In [None]:
# Column -> downcast family: QUANTITY goes to the smallest fitting integer
# type, UNIT_PRICE to the smallest fitting float type.
# No errors= argument is passed, so a non-numeric value raises.
numeric_downcasts = {'QUANTITY': 'integer', 'UNIT_PRICE': 'float'}

for numeric_col, downcast_kind in numeric_downcasts.items():
    df_sample[numeric_col] = pd.to_numeric(df_sample[numeric_col], downcast=downcast_kind)


# Converting SHIPMENT_SLA to Numeric and Calculating Shipping Delay

In [None]:
# Force SHIPMENT_SLA to a numeric dtype; values that cannot be parsed become NaN
sla_numeric = pd.to_numeric(df_sample['SHIPMENT_SLA'], errors='coerce')
df_sample['SHIPMENT_SLA'] = sla_numeric

# Whole days elapsed between the order date and the warehouse ship date
order_to_ship = df_sample['WM_SHIPPED_DATE'] - df_sample['ORDER_DATE']
days_to_ship = order_to_ship.dt.days

# Flag the row as delayed when the elapsed days exceed the SLA
df_sample['SHIPPING_DELAY'] = days_to_ship > df_sample['SHIPMENT_SLA']

# Handling Missing Values in Shipping Delay and Date Columns

In [None]:
# Treat a missing delay flag as "not delayed".
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection: that chained in-place form emits a FutureWarning on recent
# pandas and leaves df_sample unchanged once Copy-on-Write is active.
df_sample['SHIPPING_DELAY'] = df_sample['SHIPPING_DELAY'].fillna(False)

# Ensure that date columns are in datetime format (invalid values become NaT)
df_sample['ORDER_DATE'] = pd.to_datetime(df_sample['ORDER_DATE'], errors='coerce')
df_sample['WM_SHIPPED_DATE'] = pd.to_datetime(df_sample['WM_SHIPPED_DATE'], errors='coerce')

# Where the ship date is missing, assume the order shipped on its order date
df_sample['WM_SHIPPED_DATE'] = df_sample['WM_SHIPPED_DATE'].fillna(df_sample['ORDER_DATE'])


# Converting SHIPMENT_SLA and Handling Shipping Delay Calculation

In [None]:
# Ensure 'SHIPMENT_SLA' values are numeric; non-numeric values become NaN
df_sample['SHIPMENT_SLA'] = pd.to_numeric(df_sample['SHIPMENT_SLA'], errors='coerce')

# Convert 'ORDER_DATE' and 'WM_SHIPPED_DATE' to datetime; invalid values become NaT
df_sample['ORDER_DATE'] = pd.to_datetime(df_sample['ORDER_DATE'], errors='coerce')
df_sample['WM_SHIPPED_DATE'] = pd.to_datetime(df_sample['WM_SHIPPED_DATE'], errors='coerce')

# Where 'WM_SHIPPED_DATE' is missing, assume the order shipped on its order date.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection: that chained in-place form emits a FutureWarning on recent
# pandas and leaves df_sample unchanged once Copy-on-Write is active.
df_sample['WM_SHIPPED_DATE'] = df_sample['WM_SHIPPED_DATE'].fillna(df_sample['ORDER_DATE'])


# Flag rows whose order-to-ship time in whole days exceeds the SLA.
# A NaN SLA makes the comparison evaluate to False for that row.
df_sample['SHIPPING_DELAY'] = (df_sample['WM_SHIPPED_DATE'] - df_sample['ORDER_DATE']).dt.days > df_sample['SHIPMENT_SLA']

# Treat any missing delay flag as "not delayed"
df_sample['SHIPPING_DELAY'] = df_sample['SHIPPING_DELAY'].fillna(False)


# Writing Pandas DataFrame to Snowflake Table

In [None]:
# Turn the prepared Pandas sample (df_sample) into a Snowflake DataFrame
# through the open session
training_datadf = my_session.createDataFrame(df_sample)

# Persist it as table 'ORDER_DATA_TRAINING'; "overwrite" mode replaces the
# table if it already exists
training_writer = training_datadf.write.mode("overwrite")
training_writer.save_as_table("ORDER_DATA_TRAINING")
