In [75]:
import datetime
import pandas as pd
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from xgboost import XGBRegressor

In [76]:
#import pandas as pd
#import datetime
#import matplotlib.pyplot as plt
#from sklearn.model_selection import GridSearchCV, train_test_split, TimeSeriesSplit
#import numpy as np
#from xgboost import XGBRegressor
#from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
#import joblib
#import random
#import string

In [77]:
# Define the path to the CSV file
file_path = './data/hobbies_dep_sales.csv'

# Read the CSV file into a DataFrame
hobbies_df = pd.read_csv(file_path)


In [78]:
# Define the path to the CSV file
file_path = './data/calendar.csv'

# Read the CSV file into a DataFrame
calendar_df = pd.read_csv(file_path)


In [79]:
def generate_random_string(length=6):
    # Generate a random string of letters and digits
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

In [80]:
# Defining function used below, to slice and therefore standardize formats between dfs

def day_slicer(row):
    slice_list = row.split("_")
    return slice_list[1]

In [81]:

calendar_df['d'] = calendar_df['d'].apply(day_slicer)
calendar_df['date'] = pd.to_datetime(calendar_df['date'])

In [82]:
# This function fecthes the following attributes from calendar_df and pastes them to sales_df
# snap, based on state
# weekday and if is_weekend
# events of each day, if any

def fetch_calendar_info(row):
    # Filter calendar_df for the matching date
    calendar_row = calendar_df[calendar_df['date'] == row['date']]
    
    # Retrieve the relevant snap value based on the state
    if not calendar_row.empty:
        if row['state_id'] == 'CA':
            row['snap'] = calendar_row['snap_CA'].values[0]
        elif row['state_id'] == 'TX':
            row['snap'] = calendar_row['snap_TX'].values[0]
        elif row['state_id'] == 'WI':
            row['snap'] = calendar_row['snap_WI'].values[0]
        
    # Fetching add weekday from calendar_df
        row['weekday'] = calendar_row['weekday'].values[0]

    # Fetching Event_1
        row["event_name_1"] = calendar_row["event_name_1"].values[0]
        row["event_type_1"] = calendar_row["event_type_1"].values[0]

    # Fetching Event_2, if it is not NaN
        row["event_name_2"] = calendar_row["event_name_2"].values[0]
        row["event_type_2"] = calendar_row["event_type_2"].values[0]
    
    else:
        
    # Empty Error Handling
        row['snap'] = None  # or a default value
        row['weekday'] = None
        row["event_name_1"] = None
        row["event_type_1"] = None
        row["event_name_2"] = None
        row["event_type_2"] = None
        

    # Flag Weekend (Binary)
    if row["weekday"] == "Saturday" or row["weekday"] == "Sunday":
        row["is_weekend"] = 1
    else:
        row["is_weekend"] = 0

    return row

In [83]:
def get_volume_l4w(array, weeks_fetched=4):
    
    days_fetched = 7*weeks_fetched
    snip_end = len(array)
    snip_start = snip_end - days_fetched

    #List containing the values of the last 28 days
    last_28_days = array[snip_start:snip_end].tolist()

    # The Loop belows goes through the values of the last 28 days
    # and sums them into 4 weeks
    last_4_weeks = [0,0,0,0]

    for index, day in enumerate(last_28_days):
        if index < 6: ## Week 1 -> index 0 to 6
            last_4_weeks[0] += day
        elif index < 13: ## Week 2 -> index 7 to 13
            last_4_weeks[1] += day
        elif index < 20: ## Week 3 -> index 14 to 20
            last_4_weeks[2] += day
        else:
            last_4_weeks[3] += day

    # Rounding values before returning
    for i in range(len(last_4_weeks)):
        last_4_weeks[i] = round(last_4_weeks[i],2)

    
    return last_4_weeks, last_28_days

In [84]:
hobbies_df.shape

(1096665, 9)

In [85]:
# Generate dynamic product codes in the format HOBBIES_1_XXX

num_products = 10  # or any other number you want
my_articles = [f'HOBBIES_1_{i:03}' for i in range(1, num_products + 1)]
print(my_articles)


['HOBBIES_1_001', 'HOBBIES_1_002', 'HOBBIES_1_003', 'HOBBIES_1_004', 'HOBBIES_1_005', 'HOBBIES_1_006', 'HOBBIES_1_007', 'HOBBIES_1_008', 'HOBBIES_1_009', 'HOBBIES_1_010']


In [94]:
### THE MODEL FUNCTION

def the_model(dep_df, list_articles, be_verbose=False, create_csv=False):

    weeks = ["Week 1", "Week 2", "Week 3", "Week 4"]
    data_gathering = {'week': weeks, 'DUMMY': [1, 2, 3, 4]}
    start_date = datetime.datetime(2011, 1, 29)

    for article in list_articles:
        temp_df = dep_df[dep_df['item_id'] == article].copy()
        
        # Convert the 'day' column to integer day numbers
        #temp_df['day_num'] = temp_df['day'].str.extract(r'd_(\d+)').astype(int)
        #temp_df['date'] = temp_df['day_num'].apply(lambda x: start_date + datetime.timedelta(days=x-1))
        
        temp_df.loc[:, 'day_num'] = temp_df['day'].str.extract(r'd_(\d+)').astype(int)
        temp_df.loc[:, 'date'] = temp_df['day_num'].apply(lambda x: start_date + datetime.timedelta(days=x-1))


        # Sort by id and date before creating rolling features
        temp_df = temp_df.sort_values(by=['id', 'date'])

        # Create rolling averages
        temp_df['rolling_avg_7'] = temp_df.groupby('id')['sales'].transform(lambda x: x.rolling(7, min_periods=1).mean().round(2))
        temp_df['rolling_avg_30'] = temp_df.groupby('id')['sales'].transform(lambda x: x.rolling(30, min_periods=1).mean().round(2))

        # Execute fetch_calendar_info function
        temp_df = temp_df.apply(fetch_calendar_info, axis=1)

        # Define targets, drop leakage
        target_column = 'rolling_avg_7'
        X = temp_df.drop(columns=[target_column, 'sales', 'rolling_avg_30'])
        y = temp_df[target_column]

        X = pd.get_dummies(X)
        X['day'] = X['date'].dt.day
        X['month'] = X['date'].dt.month
        X['year'] = X['date'].dt.year
        X = X.drop(columns=['date'])

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Initialize and optimize model
        model = XGBRegressor(objective='reg:squarederror', random_state=42)
        
        best_grid = {
            'colsample_bytree': [1.0],
            'learning_rate': [0.2],
            'max_depth': [3],
            'n_estimators': [500],
            'reg_alpha': [0],
            'reg_lambda': [1],
            'subsample': [1.0]
        }
        
        tscv = TimeSeriesSplit(n_splits=5)
        
        grid_search = GridSearchCV(
            estimator=model, 
            param_grid=best_grid, 
            cv=tscv, 
            scoring='neg_mean_absolute_percentage_error', 
            n_jobs=-1,
            verbose=0
        )
        
        grid_search.fit(X_train, y_train)

        # Predict and store results
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        last_4_weeks, last_28_days = get_volume_l4w(y_pred)

        if be_verbose:
            #print(f"_________________________________________________")
            print(f"Below, preliminary results for {article} product:")
            temp_data = {'week': weeks, article: last_4_weeks}
            article_df = pd.DataFrame(temp_data)
            print(article_df)
            print(f"_________________________________________________")

        data_gathering[article] = last_4_weeks

    # Compile results into a DataFrame
    spillback_df = pd.DataFrame(data_gathering)

    # Save to CSV if specified
    if create_csv:
        filename = f"outputs/output_{generate_random_string()}.csv"
        spillback_df.to_csv(filename, index=False)

    return spillback_df

In [None]:
the_model(hobbies_df, my_articles, True, True)

_________________________________________________
Below, preliminary results for HOBBIES_1_001 product:
     week  HOBBIES_1_001
0  Week 1           2.53
1  Week 2           1.45
2  Week 3           1.66
3  Week 4           1.73
_________________________________________________
_________________________________________________
Below, preliminary results for HOBBIES_1_002 product:
     week  HOBBIES_1_002
0  Week 1           1.74
1  Week 2           2.33
2  Week 3           2.15
3  Week 4           2.13
_________________________________________________
_________________________________________________
Below, preliminary results for HOBBIES_1_003 product:
     week  HOBBIES_1_003
0  Week 1           0.72
1  Week 2           0.37
2  Week 3           0.63
3  Week 4           1.46
_________________________________________________
_________________________________________________
Below, preliminary results for HOBBIES_1_004 product:
     week  HOBBIES_1_004
0  Week 1          12.18
1  Week 2

___________________________________