In [8]:
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split, TimeSeriesSplit
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [9]:
# Define the path to the CSV file
file_path = './data/hobbies_dep_sales.csv'

# Read the CSV file into a DataFrame
hobbies_df = pd.read_csv(file_path)


In [10]:
# Define the path to the CSV file
file_path = './data/calendar.csv'

# Read the CSV file into a DataFrame
calendar_df = pd.read_csv(file_path)


In [11]:
# Defining function used below, to slice and therefore standardize formats between dfs

def day_slicer(row):
    slice_list = row.split("_")
    return slice_list[1]

In [12]:

calendar_df['d'] = calendar_df['d'].apply(day_slicer)
calendar_df['date'] = pd.to_datetime(calendar_df['date'])

In [13]:
# This function fecthes the following attributes from calendar_df and pastes them to sales_df
# snap, based on state
# weekday and if is_weekend
# events of each day, if any

def fetch_calendar_info(row):
    # Filter calendar_df for the matching date
    calendar_row = calendar_df[calendar_df['date'] == row['date']]
    
    # Retrieve the relevant snap value based on the state
    if not calendar_row.empty:
        if row['state_id'] == 'CA':
            row['snap'] = calendar_row['snap_CA'].values[0]
        elif row['state_id'] == 'TX':
            row['snap'] = calendar_row['snap_TX'].values[0]
        elif row['state_id'] == 'WI':
            row['snap'] = calendar_row['snap_WI'].values[0]
        
    # Fetching add weekday from calendar_df
        row['weekday'] = calendar_row['weekday'].values[0]

    # Fetching Event_1
        row["event_name_1"] = calendar_row["event_name_1"].values[0]
        row["event_type_1"] = calendar_row["event_type_1"].values[0]

    # Fetching Event_2, if it is not NaN
        row["event_name_2"] = calendar_row["event_name_2"].values[0]
        row["event_type_2"] = calendar_row["event_type_2"].values[0]
    
    else:
        
    # Empty Error Handling
        row['snap'] = None  # or a default value
        row['weekday'] = None
        row["event_name_1"] = None
        row["event_type_1"] = None
        row["event_name_2"] = None
        row["event_type_2"] = None
        

    # Flag Weekend (Binary)
    if row["weekday"] == "Saturday" or row["weekday"] == "Sunday":
        row["is_weekend"] = 1
    else:
        row["is_weekend"] = 0

    return row

In [14]:
def get_volume_l4w(array, weeks_fetched=4):
    
    days_fetched = 7*weeks_fetched
    snip_end = len(array)
    snip_start = snip_end - days_fetched

    #List containing the values of the last 28 days
    last_28_days = array[snip_start:snip_end].tolist()

    # The Loop belows goes through the values of the last 28 days
    # and sums them into 4 weeks
    last_4_weeks = [0,0,0,0]

    for index, day in enumerate(last_28_days):
        if index < 6: ## Week 1 -> index 0 to 6
            last_4_weeks[0] += day
        elif index < 13: ## Week 2 -> index 7 to 13
            last_4_weeks[1] += day
        elif index < 20: ## Week 3 -> index 14 to 20
            last_4_weeks[2] += day
        else:
            last_4_weeks[3] += day

    # Rounding values before returning
    for i in range(len(last_4_weeks)):
        last_4_weeks[i] = round(last_4_weeks[i],2)

    
    return last_4_weeks, last_28_days

In [15]:
hobbies_df.shape

(1096665, 9)

In [49]:
# Generate dynamic product codes in the format HOBBIES_1_XXX

num_products = 10  # or any other number you want
my_articles = [f'HOBBIES_1_{i:03}' for i in range(1, num_products + 1)]
print(my_articles)


['HOBBIES_1_001', 'HOBBIES_1_002', 'HOBBIES_1_003', 'HOBBIES_1_004', 'HOBBIES_1_005', 'HOBBIES_1_006', 'HOBBIES_1_007', 'HOBBIES_1_008', 'HOBBIES_1_009', 'HOBBIES_1_010']


In [50]:
def the_model(dep_df, list_articles):

    weeks = ["Week 1", "Week 2", "Week 3", "Week 4"]
    data_gathering = {
            'week': weeks,
            'DUMMY': [1,2,3,4]
        }

    ### For each article in list_articles, this function executes its entire code

    for article in list_articles:
        temp_df = dep_df[dep_df['item_id'] == article]
        #print(article)
        #print(temp_df)
        
        # Convert the 'day' column to an integer representing the day number
        temp_df['day_num'] = temp_df['day'].str.extract('d_(\d+)').astype(int)

        # Assume the first day is 2011-01-29, add the day numbers to get actual dates
        start_date = datetime.datetime(2011, 1, 29)
        temp_df['date'] = temp_df['day_num'].apply(lambda x: start_date + datetime.timedelta(days=x-1))

        # Sort by id and date before creating rolling features
        temp_df = temp_df.sort_values(by=['id', 'date'])

        # Create a 7-day rolling average
        temp_df['rolling_avg_7'] = temp_df.groupby('id')['sales'].transform(lambda x: x.rolling(7, min_periods=1).mean().round(2))
        temp_df['rolling_avg_30'] = temp_df.groupby('id')['sales'].transform(lambda x: x.rolling(30, min_periods=1).mean().round(2))

        # Making sure of datetime type
        temp_df['date'] = pd.to_datetime(temp_df['date'])

        # Executing fetch_calendar_info function
        temp_df = temp_df.apply(fetch_calendar_info, axis=1)

        ## START OF ACTUAL MODEL

        # Defining targets, dropping leakage
        target_column = 'rolling_avg_7'
        columns_to_drop =  ['sales', 'rolling_avg_7', 'rolling_avg_30']
        X = temp_df.drop(columns=[target_column])
        y = temp_df[target_column]

        X = pd.get_dummies(X)

        # Training the model
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Extracting date features
        X_train['day'] = X_train['date'].dt.day
        X_train['month'] = X_train['date'].dt.month
        X_train['year'] = X_train['date'].dt.year
        
        X_test['day'] = X_test['date'].dt.day
        X_test['month'] = X_test['date'].dt.month
        X_test['year'] = X_test['date'].dt.year

        # Drop the original 'date' column after extracting features
        X_train = X_train.drop(columns=['date'])
        X_test = X_test.drop(columns=['date'])

        # Initialize the Model: Set up the model with default parameters or customize them as needed.

        model = XGBRegressor(
        objective='reg:squarederror',  # Use 'reg:squarederror' for regression
        n_estimators=100,              # Number of trees
        learning_rate=0.1,             # Step size shrinkage
        max_depth=5,                   # Maximum depth of trees
        random_state=42,                # Seed for reproducibility
        enable_categorical=True
        )

        best_grid = {'colsample_bytree': [1.0],
             'learning_rate': [0.2],
             'max_depth': [3],
             'n_estimators': [500],
             'reg_alpha': [0],
             'reg_lambda': [1],
             'subsample': [1.0]
        }

        
        # Step 4: Set up GridSearchCV with TimeSeriesSplit
        tscv = TimeSeriesSplit(n_splits=5)  # This respects the order of time-series data

        grid_search = GridSearchCV(
        estimator=model,
        param_grid=best_grid,
        cv=tscv,
        scoring='neg_mean_absolute_percentage_error',  # Choose a scoring metric
        verbose=2,
        n_jobs=-1
        )

        # Fit GridSearchCV on the training data
        grid_search.fit(X_train, y_train)
        
        # Retrieve the best model from GridSearchCV
        best_model = grid_search.best_estimator_

        # Getting Predictions
        y_pred = best_model.predict(X_test)

        
        # Predictions become readable
        get_volume_l4w(y_pred)

        #weeks = ["Week 1", "Week 2", "Week 3", "Week 4"]
        days = list(range(1, 29))
        volume_l4w = get_volume_l4w(y_pred)
        last_4_weeks = get_volume_l4w(y_pred)[0]
        last_28_days = get_volume_l4w(y_pred)[1]

        temp_data = {
            'week': weeks,
            article: last_4_weeks
        }

        article_df = pd.DataFrame(temp_data)
        print(article_df)

        data_gathering[article] = last_4_weeks

    ### OUT OF LOOP

    spillback_df = pd.DataFrame(data_gathering)

    return spillback_df
    

  temp_df['day_num'] = temp_df['day'].str.extract('d_(\d+)').astype(int)


In [None]:
the_model(hobbies_df, my_articles)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['day_num'] = temp_df['day'].str.extract('d_(\d+)').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['date'] = temp_df['day_num'].apply(lambda x: start_date + datetime.timedelta(days=x-1))


Fitting 5 folds for each of 1 candidates, totalling 5 fits
     week  HOBBIES_1_001
0  Week 1           2.73
1  Week 2           1.30
2  Week 3           1.50
3  Week 4           1.97


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['day_num'] = temp_df['day'].str.extract('d_(\d+)').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['date'] = temp_df['day_num'].apply(lambda x: start_date + datetime.timedelta(days=x-1))


___________________________________