In [125]:
import os

class cfg():
    """Notebook configuration: a seed value and the locations of the input CSVs.

    The data directory defaults to the original local folder. Setting the
    ``ALGORUN_DATA_PATH`` environment variable before this cell runs points the
    notebook at a different folder without editing the code.
    """

    # Folder holding the CSV files; overridable through ALGORUN_DATA_PATH.
    data_path = os.environ.get("ALGORUN_DATA_PATH", "/Users/user/Desktop/Algorun_24/data")

    # Seed value exposed to the rest of the notebook.
    seed = 42

    # Location of each input file, built from data_path.
    holidays_path = os.path.join(data_path, "holidays.csv")
    products_path = os.path.join(data_path, "products.csv")
    promotions_path = os.path.join(data_path, "promotions.csv")
    sample_submission_path = os.path.join(data_path, "sample_submission.csv")
    test_path = os.path.join(data_path, "test.csv")
    train_path = os.path.join(data_path, "train.csv")


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt


# Read each input table from the location declared on cfg.
holidays_df = pd.read_csv(cfg.holidays_path)
products_df = pd.read_csv(cfg.products_path)
promotions_df = pd.read_csv(cfg.promotions_path)
# The submission template is the only pipe-separated file.
sample_submission_df = pd.read_csv(cfg.sample_submission_path, sep='|')
test_df = pd.read_csv(cfg.test_path)
train_df = pd.read_csv(cfg.train_path)

# Parse the week column of the three weekly tables into datetime values.
train_df = train_df.assign(
    week_starting_date=lambda frame: pd.to_datetime(frame["week_starting_date"])
)
test_df = test_df.assign(
    week_starting_date=lambda frame: pd.to_datetime(frame["week_starting_date"])
)
sample_submission_df = sample_submission_df.assign(
    week_starting_date=lambda frame: pd.to_datetime(frame["week_starting_date"])
)

In [126]:
# Attach the product attributes to every training row (left join on product_id).
train_df = pd.merge(train_df, products_df, how='left', on='product_id')

# Composite key "<group_1>_<group_2>_<group_3>" built from the three group codes.
train_df = train_df.assign(
    product_group_1_2_3_code=lambda frame: (
        frame["product_group_1_code"].astype(str)
        + "_" + frame["product_group_2_code"].astype(str)
        + "_" + frame["product_group_3_code"].astype(str)
    )
)

In [127]:
# Thirteenth submission.
# Product-group (group_1_2_3) sales from the matching week one year earlier are
# combined with each product's share of its group's sales over the last two months.

# Move every training week forward by 53 weeks (371 days) so that last year's
# rows carry the dates they are compared with in the test set.
shifted_train_df = train_df.copy()
shifted_train_df["week_starting_date"] = train_df["week_starting_date"] + pd.Timedelta(weeks=53)

# Keep only the shifted rows whose date is one of the test weeks.
important_dates1 = list(test_df["week_starting_date"].unique())
prepared_shifted_train_df = shifted_train_df.loc[
    shifted_train_df["week_starting_date"].isin(important_dates1),
    ["week_starting_date", "product_id", "sales_quantity", "product_group_1_2_3_code"],
]

# Total sales_quantity per (week_starting_date, product_group_1_2_3_code).
grouped_sales = prepared_shifted_train_df.groupby(
    ['week_starting_date', 'product_group_1_2_3_code'], as_index=False
)['sales_quantity'].sum()

# If you want to reshape the data to have product groups as columns
# pivoted_sales = grouped_sales.pivot(
#     index='week_starting_date', 
#     columns='product_group_1_2_3_code', 
#     values='sales_quantity'
# )


# Window for the share calculation: the last two calendar months of training data.
last_date = train_df['week_starting_date'].max()
two_months_ago = last_date - pd.DateOffset(months=2)
last_2months_df = train_df.loc[train_df['week_starting_date'] >= two_months_ago]

# Sales totals inside the window, per product and per product group.
product_sales = last_2months_df.groupby('product_id')['sales_quantity'].sum()
group_sales = last_2months_df.groupby('product_group_1_2_3_code')['sales_quantity'].sum()

# One row per (product, group) pair seen in the window, carrying the product
# total, the group total and the product's share of the group total.
sales_ratio = (
    last_2months_df[['product_id', 'product_group_1_2_3_code']]
    .drop_duplicates()
    .set_index('product_id')
    .assign(
        # Aligned on the product_id index.
        product_sales=product_sales,
        group_total_sales=lambda frame: frame['product_group_1_2_3_code'].map(group_sales),
        sales_ratio=lambda frame: frame['product_sales'] / frame['group_total_sales'],
    )
    .reset_index()
)



# Pair every (week, group) total with each product of that group. The default
# inner join keeps only the groups present on both sides.
merged_data = grouped_sales[
    ['week_starting_date', 'product_group_1_2_3_code', 'sales_quantity']
].merge(
    sales_ratio[['product_group_1_2_3_code', 'product_id', 'sales_ratio']],
    on='product_group_1_2_3_code',
)

# A product's forecast is the group total for the week times the product's share.
merged_data['predicted_sales'] = merged_data['sales_quantity'].mul(merged_data['sales_ratio'])

# Keep only the key columns and the forecast.
predictions_df = merged_data.loc[:, ['week_starting_date', 'product_id', 'predicted_sales']]

In [128]:
# Count the missing values in each column of the merged frame.
merged_data.isna().sum()

week_starting_date           0
product_group_1_2_3_code     0
sales_quantity               0
product_id                   0
sales_ratio                 60
predicted_sales             60
dtype: int64

In [129]:
# Summary statistics of the per-product predictions before they are joined onto the submission.
predictions_df.describe()

Unnamed: 0,product_id,predicted_sales
count,173776.0,173716.0
mean,43328.625794,17.86189
std,25188.413096,101.598082
min,0.0,-0.80857
25%,20935.75,0.0
50%,42691.5,0.0
75%,63920.75,0.247221
max,89717.0,6591.471846


In [130]:
# First pass at the submission: left-join the predictions onto the template so
# every template row is kept, even when no prediction exists for it.
thirteenth_submission_df = sample_submission_df.merge(
    predictions_df, on=["product_id", "week_starting_date"], how="left"
)

# Publish the forecast under the "prediction" column and keep only the submission columns.
thirteenth_submission_df = thirteenth_submission_df.assign(
    prediction=thirteenth_submission_df["predicted_sales"]
).loc[:, ["product_id", "week_starting_date", "prediction"]]

# Negative predictions become 0; missing predictions stay missing.
thirteenth_submission_df = thirteenth_submission_df.assign(
    prediction=thirteenth_submission_df["prediction"].mask(
        thirteenth_submission_df["prediction"] < 0, 0
    )
)

In [131]:
# Summary statistics of the first-pass submission (missing predictions not yet filled).
thirteenth_submission_df.describe()

Unnamed: 0,product_id,prediction
count,144980.0,142380.0
mean,43665.162974,21.78275
std,24801.597894,111.835793
min,2.0,0.0
25%,22941.0,0.0
50%,42246.0,0.08333
75%,64544.0,0.427399
max,89717.0,6591.471846


In [132]:
# Count the submission rows that received no prediction from the left join.
thirteenth_submission_df.isna().sum()

product_id               0
week_starting_date       0
prediction            2600
dtype: int64

In [133]:
# Rebuild the submission from the template, this time filling the rows that the
# left join leaves without a prediction.
thirteenth_submission_df = sample_submission_df.merge(
    predictions_df, on=["product_id", "week_starting_date"], how="left"
)

# Attach each product's composite group code as recorded in the training data.
thirteenth_submission_df = thirteenth_submission_df.merge(
    train_df[['product_id', 'product_group_1_2_3_code']].drop_duplicates(),
    on='product_id',
    how='left'
)

# Fill missing predictions with the mean prediction of the same product group
# and week. transform('mean') broadcasts each group's mean back onto its rows,
# which avoids calling a Python lambda once per group.
thirteenth_submission_df['predicted_sales'] = thirteenth_submission_df['predicted_sales'].fillna(
    thirteenth_submission_df.groupby(
        ['product_group_1_2_3_code', 'week_starting_date']
    )['predicted_sales'].transform('mean')
)

# Rows that are still empty get the flat fallback value 3.5 (the rationale for
# this constant is not recorded in the notebook). The fallback is applied to the
# prediction column only, so the helper columns are not overwritten with 3.5.
thirteenth_submission_df["prediction"] = thirteenth_submission_df["predicted_sales"].fillna(3.5)

In [134]:
# Show any rows whose prediction is still missing after the fill steps.
thirteenth_submission_df[thirteenth_submission_df["prediction"].isna()]

Unnamed: 0,product_id,week_starting_date,prediction,predicted_sales,product_group_1_2_3_code


In [135]:


# Reduce the frame to the submission columns and replace negative predictions with 0.
thirteenth_submission_df = (
    thirteenth_submission_df
    .loc[:, ["product_id", "week_starting_date", "prediction"]]
    .assign(prediction=lambda frame: frame["prediction"].mask(frame["prediction"] < 0, 0))
)

# Check for missing values in each column of the final frame.
thirteenth_submission_df.isna().sum()

product_id            0
week_starting_date    0
prediction            0
dtype: int64

In [136]:
# Summary statistics of the final submission after filling and clamping.
thirteenth_submission_df.describe()

Unnamed: 0,product_id,prediction
count,144980.0,144980.0
mean,43665.162974,21.47373
std,24801.597894,110.855977
min,2.0,0.0
25%,22941.0,0.0
50%,42246.0,0.085933
75%,64544.0,0.501545
max,89717.0,6591.471846


In [137]:
# Write the submission as a pipe-separated file without the DataFrame index.
thirteenth_submission_df.to_csv(
    'thirteenth_submission.csv',
    sep="|",
    index=False,
)

In [138]:
# Fourteenth submission: start from the thirteenth submission and multiply every prediction by (thirteenth-submission total sales + promotion-calculation total sales) / (2 * thirteenth-submission total sales).

