In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error




In [2]:
# Locations of the input files, relative to the notebook's working directory.
train_file = "data/train.csv"
products_file = "data/products.csv"
holidays_file = "data/holidays.csv"
sample_submission_file = "data/sample_submission.csv"

# These three files are read with pandas' default CSV settings.
train_df, products_df, holidays_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_file, products_file, holidays_file)
)

# The sample submission uses "|" as its field separator instead of a comma.
sample_submission_df = pd.read_csv(sample_submission_file, sep="|")

# The test set has no path variable; it is read straight from its literal path.
test_df = pd.read_csv("data/test.csv")


In [4]:
# Parse the week-start column of each dated frame into pandas datetimes.
# The column is overwritten in place on all three frames.
date_column = "week_starting_date"
for dated_frame in (train_df, test_df, sample_submission_df):
    dated_frame[date_column] = pd.to_datetime(dated_frame[date_column])

In [6]:
# Attach the product attributes to every training row. This is an inner join,
# so training rows whose product_id is absent from products_df are dropped.
merged_data = train_df.merge(products_df, on="product_id", how="inner")


In [15]:
# Baseline submission: predict each (week, product) with the sales recorded
# for the same product 53 weeks earlier.
first_submission_df = sample_submission_df.copy()

# Shift the training dates forward by 53 weeks (371 days).
# The dates are taken from the merged frame itself, not from train_df:
# column assignment aligns on the index, and the inner merge that produced
# merged_data returns a freshly numbered index. If that merge dropped or
# duplicated any row, train_df's dates would be paired with the wrong
# sales rows.
shifted_train_df = merged_data.copy()
shifted_train_df["week_starting_date"] = (
    shifted_train_df["week_starting_date"] + dt.timedelta(weeks=53)
)

# Keep only the join keys and the value to carry over.
prepared_shifted_train_df = shifted_train_df.loc[:, ["week_starting_date", "product_id", "sales_quantity"]]

# Left join so every submission row is kept; rows with no sale 53 weeks
# earlier end up with a missing sales_quantity.
first_submission_df = first_submission_df.merge(prepared_shifted_train_df, on=["week_starting_date", "product_id"], how="left")

In [20]:
# Build a per-product table of mean sales for the product's
# (brand, product_group_1_code) pair and write it to CSV.
# The inputs are re-read here, so train_df and products_df are replaced by
# fresh, unparsed copies (week_starting_date is not converted in this cell).
train_df = pd.read_csv("data/train.csv")
products_df = pd.read_csv("data/products.csv")

# Handle negative sales: clamp them to zero with a vectorised clip instead of
# calling a Python lambda once per row.
train_df["sales_quantity"] = train_df["sales_quantity"].clip(lower=0)

# Merge train data with products data to include product groups
merged_data = pd.merge(train_df, products_df, on="product_id", how="inner")

# Group by brand and product_group_1_code to calculate mean sales
grouped_sales = (
    merged_data.groupby(["brand", "product_group_1_code"])
    .agg(mean_sales=("sales_quantity", "mean"))
    .reset_index()
)

# Merge grouped data back to the product data; the left join keeps every
# product, including those whose (brand, group) pair has no training rows.
final_data = pd.merge(
    products_df, grouped_sales, on=["brand", "product_group_1_code"], how="left"
)

# Fill missing mean sales with the overall mean sales from train data
overall_mean_sales = train_df["sales_quantity"].mean()
final_data["mean_sales"] = final_data["mean_sales"].fillna(overall_mean_sales)

# Select final columns: product_id, brand, product_group_1_code, mean_sales
final_data = final_data[["product_id", "brand", "product_group_1_code", "mean_sales"]]

# Print the final data
print("Final Data with Mean Sales by Brand and Product Group 1:")
print(final_data)

# Save to a CSV file (optional)
final_data.to_csv("final_mean_sales_by_brand_and_group1.csv", index=False)

Final Data with Mean Sales by Brand and Product Group 1:
       product_id  brand  product_group_1_code  mean_sales
0               2    200                     2   85.602346
1               3     77                     4   18.101756
2               4    322                     4   61.248182
3               6    125                     4   12.114438
4              16    152                     4    0.415120
...           ...    ...                   ...         ...
89713       89677     43                     0   31.743792
89714       89685    125                     4   12.114438
89715       89688    337                     4   46.567579
89716       89691    247                     1   32.108188
89717       89716     81                     4   14.174549

[89718 rows x 4 columns]


In [28]:
# Build a per-product, per-calendar-month table of mean sales for the
# product's (brand, product_group_1_code, product_group_2_code) triple and
# write it to CSV. Defines final_data and overall_monthly_mean_sales.
train_df = pd.read_csv("data/train.csv")
products_df = pd.read_csv("data/products.csv")

# Handle negative sales: clamp them to zero with a vectorised clip instead of
# calling a Python lambda once per row.
train_df["sales_quantity"] = train_df["sales_quantity"].clip(lower=0)

# Convert week_starting_date to datetime and extract the month
train_df["week_starting_date"] = pd.to_datetime(train_df["week_starting_date"])
train_df["month"] = train_df["week_starting_date"].dt.month

# Merge train data with products data
merged_data = pd.merge(train_df, products_df, on="product_id", how="inner")

# Group by brand, product_group_1_code, product_group_2_code, and month to
# calculate mean sales
grouped_monthly_sales = (
    merged_data.groupby(["brand", "product_group_1_code", "product_group_2_code", "month"])
    .agg(mean_sales=("sales_quantity", "mean"))
    .reset_index()
)

# Collect the distinct values of each key that the grid below is built from
brands = products_df["brand"].unique()
product_groups = products_df["product_group_1_code"].unique()
product_groups2 = products_df["product_group_2_code"].unique()
months = list(range(1, 13))

# Generate the full cartesian product of brand, product_group_1_code,
# product_group_2_code, and months.
# NOTE(review): only triples that occur in products_df survive the inner
# merge further down, so this grid is larger than strictly necessary.
full_combinations = pd.MultiIndex.from_product(
    [brands, product_groups, product_groups2, months],
    names=["brand", "product_group_1_code", "product_group_2_code", "month"]
).to_frame(index=False)

# Merge full combinations with grouped sales data; combinations without any
# training rows get a missing mean_sales.
final_data = pd.merge(
    full_combinations, grouped_monthly_sales, on=["brand", "product_group_1_code","product_group_2_code", "month"], how="left"
)

# Fill missing mean sales with the overall mean for the same month.
# This is a vectorised lookup (map + fillna) rather than a row-wise apply.
# A month with no training data at all has no monthly mean, so it falls back
# to the overall mean instead of raising a KeyError.
overall_monthly_mean_sales = train_df.groupby("month")["sales_quantity"].mean()
monthly_fallback = final_data["month"].map(overall_monthly_mean_sales)
final_data["mean_sales"] = (
    final_data["mean_sales"]
    .fillna(monthly_fallback)
    .fillna(train_df["sales_quantity"].mean())
)

# Merge back with product_id from products_df; each product picks up the rows
# of its (brand, group 1, group 2) triple, one per month.
final_data = pd.merge(
    products_df[["product_id", "brand", "product_group_1_code","product_group_2_code"]],
    final_data,
    on=["brand", "product_group_1_code", "product_group_2_code"],
    how="inner"
)

# Select final columns: product_id, brand, product_group_1_code,
# product_group_2_code, month, mean_sales
final_data = final_data[
    ["product_id", "brand", "product_group_1_code","product_group_2_code", "month", "mean_sales"]
]

# Print the final data
print("Final Data with Mean Sales by Brand, Product Group 1, and Month:")
print(final_data)

# Save to a CSV file (optional)
final_data.to_csv("final_monthly_mean_sales_by_brand_and_group1.csv", index=False)


Final Data with Mean Sales by Brand, Product Group 1, and Month:
         product_id  brand  product_group_1_code  product_group_2_code  month  \
0                 2    200                     2                     4      1   
1                 2    200                     2                     4      2   
2                 2    200                     2                     4      3   
3                 2    200                     2                     4      4   
4                 2    200                     2                     4      5   
...             ...    ...                   ...                   ...    ...   
1076611       82002    317                     0                     1      8   
1076612       82002    317                     0                     1      9   
1076613       82002    317                     0                     1     10   
1076614       82002    317                     0                     1     11   
1076615       82002    317                  

In [32]:
# Score the monthly-mean lookup table against the training data with RMSLE.
# Relies on products_df, final_data and overall_monthly_mean_sales already
# being defined by earlier cells.
# NOTE(review): if final_data was built from this same train.csv, the score
# is in-sample and will understate the error on unseen weeks. Confirm.
train_df = pd.read_csv("data/train.csv")

# Preprocess train data
train_df["week_starting_date"] = pd.to_datetime(train_df["week_starting_date"])
train_df["month"] = train_df["week_starting_date"].dt.month

merged_data = pd.merge(train_df, products_df, on="product_id", how="inner")

# From here on train_df refers to the product-enriched frame
train_df = merged_data

# Merge train data with final_data
merged_train = pd.merge(
    train_df,
    final_data,
    on=["product_id","brand", "product_group_1_code", "product_group_2_code", "month"],
    how="left"
)

# Handle missing mean_sales (if any) with the overall mean of the same month.
# This is a vectorised lookup (map + fillna) rather than a row-wise apply.
merged_train["mean_sales"] = merged_train["mean_sales"].fillna(
    merged_train["month"].map(overall_monthly_mean_sales)
)

# Prepare true and predicted values; both are clamped at zero because the
# log-based metric below rejects negative inputs.
y_true = merged_train["sales_quantity"].clip(lower=0)
y_pred = merged_train["mean_sales"].clip(lower=0)  # Ensure non-negative values

# Calculate RMSLE
rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))

# Print evaluation result
print(f"RMSLE on train data: {rmsle}")


RMSLE on train data: 2.298316132025498
