In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd


In [None]:
# --- Input locations ---------------------------------------------------------
train_file = "data/train.csv"
products_file = "data/products.csv"
holidays_file = "data/holidays.csv"
sample_submission_file = "data/sample_submission.csv"

# --- Load the raw tables -----------------------------------------------------
train_df = pd.read_csv(train_file)
products_df = pd.read_csv(products_file)
holidays_df = pd.read_csv(holidays_file)
# Unlike the other files, the sample submission is read with "|" as separator.
sample_submission_df = pd.read_csv(sample_submission_file, delimiter="|")
test_df = pd.read_csv("data/test.csv")

# --- Parse the week column as datetimes in every frame that carries it -------
for _frame in (train_df, test_df, sample_submission_df):
    _frame["week_starting_date"] = pd.to_datetime(_frame["week_starting_date"])

# --- Attach product attributes to each training row --------------------------
# Inner join: training rows whose product_id is absent from products_df are dropped.
merged_data = train_df.merge(products_df, on="product_id", how="inner")


In [1]:


# Train and validate a Random Forest on lagged sales.
#
# Reads : `merged_data` (built in the data-loading cell).
# Writes: `train_df` (feature-engineered, filtered frame), `features`, `target`,
#         `model` (fitted regressor), `rmsle`, and `meta_val` (validation keys
#         plus the model's predictions in `sales_quantity`).
train_df = merged_data.copy()
train_df["week_starting_date"] = pd.to_datetime(train_df["week_starting_date"])

# Order rows by product and then by week so a row-wise shift within a product
# looks back in time.
train_df = train_df.sort_values(by=["product_id", "week_starting_date"])

# Negative sales quantities are floored at 0.
train_df["sales_quantity"] = train_df["sales_quantity"].clip(lower=0)

# Short-term lag features. range(1, 2) yields only lag 1; widen the range to
# add more lags.
for lag in range(1, 2):
    train_df[f"lag_{lag}_sales"] = train_df.groupby("product_id")["sales_quantity"].shift(lag)

# Long-term ("same week last year") lag feature.
# NOTE(review): the column name says 52 but the shift is 53 rows. The test-set
# cell uses the same 53, so change both together if 52 is what is intended.
train_df["lag_52_sales"] = train_df.groupby("product_id")["sales_quantity"].shift(53)

# Feature columns (short lags plus the long-term lag) and target column.
features = [f"lag_{lag}_sales" for lag in range(1, 2)] + ["lag_52_sales"]
target = "sales_quantity"

# Shifting leaves NaN at the start of each product's history; drop those rows.
train_df = train_df.dropna(subset=features)

# Keep only rows with strictly positive sales.
train_df = train_df[train_df[target] > 0]

X = train_df[features]
y = train_df[target]

# Retain product_id and week_starting_date so predictions can be keyed later.
meta_columns = train_df[["product_id", "week_starting_date"]]

# 80% train / 20% validation, taken in row order (no shuffling).
# NOTE(review): rows are sorted by product_id first, so the held-out tail is
# the last products in sort order rather than the most recent weeks - confirm
# this is the intended validation scheme.
X_train, X_val, y_train, y_val, meta_train, meta_val = train_test_split(
    X, y, meta_columns, test_size=0.2, shuffle=False
)

# Fit the Random Forest Regressor.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Predict on the validation set, clipping so no prediction is negative.
y_pred = np.maximum(model.predict(X_val), 0)

# mean_squared_log_error already evaluates log(1 + y), so zeros are safe and no
# extra offset is applied; adding 1 beforehand would score log(2 + y) instead.
rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred))
print(f"Root Mean Squared Logarithmic Error (RMSLE): {rmsle:.4f}")

# Attach predictions to the validation metadata. assign() returns a new frame,
# avoiding a chained-assignment write into the split's output.
meta_val = meta_val.assign(sales_quantity=y_pred)





NameError: name 'merged_data' is not defined

In [None]:
# Build lag features for the submission rows and predict them with `model`.
#
# Reads : `sample_submission_df`, `train_df`, `model` (from earlier cells).
# Writes: `test_df`, `combined_df`, `test_features`, `X_test`,
#         `final_test_predictions`.
test_df = sample_submission_df.copy()

# Stack history and submission rows. Submission rows contribute no
# sales_quantity column, so they carry NaN there after the concat; that NaN is
# what identifies them further down.
# NOTE(review): if the training cell has run, `train_df` has already lost the
# rows removed by its dropna / positive-sales filters, so a row-wise shift here
# may not correspond to a fixed number of calendar weeks - confirm.
combined_df = pd.concat(
    [
        train_df[["product_id", "week_starting_date", "sales_quantity"]],
        test_df[["product_id", "week_starting_date"]],
    ],
    ignore_index=True,
)

# Order by product and then week so shifts within a product look back in time.
combined_df = combined_df.sort_values(by=["product_id", "week_starting_date"])

print(combined_df.head())

# Short-term lag features (range(1, 2) yields only lag 1).
# NOTE(review): this shifts by lag + 53 rows, whereas the training cell builds
# the same-named column with shift(lag). The model therefore sees a differently
# defined feature at prediction time - confirm this is intended.
for lag in range(1, 2):
    combined_df[f"lag_{lag}_sales"] = combined_df.groupby("product_id")["sales_quantity"].shift(lag + 53)

# Long-term lag feature; 53 rows, matching the training cell.
combined_df["lag_52_sales"] = combined_df.groupby("product_id")["sales_quantity"].shift(53)

lag_feature_cols = [f"lag_{lag}_sales" for lag in range(1, 2)] + ["lag_52_sales"]

# Separate the submission rows back out (those with no known sales_quantity).
test_features = combined_df.loc[
    combined_df["sales_quantity"].isna(),
    ["product_id", "week_starting_date"] + lag_feature_cols,
]

# Report how many values are missing per column before imputing.
print(test_features.isna().sum())

# Missing values are filled with 0 (rows are kept, not dropped) so that every
# submission row receives a prediction.
test_features = test_features.fillna(0)

# Predict, clipping so no prediction is negative.
X_test = test_features[lag_feature_cols]
test_features["predicted_sales"] = np.maximum(model.predict(X_test), 0)

# Final predicted test set.
final_test_predictions = test_features[["product_id", "week_starting_date", "predicted_sales"]]

print(final_test_predictions.head())
