## 1. Import Libraries and Load Data

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import joblib

# Load the dataset
file_path = '/content/drive/MyDrive/synthetic_walmart_data_tx.csv'
data = pd.read_csv(file_path)


# 2. Convert 'Date' to Datetime Format


In [None]:
# Convert 'Date' to datetime format
data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y')  # Changed the format string


# 3. Handle Categorical Variables using Label Encoding


In [None]:
# Handle categorical variables using Label Encoding
label_encoders = {}
categorical_columns = [
    'Product Category', 'Promotions/Discounts', 'Holidays/Events', 'Weather Conditions',
    'Gender', 'Income Level', 'Loyalty Program Membership', 'Preferred Product Categories', 'Preferred Purchase Channels'
]

for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le


# 4. Add Date Features


In [None]:
# Add Date features
data['Week'] = data['Date'].dt.isocalendar().week
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year


# 5. Select Features and Target Variable


In [None]:
# Select features and target variable
features = [
    'Product ID', 'Product Category', 'Store ID', 'Price', 'Promotions/Discounts', 'Stock Levels',
    'Holidays/Events', 'Weather Conditions', 'Economic Indicators', 'Age', 'Gender', 'Income Level',
    'Location', 'Purchase Frequency', 'Average Purchase Value', 'Recency of Last Purchase', 'Loyalty Program Membership',
    'Loyalty Points', 'Preferred Product Categories', 'Preferred Purchase Channels', 'Standard Deviation of Demand',
    'Lead Time', 'Ordering Cost', 'Holding Cost', 'Current Stock Level', 'Week', 'Month', 'Year'
]
target = 'Units Sold'

X = data[features]
y = data[target]


# 6. Split the Data into Training and Testing Sets


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 7. Train the Model


In [None]:
# Train a Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


# 8. Make Predictions


In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test)


# 9. Add Predictions to the Test Set


In [None]:
# Add predictions to the test set
X_test['Predicted Units Sold'] = y_pred


# 10. Aggregate Predictions for Weekly, Monthly, and Yearly Demand


In [None]:
# Aggregate predictions for weekly, monthly, and yearly demand
weekly_demand_forecast = X_test.groupby(['Product Category', 'Product ID', 'Week'])['Predicted Units Sold'].sum().reset_index()
monthly_demand_forecast = X_test.groupby(['Product Category', 'Product ID', 'Month'])['Predicted Units Sold'].sum().reset_index()
yearly_demand_forecast = X_test.groupby(['Product Category', 'Product ID', 'Year'])['Predicted Units Sold'].sum().reset_index()

# Merge the aggregated data
demand_forecast = pd.merge(weekly_demand_forecast, monthly_demand_forecast, on=['Product Category', 'Product ID'], suffixes=('_Weekly', '_Monthly'))
demand_forecast = pd.merge(demand_forecast, yearly_demand_forecast, on=['Product Category', 'Product ID'])
demand_forecast.rename(columns={'Predicted Units Sold': 'Predicted Units Sold_Yearly'}, inplace=True)


# 11. Classify Products Based on Dynamic Thresholds


In [None]:
# Define dynamic thresholds for high and low demand classification using percentiles
high_threshold_weekly = demand_forecast['Predicted Units Sold_Weekly'].quantile(0.80)
low_threshold_weekly = demand_forecast['Predicted Units Sold_Weekly'].quantile(0.20)
high_threshold_monthly = demand_forecast['Predicted Units Sold_Monthly'].quantile(0.80)
low_threshold_monthly = demand_forecast['Predicted Units Sold_Monthly'].quantile(0.20)
high_threshold_yearly = demand_forecast['Predicted Units Sold_Yearly'].quantile(0.80)
low_threshold_yearly = demand_forecast['Predicted Units Sold_Yearly'].quantile(0.20)

# Classify products based on the dynamic thresholds
demand_forecast['Demand Level_Weekly'] = np.where(demand_forecast['Predicted Units Sold_Weekly'] > high_threshold_weekly, 'High',
                                                  np.where(demand_forecast['Predicted Units Sold_Weekly'] <= low_threshold_weekly, 'Low', 'Medium'))
demand_forecast['Demand Level_Monthly'] = np.where(demand_forecast['Predicted Units Sold_Monthly'] > high_threshold_monthly, 'High',
                                                   np.where(demand_forecast['Predicted Units Sold_Monthly'] <= low_threshold_monthly, 'Low', 'Medium'))
demand_forecast['Demand Level_Yearly'] = np.where(demand_forecast['Predicted Units Sold_Yearly'] > high_threshold_yearly, 'High',
                                                  np.where(demand_forecast['Predicted Units Sold_Yearly'] <= low_threshold_yearly, 'Low', 'Medium'))


# 12. Decode Product Categories


In [None]:
# Decode product categories back to original names
demand_forecast['Product Category'] = label_encoders['Product Category'].inverse_transform(demand_forecast['Product Category'])


# 13. Create DataFrames for High and Low Demand


In [None]:
# Create dataframes for high and low demand for weekly, monthly, and yearly
high_demand_weekly = demand_forecast[demand_forecast['Demand Level_Weekly'] == 'High'].groupby(['Product Category', 'Product ID'])['Predicted Units Sold_Weekly'].sum().reset_index()
low_demand_weekly = demand_forecast[demand_forecast['Demand Level_Weekly'] == 'Low'].groupby(['Product Category', 'Product ID'])['Predicted Units Sold_Weekly'].sum().reset_index()

high_demand_monthly = demand_forecast[demand_forecast['Demand Level_Monthly'] == 'High'].groupby(['Product Category', 'Product ID'])['Predicted Units Sold_Monthly'].sum().reset_index()
low_demand_monthly = demand_forecast[demand_forecast['Demand Level_Monthly'] == 'Low'].groupby(['Product Category', 'Product ID'])['Predicted Units Sold_Monthly'].sum().reset_index()

high_demand_yearly = demand_forecast[demand_forecast['Demand Level_Yearly'] == 'High'].groupby(['Product Category', 'Product ID'])['Predicted Units Sold_Yearly'].sum().reset_index()
low_demand_yearly = demand_forecast[demand_forecast['Demand Level_Yearly'] == 'Low'].groupby(['Product Category', 'Product ID'])['Predicted Units Sold_Yearly'].sum().reset_index()


# 14. Save the Model and DataFrames


In [None]:
# Save the model using joblib
model_file_path = 'demand_forecast_model.pkl'
joblib.dump(model, model_file_path)

# Save the high and low demand dataframes
high_demand_weekly.to_csv('high_demand_weekly.csv', index=False)
low_demand_weekly.to_csv('low_demand_weekly.csv', index=False)
high_demand_monthly.to_csv('high_demand_monthly.csv', index=False)
low_demand_monthly.to_csv('low_demand_monthly.csv', index=False)
high_demand_yearly.to_csv('high_demand_yearly.csv', index=False)
low_demand_yearly.to_csv('low_demand_yearly.csv', index=False)



# Error of the Model

In [None]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error (RMSE): {rmse}')

In [None]:
high_demand_weekly

Unnamed: 0,Product Category,Product ID,Predicted Units Sold_Weekly
0,Clothing,1,57.01
1,Clothing,9,54.56
2,Clothing,24,1082.43
3,Clothing,28,505.08
4,Clothing,29,54.39
...,...,...,...
108,Toys,85,62.69
109,Toys,87,554.10
110,Toys,95,216.68
111,Toys,97,55.14


In [None]:
low_demand_weekly

Unnamed: 0,Product Category,Product ID,Predicted Units Sold_Weekly
0,Clothing,2,181.88
1,Clothing,7,266.58
2,Clothing,11,827.91
3,Clothing,25,180.64
4,Clothing,28,418.77
...,...,...,...
99,Toys,69,166.48
100,Toys,79,784.08
101,Toys,81,41.70
102,Toys,96,41.43
