In [1]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Daily Sales Prediction Model

In [2]:
# Load statsfinal.csv into a DataFrame and preview its first rows.
raw_csv_path = "statsfinal.csv"
sales_df = pd.read_csv(raw_csv_path)
sales_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Q-P1,Q-P2,Q-P3,Q-P4,S-P1,S-P2,S-P3,S-P4
0,0,13-06-2010,5422,3725,576,907,17187.74,23616.5,3121.92,6466.91
1,1,14-06-2010,7047,779,3578,1574,22338.99,4938.86,19392.76,11222.62
2,2,15-06-2010,1572,2082,595,1145,4983.24,13199.88,3224.9,8163.85
3,3,16-06-2010,5657,2399,3140,1672,17932.69,15209.66,17018.8,11921.36
4,4,17-06-2010,3668,3207,2184,708,11627.56,20332.38,11837.28,5048.04


In [3]:
# Parse the day-first 'Date' strings. Values that cannot be parsed become NaT
# (errors='coerce'), which shows up as NaN in the derived columns below.
parsed_dates = pd.to_datetime(sales_df['Date'], dayfirst=True, errors='coerce')

# Drop the raw date, the S-P* columns and the 'Unnamed: 0' column, then
# append the calendar features extracted from the parsed dates.
sales_df = (
    sales_df
    .drop(columns=['Date', 'S-P1', 'S-P2', 'S-P3', 'S-P4', 'Unnamed: 0'])
    .assign(
        Year=parsed_dates.dt.year,
        Month=parsed_dates.dt.month,
        Day=parsed_dates.dt.day,
    )
)
sales_df.head()

Unnamed: 0,Q-P1,Q-P2,Q-P3,Q-P4,Year,Month,Day
0,5422,3725,576,907,2010.0,6.0,13.0
1,7047,779,3578,1574,2010.0,6.0,14.0
2,1572,2082,595,1145,2010.0,6.0,15.0
3,5657,2399,3140,1672,2010.0,6.0,16.0
4,3668,3207,2184,708,2010.0,6.0,17.0


In [4]:
# Discard every row that contains at least one missing value
sales_df = sales_df.dropna(axis=0, how='any')

# Confirm, column by column, that no missing values remain
missing_per_column = sales_df.isna().sum()
print(missing_per_column)

Q-P1     0
Q-P2     0
Q-P3     0
Q-P4     0
Year     0
Month    0
Day      0
dtype: int64


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
summary_stats = sales_df.describe()
print(summary_stats)

              Q-P1         Q-P2         Q-P3         Q-P4        Year  \
count  4574.000000  4574.000000  4574.000000  4574.000000  4574.00000   
mean   4123.342589  2129.705072  3143.769786  1123.738303  2016.26585   
std    2243.691134  1089.503315  1671.052866   497.813557     3.66205   
min     254.000000   251.000000   250.000000   250.000000  2010.00000   
25%    2149.500000  1167.250000  1695.250000   696.000000  2013.00000   
50%    4138.000000  2133.500000  3196.500000  1137.000000  2016.00000   
75%    6072.000000  3069.750000  4564.750000  1545.750000  2019.00000   
max    7998.000000  3998.000000  6000.000000  2000.000000  2023.00000   

             Month          Day  
count  4574.000000  4574.000000  
mean      6.576301    15.605378  
std       3.455217     8.726306  
min       1.000000     1.000000  
25%       4.000000     8.000000  
50%       7.000000    16.000000  
75%      10.000000    23.000000  
max      12.000000    31.000000  


In [6]:
# Model inputs: the calendar features
feature_columns = ['Year', 'Month', 'Day']
X = sales_df[feature_columns]

# Model outputs: the four Q-P* columns, predicted jointly by one model
target_columns = ['Q-P1', 'Q-P2', 'Q-P3', 'Q-P4']
y = sales_df[target_columns]

In [7]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
# NOTE(review): rows are shuffled before splitting (train_test_split's default),
# so test dates are interleaved with training dates. If the goal is to forecast
# future dates, a chronological split may be a fairer evaluation - confirm intent.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Fit a 100-tree random forest on the training split; the fixed seed keeps the
# fitted model reproducible between runs.
forest_settings = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_settings)
model.fit(X_train, y_train)

In [9]:
# Generate predictions for the held-out rows; the result is indexed below as a
# 2-D array with one column per target.
y_pred = model.predict(X=X_test)

In [10]:
# Targets listed in the same order as the columns of y_pred
scored_targets = ['Q-P1', 'Q-P2', 'Q-P3', 'Q-P4']

# Root mean squared error for each target
rmse_q1, rmse_q2, rmse_q3, rmse_q4 = [
    np.sqrt(mean_squared_error(y_test[name], y_pred[:, position]))
    for position, name in enumerate(scored_targets)
]

# Coefficient of determination (R²) for each target
# NOTE(review): in the recorded run these values are negative, i.e. worse than
# always predicting the mean - worth confirming the features carry any signal.
r2_q1, r2_q2, r2_q3, r2_q4 = [
    r2_score(y_test[name], y_pred[:, position])
    for position, name in enumerate(scored_targets)
]

# Report every RMSE first, then every R², one value per line
report_lines = [
    f'RMSE for P1: {rmse_q1}',
    f'RMSE for P2: {rmse_q2}',
    f'RMSE for P3: {rmse_q3}',
    f'RMSE for P4: {rmse_q4}',
    f'R² for P1: {r2_q1}',
    f'R² for P2: {r2_q2}',
    f'R² for P3: {r2_q3}',
    f'R² for P4: {r2_q4}',
]
print('\n'.join(report_lines))


RMSE for P1: 2401.7007055908134
RMSE for P2: 1193.9433877706729
RMSE for P3: 1828.4767702007662
RMSE for P4: 548.9974033126603
R² for P1: -0.20232987727028995
R² for P2: -0.2311506575791562
R² for P3: -0.18737713883779405
R² for P4: -0.2189206558952448


In [11]:
# Save trained model.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout fails here with
# FileNotFoundError.
model_path = "models/randomforest-daily.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)

['models/randomforest-daily.pkl']

## Sample Prediction

In [12]:
# Reload the persisted daily model from disk
loaded_model = joblib.load("models/randomforest-daily.pkl")

# Daily prediction: a single-row input for 15 June 2024, labelled with the
# feature names used for training. predict returns a NumPy array with one
# row and one column per target.
single_day = np.array([[2024, 6, 15]])
X_test2 = pd.DataFrame(single_day, columns=["Year","Month","Day"])
predicted_df2 = loaded_model.predict(X_test2)
predicted_df2

array([[5605.1 , 1259.74,  861.82, 1074.92]])

# Monthly Sales Prediction Model

In [13]:
# Reload statsfinal.csv from scratch for the monthly model and preview its
# first rows.
raw_csv_path = "statsfinal.csv"
sales_df = pd.read_csv(raw_csv_path)
sales_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Q-P1,Q-P2,Q-P3,Q-P4,S-P1,S-P2,S-P3,S-P4
0,0,13-06-2010,5422,3725,576,907,17187.74,23616.5,3121.92,6466.91
1,1,14-06-2010,7047,779,3578,1574,22338.99,4938.86,19392.76,11222.62
2,2,15-06-2010,1572,2082,595,1145,4983.24,13199.88,3224.9,8163.85
3,3,16-06-2010,5657,2399,3140,1672,17932.69,15209.66,17018.8,11921.36
4,4,17-06-2010,3668,3207,2184,708,11627.56,20332.38,11837.28,5048.04


In [14]:
# Parse the day-first 'Date' strings. Values that cannot be parsed become NaT
# (errors='coerce'), which shows up as NaN in the derived columns below.
parsed_dates = pd.to_datetime(sales_df['Date'], dayfirst=True, errors='coerce')

# Drop the raw date, the S-P* columns and the 'Unnamed: 0' column (the Q-P*
# columns are kept), then append the calendar features.
sales_df = (
    sales_df
    .drop(columns=['Date', 'S-P1', 'S-P2', 'S-P3', 'S-P4', 'Unnamed: 0'])
    .assign(
        Year=parsed_dates.dt.year,
        Month=parsed_dates.dt.month,
        Day=parsed_dates.dt.day,
    )
)
sales_df.head()

Unnamed: 0,Q-P1,Q-P2,Q-P3,Q-P4,Year,Month,Day
0,5422,3725,576,907,2010.0,6.0,13.0
1,7047,779,3578,1574,2010.0,6.0,14.0
2,1572,2082,595,1145,2010.0,6.0,15.0
3,5657,2399,3140,1672,2010.0,6.0,16.0
4,3668,3207,2184,708,2010.0,6.0,17.0


In [15]:
# Discard every row that contains at least one missing value
sales_df = sales_df.dropna(axis=0, how='any')

# Confirm, column by column, that no missing values remain
missing_per_column = sales_df.isna().sum()
print(missing_per_column)

Q-P1     0
Q-P2     0
Q-P3     0
Q-P4     0
Year     0
Month    0
Day      0
dtype: int64


In [16]:
# Collapse the daily rows into one row per (Year, Month) by summing each
# Q-P* column
quantity_columns = ['Q-P1', 'Q-P2', 'Q-P3', 'Q-P4']
monthly_totals = sales_df.groupby(['Year', 'Month']).agg(
    {column: 'sum' for column in quantity_columns}
)
monthly_sales = monthly_totals.reset_index()

# Drop the first and last month, which are treated as incomplete
# (the data starts mid-month), and renumber the remaining rows from 0
monthly_sales = monthly_sales.iloc[1:-1].reset_index(drop=True)
monthly_sales

Unnamed: 0,Year,Month,Q-P1,Q-P2,Q-P3,Q-P4
0,2010.0,7.0,142711,59313,88764,33606
1,2010.0,8.0,98246,62008,100923,34600
2,2010.0,9.0,110010,65823,86726,34996
3,2010.0,10.0,130310,66011,90204,32434
4,2010.0,11.0,105429,72687,103171,31124
...,...,...,...,...,...,...
146,2022.0,9.0,106245,67514,88844,30799
147,2022.0,10.0,102327,59798,95427,32833
148,2022.0,11.0,124306,62804,81694,33990
149,2022.0,12.0,129852,61522,109323,28997


In [17]:
# Model inputs: year and month of each aggregated row
feature_columns = ['Year', 'Month']
X = monthly_sales[feature_columns]

# Model outputs: the four monthly Q-P* totals, predicted jointly by one model
target_columns = ['Q-P1', 'Q-P2', 'Q-P3', 'Q-P4']
y = monthly_sales[target_columns]

In [18]:
# Hold out 20% of the monthly rows for testing; random_state pins the shuffle
# so the split is reproducible.
# NOTE(review): rows are shuffled before splitting (train_test_split's default),
# so test months are interleaved with training months. If the goal is to
# forecast future months, a chronological split may be fairer - confirm intent.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [19]:
# Fit a 100-tree random forest on the monthly training split; the fixed seed
# keeps the fitted model reproducible between runs.
forest_settings = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_settings)
model.fit(X_train, y_train)

In [20]:
# Generate predictions for the held-out months (one column per target)
y_pred = model.predict(X=X_test)

# Targets listed in the same order as the columns of y_pred
scored_targets = ['Q-P1', 'Q-P2', 'Q-P3', 'Q-P4']

# Root mean squared error for each target
rmse_q1, rmse_q2, rmse_q3, rmse_q4 = [
    np.sqrt(mean_squared_error(y_test[name], y_pred[:, position]))
    for position, name in enumerate(scored_targets)
]

# Coefficient of determination (R²) for each target
r2_q1, r2_q2, r2_q3, r2_q4 = [
    r2_score(y_test[name], y_pred[:, position])
    for position, name in enumerate(scored_targets)
]

# Report every RMSE first, then every R², one value per line
report_lines = [
    f'RMSE for P1: {rmse_q1}',
    f'RMSE for P2: {rmse_q2}',
    f'RMSE for P3: {rmse_q3}',
    f'RMSE for P4: {rmse_q4}',
    f'R² for P1: {r2_q1}',
    f'R² for P2: {r2_q2}',
    f'R² for P3: {r2_q3}',
    f'R² for P4: {r2_q4}',
]
print('\n'.join(report_lines))

RMSE for P1: 14832.919099048011
RMSE for P2: 7061.8318039895785
RMSE for P3: 12585.769411186126
RMSE for P4: 3397.3141427618493
R² for P1: -0.6064736817055385
R² for P2: -0.0572498388102729
R² for P3: -0.06023750787593296
R² for P4: -0.06943094427456287


In [21]:
# Save trained model.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout fails here with
# FileNotFoundError.
model_path = "models/randomforest-monthly.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)

['models/randomforest-monthly.pkl']

## Sample Prediction

In [22]:
# Reload the persisted monthly model from disk
loaded_model = joblib.load("models/randomforest-monthly.pkl")

# Monthly prediction: a single-row input for July 2024, labelled with the
# feature names used for training. predict returns a NumPy array with one
# row and one column per target.
single_month = np.array([[2024, 7]])
X_test2 = pd.DataFrame(single_month, columns=["Year","Month"])
predicted_df2 = loaded_model.predict(X_test2)
predicted_df2

array([[131707.76,  64523.92,  91409.87,  34529.5 ]])