In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, explained_variance_score
from sklearn.preprocessing import StandardScaler
import pickle
from datetime import datetime, timedelta

In [3]:
# --- Data Loading and Preprocessing ---

# Load your data (replace 'train.csv' with your actual file name)
data = pd.read_csv("train.csv")

# Only the final expression of a cell is rendered, so a bare `data.head()`
# placed before this line produced no output; it has been dropped.
# Preview the last rows to confirm the column layout and the date format.
data.tail()

Unnamed: 0,date,store,item,sales
182595,27/12/2023,10,10,72
182596,28/12/2023,10,10,81
182597,29/12/2023,10,10,69
182598,30/12/2023,10,10,86
182599,31/12/2023,10,10,67


In [4]:
# Parse the 'date' strings as day/month/year. With errors='coerce', any value
# that does not match the format becomes NaT instead of raising.
parsed_dates = pd.to_datetime(data['date'], format='%d/%m/%Y', errors='coerce')
data['date'] = parsed_dates

# Discard the rows whose date is missing or could not be parsed (NaT).
data.dropna(subset=['date'], inplace=True)

In [5]:
# Derive calendar features from the 'date' column.
# The explicit day-first format keeps this cell correct even if it is run
# without the previous parsing cell: a format-less pd.to_datetime would read
# ambiguous strings such as 05/01/2023 month-first (or fail on days above 12).
# On an already-parsed datetime column this call leaves the values unchanged.
data['date'] = pd.to_datetime(data['date'], format='%d/%m/%Y')

date_parts = data['date'].dt
data['year'] = date_parts.year
data['month'] = date_parts.month
data['day'] = date_parts.day
data['day_of_week'] = date_parts.dayofweek  # Monday=0 ... Sunday=6

In [6]:
# Model inputs: the calendar parts plus the store and item identifiers.
feature_columns = ['year', 'month', 'day', 'day_of_week', 'store', 'item']
X = data[feature_columns]

# Regression target: the 'sales' column.
y = data['sales']

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle reproducible.
# NOTE(review): the split is random across dates, so test rows are interleaved
# in time with training rows — confirm this is intended for a forecasting task.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Standardize the features: learn the mean/std on the training split only,
# then apply that same transform to both splits.
# Tree ensembles such as Random Forest split on thresholds and are insensitive
# to feature scale, so this step is not needed for accuracy; the fitted scaler
# is kept because the example-prediction cell calls scaler.transform.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# --- Model Training ---

# Hyperparameters for the forest, collected in one place for easy tuning.
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
}

# Create the Random Forest Regressor with the chosen parameters.
rf_model = RandomForestRegressor(**rf_params)

In [10]:
# Fit the forest on the scaled training features and their sales targets.
rf_model.fit(X=X_train, y=y_train)

# --- Model Evaluation ---

In [11]:
# Predict sales for the held-out (scaled) test rows.
y_pred = rf_model.predict(X=X_test)

In [12]:
# Evaluate the model's performance using multiple metrics
mse = mean_squared_error(y_test, y_pred)
# RMSE is taken as the square root of the MSE: the `squared=False` argument of
# mean_squared_error is deprecated since scikit-learn 1.4 and removed in 1.6,
# so relying on it breaks the cell on current releases.
rmse = mse ** 0.5  # Root Mean Squared Error
mae = mean_absolute_error(y_test, y_pred)
median_ae = median_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
explained_variance = explained_variance_score(y_test, y_pred)

# Report every metric, one per line.
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"Median Absolute Error: {median_ae}")
print(f"R-squared: {r2}")
print(f"Explained Variance: {explained_variance}")

Mean Squared Error: 97.67460621398416
Root Mean Squared Error: 9.883046403512642
Mean Absolute Error: 7.243801251362491
Median Absolute Error: 5.339892688373396
R-squared: 0.8676540168198292
Explained Variance: 0.8676568172854493


In [13]:
# --- Model Saving ---

# Save the model. The context manager closes the file handle as soon as the
# dump finishes, so the pickle is fully flushed to disk and no handle leaks.
# NOTE: only the forest is stored; the fitted `scaler` is not saved here.
filename = 'rf_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(rf_model, model_file)

In [14]:
# Load the saved model back from disk; the context manager closes the file
# handle once loading is done.
# Caution: pickle.load can execute arbitrary code — only load files you trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [15]:
# Example prediction:
#  - forecast sales for item 1 in store 1 on 2024-03-15
prediction_date = datetime(2024, 3, 15)
prediction_item = 1

# Feature columns, in the same order used to build the training matrix.
feature_columns = ['year', 'month', 'day', 'day_of_week', 'store', 'item']

# One-row frame holding the date, its calendar parts, and the store/item ids.
prediction_data = pd.DataFrame({'date': [prediction_date]}).assign(
    year=lambda frame: frame['date'].dt.year,
    month=lambda frame: frame['date'].dt.month,
    day=lambda frame: frame['date'].dt.day,
    day_of_week=lambda frame: frame['date'].dt.dayofweek,
    store=1,  # Assuming you want to forecast for store 1
    item=prediction_item,
)

# Apply the training-time scaling, then ask the reloaded model for its estimate.
scaled_data = scaler.transform(prediction_data[feature_columns])
predicted_sales = loaded_model.predict(scaled_data)[0]

print(f"Predicted Sales for item {prediction_item} on {prediction_date}: {predicted_sales}")

Predicted Sales for item 1 on 2024-03-15 00:00:00: 19.64914912325275
