Download and Load DataSets

In [None]:
import kagglehub
import pandas as pd
import os

# Download the dataset; the returned path is treated as a directory below.
path = kagglehub.dataset_download("sonawanelalitsunil/warehouse-and-retail-sales")
print("Dataset downloaded to:", path)

# Collect every file under the download directory (recursively) and list them
dataset_files = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(path)
    for file in files
)
for file_path in dataset_files:
    print(os.path.basename(file_path))

# Pick the CSV to load from what is actually on disk instead of relying on a
# hard-coded placeholder name. "sales.csv" is still preferred when it exists.
csv_files = [f for f in dataset_files if f.lower().endswith(".csv")]
if not csv_files:
    raise FileNotFoundError(f"No CSV file found under {path}")
csv_path = next(
    (f for f in csv_files if os.path.basename(f).lower() == "sales.csv"),
    csv_files[0],
)
print("Loading:", csv_path)

# NOTE(review): later cells reference the columns 'Date', 'Sales',
# 'Product_Category' and 'Store_Location' - confirm the loaded file has them.
data = pd.read_csv(csv_path)
print(data.head())


Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Check basic info. DataFrame.info() prints its report itself and returns
# None, so it is called directly to avoid a stray "None" line in the output.
data.info()
print(data.describe())

# Plot sales trend over time. The dates are parsed for the plot only (the
# frame itself is left untouched) so the x-axis is a real time axis rather
# than a categorical axis of raw strings.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=pd.to_datetime(data['Date']), y=data['Sales'], ax=ax)
ax.set_title("Sales Trend Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Sales")
plt.show()

# Check correlations. numeric_only=True is required because DataFrame.corr()
# raises on non-numeric columns (dates, categories) in pandas >= 2.0.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix (numeric columns)")
plt.show()


Data Preprocessing

In [None]:
# Convert the date column BEFORE filling missing values: filling first would
# turn a missing date into 0, which then parses as the 1970 epoch.
data['Date'] = pd.to_datetime(data['Date'])
# Rows without a date cannot be placed on the timeline, so they are dropped.
data = data.dropna(subset=['Date'])

# Handle the remaining missing values
data = data.fillna(0)

# Sort chronologically so that shift(1) below refers to an earlier record.
# A stable sort keeps the original order of rows that share a date.
data = data.sort_values('Date', kind='stable')

# Feature engineering: month, day, weekday
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data['Weekday'] = data['Date'].dt.weekday

# Create lag feature (previous sales record of the same series). This must run
# before one-hot encoding, which removes the two grouping columns.
# NOTE(review): assumes one row per (Product_Category, Store_Location, Date);
# if several rows share a date, the lag is the previous row, not the previous day.
data['Sales_lag_1'] = (
    data.groupby(['Product_Category', 'Store_Location'], sort=False)['Sales'].shift(1)
)

# Encode categorical variables. dtype=float keeps the feature matrix purely
# numeric; the default bool dummies mixed with float columns would produce an
# object array when converted to NumPy.
data = pd.get_dummies(
    data, columns=['Product_Category', 'Store_Location'], dtype=float
)

# Drop the first record of every series, which has no previous value to lag
data = data.dropna()


Train/Test split

In [None]:
# Chronological split: rows dated before 2023-01-01 train the models,
# rows dated on or after that day are held out for testing.
in_train_period = data['Date'] < '2023-01-01'
in_test_period = data['Date'] >= '2023-01-01'
train, test = data[in_train_period], data[in_test_period]

# The target and the raw timestamp are excluded from the feature matrices
non_feature_columns = ['Sales', 'Date']
X_train, y_train = train.drop(columns=non_feature_columns), train['Sales']
X_test, y_test = test.drop(columns=non_feature_columns), test['Sales']


Build ML models

In [None]:
#Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit an ordinary least-squares model (fit() returns the fitted estimator)
# and score its predictions on the held-out period.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

lr_r2 = r2_score(y_test, y_pred_lr)
print("Linear Regression R²:", lr_r2)



In [None]:
#Random Forest
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree forest with a fixed seed (fit() returns the fitted estimator)
# and score its predictions on the held-out period.
rf = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

rf_r2 = r2_score(y_test, y_pred_rf)
print("Random Forest R²:", rf_r2)
    

In [None]:
#LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import numpy as np

# Seed the random number generators so repeated runs give comparable results
# (the random forest cell is seeded with 42 as well).
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

# Reshape for LSTM: [samples, timesteps, features], with a single timestep.
# The explicit float32 cast avoids an object-dtype array when the feature
# frame mixes bool and float columns, which TensorFlow cannot convert.
X_train_lstm = np.asarray(X_train, dtype='float32').reshape(
    (X_train.shape[0], 1, X_train.shape[1])
)
X_test_lstm = np.asarray(X_test, dtype='float32').reshape(
    (X_test.shape[0], 1, X_test.shape[1])
)
y_train_lstm = np.asarray(y_train, dtype='float32')

# NOTE(review): the features are fed in unscaled; with a ReLU LSTM this may
# make training unstable - consider standardising the inputs.
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
model.fit(X_train_lstm, y_train_lstm, epochs=50, batch_size=32, verbose=1)

# predict() returns shape (n_samples, 1); flatten it to 1-D so it has the
# same shape as the predictions of the other models.
y_pred_lstm = model.predict(X_test_lstm).ravel()
  

Evaluate Model

In [None]:
from sklearn.metrics import mean_absolute_error

def evaluate(y_true, y_pred):
    """Compute MAE, RMSE and R² for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, one per entry of ``y_true``.

    Returns
    -------
    tuple of float
        ``(mae, rmse, r2)`` in that order.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is taken as the square root of the MSE: the `squared=False` keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
    # raises TypeError.
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    r2 = r2_score(y_true, y_pred)
    # Plain floats keep the printed tuple readable across NumPy versions
    return float(mae), float(rmse), float(r2)

# Report (MAE, RMSE, R²) for every model on the same held-out targets
model_predictions = (
    ("Linear Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
    ("LSTM", y_pred_lstm),
)
for model_label, predictions in model_predictions:
    print(f"{model_label}:", evaluate(y_test, predictions))
 