In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [10]:
# Load your dataset
# Replace the file path with your dataset's file path
df = pd.read_csv('updated_data.csv')

In [11]:
# Check for missing values
missing_values = df.isnull().sum()
print(f"Missing values:\n{missing_values}")

# Handle missing values (fill with the mean for numerical columns or mode for categorical columns)
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns  # Select only numeric columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())  # Fill numerical columns with mean

# Handle categorical columns (fill with the mode)
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])

# Check for missing values after filling
missing_values = df.isnull().sum()
print(f"Missing values after filling:\n{missing_values}")

# Convert 'Order Date' to datetime format
if 'Order Date' in df.columns:  # Check if 'Order Date' exists before converting
    df['Order Date'] = pd.to_datetime(df['Order Date'])

    # Feature engineering: Create new columns from the 'Order Date'
    df['Year'] = df['Order Date'].dt.year
    df['Month'] = df['Order Date'].dt.month
    df['Day'] = df['Order Date'].dt.day
    df['Weekday'] = df['Order Date'].dt.weekday  # Monday = 0, Sunday = 6

    # Drop the original 'Order Date' column if not needed
    df.drop(columns=['Order Date'], inplace=True)
else:
    print("The 'Order Date' column is missing in the dataset.")

# Encode categorical columns (e.g., if 'Product_Id' or others are categorical)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col] = le.fit_transform(df[col])

# Normalize numerical features using StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numerical_cols = ['Quantity', 'Visibility', 'Cart Count', 'Cumulative Sales', 
                  'Sales Last 7 Days', 'Trend', 'Sort Rank', 'Interaction Score', 
                  'Stars', 'Reviews Count', 'Base_Price', 'Market_Price']
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Check the processed dataframe
print(df.head())


Missing values:
Order Date               0
Product_Id               0
Unnamed: 2           20000
Quantity                 0
Visibility               0
Cart Count               0
Cumulative Sales         0
7-Day Moving Avg         0
Sales Last 7 Days      100
Trend                  100
Sort Rank                0
Interaction Score        0
Stars                    0
Reviews Count            0
Base_Price               0
Market_Price             0
dtype: int64
Missing values after filling:
Order Date               0
Product_Id               0
Unnamed: 2           20000
Quantity                 0
Visibility               0
Cart Count               0
Cumulative Sales         0
7-Day Moving Avg         0
Sales Last 7 Days        0
Trend                    0
Sort Rank                0
Interaction Score        0
Stars                    0
Reviews Count            0
Base_Price               0
Market_Price             0
dtype: int64
   Product_Id  Unnamed: 2  Quantity  Visibility  Cart Count  Cum

In [12]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from statsmodels.tsa.arima.model import ARIMA
from sklearn.preprocessing import StandardScaler

In [13]:
print(df.columns)


Index(['Product_Id', 'Unnamed: 2', 'Quantity', 'Visibility', 'Cart Count',
       'Cumulative Sales', '7-Day Moving Avg', 'Sales Last 7 Days', 'Trend',
       'Sort Rank', 'Interaction Score', 'Stars', 'Reviews Count',
       'Base_Price', 'Market_Price', 'Year', 'Month', 'Day', 'Weekday'],
      dtype='object')


In [14]:
# Assuming df is your DataFrame and is already loaded
# Check for non-positive values in Quantity and replace them or filter out
print(df['Quantity'].min())  # Check if there's any zero or negative value

# Drop rows with non-positive Quantity or apply a small value adjustment
df = df[df['Quantity'] > 0]

# Feature Selection
X = df[['Visibility', 'Cart Count', 'Cumulative Sales', 'Sales Last 7 Days',
        'Trend', 'Sort Rank', 'Interaction Score', 'Stars',
        'Reviews Count', 'Base_Price', 'Market_Price']]
Y = df['Quantity']

# Apply log1p transformation to Quantity
Y = np.log1p(Y)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

### 1. Linear Regression with ElasticNet Regularization
elastic_net = ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=1000)
elastic_net.fit(X_train, Y_train)
Y_pred_en = elastic_net.predict(X_test)

# Evaluate ElasticNet
mse_en = mean_squared_error(Y_test, Y_pred_en)
r2_en = r2_score(Y_test, Y_pred_en)
print("\nElasticNet Evaluation:")
print(f"MSE: {mse_en}, R2: {r2_en}")

### 2. Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train, Y_train)
Y_pred_rf = rf.predict(X_test)

# Evaluate Random Forest
mse_rf = mean_squared_error(Y_test, Y_pred_rf)
r2_rf = r2_score(Y_test, Y_pred_rf)
print("\nRandom Forest Evaluation:")
print(f"MSE: {mse_rf}, R2: {r2_rf}")

### 3. XGBoost Regressor
xgb = XGBRegressor(learning_rate=0.01, n_estimators=100, max_depth=5, random_state=42)
xgb.fit(X_train, Y_train)
Y_pred_xgb = xgb.predict(X_test)

# Evaluate XGBoost
mse_xgb = mean_squared_error(Y_test, Y_pred_xgb)
r2_xgb = r2_score(Y_test, Y_pred_xgb)
print("\nXGBoost Evaluation:")
print(f"MSE: {mse_xgb}, R2: {r2_xgb}")

### 4. LSTM for Temporal Analysis
# Reshape data for LSTM
X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# LSTM Model
lstm_model = Sequential([
    LSTM(50, activation='relu', input_shape=(1, X_train.shape[1])),
    Dense(1)
])
lstm_model.compile(optimizer='adam', loss='mse')
lstm_model.fit(X_train_lstm, Y_train, epochs=50, batch_size=32, verbose=1)

# Predict and Evaluate LSTM
Y_pred_lstm = lstm_model.predict(X_test_lstm)
mse_lstm = mean_squared_error(Y_test, Y_pred_lstm)
r2_lstm = r2_score(Y_test, Y_pred_lstm)
print("\nLSTM Evaluation:")
print(f"MSE: {mse_lstm}, R2: {r2_lstm}")

### 5. ARIMA for Trend and Seasonality
# Recreate 'Order Date' from Year, Month, and Day columns
df['Order Date'] = pd.to_datetime(df[['Year', 'Month', 'Day']])

# Set 'Order Date' as index
df.set_index('Order Date', inplace=True)

# Prepare quantity series
quantity_series = df['Quantity']

# ARIMA Model (p=1, d=2, q=1 as per ACF and PACF)
arima_model = ARIMA(quantity_series, order=(1, 2, 1))
arima_result = arima_model.fit()
arima_forecast = arima_result.forecast(steps=len(Y_test))

# Evaluate ARIMA
mse_arima = mean_squared_error(Y_test[:len(arima_forecast)], arima_forecast)
r2_arima = r2_score(Y_test[:len(arima_forecast)], arima_forecast)
print("\nARIMA Evaluation:")
print(f"MSE: {mse_arima}, R2: {r2_arima}")

### Ensemble of Predictions
# Median of all predictions
Y_pred_ensemble = np.median(
    np.vstack([Y_pred_en, Y_pred_rf, Y_pred_xgb, Y_pred_lstm.flatten()[:len(Y_pred_en)]]), axis=0)

# Evaluate Ensemble
mse_ensemble = mean_squared_error(Y_test, Y_pred_ensemble)
r2_ensemble = r2_score(Y_test, Y_pred_ensemble)
print("\nEnsemble Evaluation:")
print(f"MSE: {mse_ensemble}, R2: {r2_ensemble}")

-1.6533675481262433

ElasticNet Evaluation:
MSE: 0.0012473441348424515, R2: 0.9843321854649774

Random Forest Evaluation:
MSE: 9.756196693795924e-30, R2: 1.0

XGBoost Evaluation:
MSE: 0.010693782175288323, R2: 0.8656760463129844
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

LSTM Evaluation:
MSE: 8.28242541969597e-06, R2: 0.9998959649532546


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)



ARIMA Evaluation:
MSE: 0.9642571358069402, R2: -11.111975793916288

Ensemble Evaluation:
MSE: 0.00021742405539251078, R2: 0.9972689495383144


  return get_prediction_index(
  return get_prediction_index(


In [23]:
# Define the model you want to use for predictions
model = arima_model  # Replace with rf, xgb, lstm_model, or arima_model as needed

# New data point for prediction
new_data = np.array([[2888, 3, 200, 30, 0.8, 5, 10, 4.5, 25, 800, 1500]])  # Replace with your actual values

# Scale the new data using the same scaler used for training
new_data_scaled = scaler.transform(new_data)

# ElasticNet, RandomForest, or XGBoost Prediction
if model in [elastic_net, rf, xgb]:
    predicted_quantity = model.predict(new_data_scaled)
    predicted_quantity_original = np.expm1(predicted_quantity)  # Inverse transform to original scale
    print(f"Predicted Quantity (Log Scale): {predicted_quantity[0]}")
    print(f"Predicted Quantity (Original Scale): {predicted_quantity_original[0]:.2f}")

# LSTM Prediction
elif model == lstm_model:
    new_data_lstm = new_data_scaled.reshape((new_data_scaled.shape[0], 1, new_data_scaled.shape[1]))
    predicted_quantity = model.predict(new_data_lstm)
    predicted_quantity_original = np.expm1(predicted_quantity)  # Inverse transform to original scale
    print(f"Predicted Quantity (Log Scale): {predicted_quantity[0][0]}")
    print(f"Predicted Quantity (Original Scale): {predicted_quantity_original[0][0]:.2f}")

# ARIMA Prediction
elif model == arima_model:
    # For ARIMA, provide a time series of features if required. Simplified here for one step.
    arima_forecast = model.forecast(steps=1)
    predicted_quantity = arima_forecast[0]
    predicted_quantity_original = np.expm1(predicted_quantity)  # Inverse transform to original scale
    print(f"Predicted Quantity (Log Scale): {predicted_quantity}")
    print(f"Predicted Quantity (Original Scale): {predicted_quantity_original:.2f}")


Predicted Quantity (Log Scale): 0.8276808261871338
Predicted Quantity (Original Scale): 1.29


