In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

class RetailPredictor:
    def __init__(self):
        self.label_encoders = {}
        self.feature_preprocessor = None
        self.rf_model = None
        self.xgb_model = None
        self.dl_model = None
        self.meta_model = None

    def preprocess_features(self, df):
        """
        Preprocess the retail dataset with improved feature engineering
        """
        data = df.copy()

        # Handle outliers
        Q1 = data['Sales'].quantile(0.25)
        Q3 = data['Sales'].quantile(0.75)
        IQR = Q3 - Q1
        data = data[(data['Sales'] >= Q1 - 1.5 * IQR) & (data['Sales'] <= Q3 + 1.5 * IQR)]

        # Create ratio features
        data['Profit_Ratio'] = data['Profit'] / data['Sales']
        data['Discount_Impact'] = data['Sales'] * (1 - data['Discount'])

        # Encode categorical variables
        categorical_cols = ['Category', 'Sub Category', 'City', 'Region', 'State']
        for col in categorical_cols:
            self.label_encoders[col] = LabelEncoder()
            data[col] = self.label_encoders[col].fit_transform(data[col])

        return data

    def prepare_features(self, data):
        """
        Prepare final feature set for modeling
        """
        numeric_features = ['Discount', 'Profit_Ratio', 'Discount_Impact']
        categorical_features = ['Category', 'Sub Category', 'City', 'Region', 'State']

        # Scale numerical features
        self.feature_preprocessor = RobustScaler()
        data[numeric_features] = self.feature_preprocessor.fit_transform(data[numeric_features])

        return numeric_features + categorical_features

    def train_base_models(self, data, features):
        """
        Train base models (RandomForest, XGBoost, and Deep Learning)
        """
        X = data[features]
        y = data['Sales']

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train Random Forest
        self.rf_model = RandomForestRegressor(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, random_state=42)
        self.rf_model.fit(X_train, y_train)
        rf_preds_train = self.rf_model.predict(X_train)
        rf_preds_test = self.rf_model.predict(X_test)

        # Train XGBoost
        self.xgb_model = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=10, random_state=42)
        self.xgb_model.fit(X_train, y_train)
        xgb_preds_train = self.xgb_model.predict(X_train)
        xgb_preds_test = self.xgb_model.predict(X_test)

        # Train Deep Learning Model
        self.dl_model = Sequential([
            Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
            Dropout(0.3),
            Dense(64, activation='relu'),
            Dense(32, activation='relu'),
            Dense(1, activation='linear')  # Regression output
        ])
        self.dl_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mse'])
        self.dl_model.fit(X_train, y_train, epochs=50, batch_size=16, verbose=0)
        dl_preds_train = self.dl_model.predict(X_train).flatten()
        dl_preds_test = self.dl_model.predict(X_test).flatten()

        # Combine predictions as input for meta-model
        X_train_meta = np.column_stack((rf_preds_train, xgb_preds_train, dl_preds_train))
        X_test_meta = np.column_stack((rf_preds_test, xgb_preds_test, dl_preds_test))

        return X_train_meta, X_test_meta, y_train, y_test

    def train_meta_model(self, X_train_meta, X_test_meta, y_train, y_test):
        """
        Train final meta-model using predictions from base models
        """
        self.meta_model = Sequential([
            Dense(32, activation='relu', input_shape=(X_train_meta.shape[1],)),
            Dense(1, activation='linear')
        ])
        self.meta_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mse'])
        self.meta_model.fit(X_train_meta, y_train, epochs=30, batch_size=8, verbose=0)

        y_pred = self.meta_model.predict(X_test_meta).flatten()

        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print("\n🧠 Stacking Ensemble Model Performance:")
        print(f"📉 Mean Squared Error: {mse:.2f}")
        print(f"📊 Root Mean Squared Error: {np.sqrt(mse):.2f}")
        print(f"🎯 R² Score: {r2:.2f}")

        return y_pred

def analyze_sales_data(file_path):
    """
    Analyze sales data using a stacking ensemble approach
    """
    print("📂 Loading data...")
    df = pd.read_csv(file_path)

    predictor = RetailPredictor()

    print("⚙️ Preprocessing data...")
    processed_data = predictor.preprocess_features(df)

    print("🔧 Preparing features...")
    features = predictor.prepare_features(processed_data)

    print("🛠️ Training base models...")
    X_train_meta, X_test_meta, y_train, y_test = predictor.train_base_models(processed_data, features)

    print("🧠 Training meta-model...")
    y_pred = predictor.train_meta_model(X_train_meta, X_test_meta, y_train, y_test)

    return predictor, processed_data, X_test_meta, y_test, y_pred

# Run Ensemble Model
predictor, processed_data, X_test, y_test, y_pred = analyze_sales_data('DMart_Grocery_Sales_-_Retail_Analytics_Dataset.csv')


📂 Loading data...
⚙️ Preprocessing data...
🔧 Preparing features...
🛠️ Training base models...
🧠 Training meta-model...

🧠 Stacking Ensemble Model Performance:
📉 Mean Squared Error: 25.16
📊 Root Mean Squared Error: 5.02
🎯 R² Score: 1.00


In [2]:
def create_test_data():
    return pd.DataFrame({
        'Category': ['Furniture', 'Office Supplies', 'Technology'] * 5,
        'Sub Category': ['Chairs', 'Storage', 'Phones', 'Tables', 'Binders', 'Accessories'] * 2 + ['Phones', 'Storage', 'Chairs'],
        'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'] * 3,
        'Region': ['East', 'West', 'Central', 'South', 'West'] * 3,
        'State': ['NY', 'CA', 'IL', 'TX', 'AZ'] * 3,
        'Sales': np.random.uniform(100, 5000, 15),
        'Discount': np.random.uniform(0, 0.5, 15),
        'Profit': np.random.uniform(10, 1000, 15)
    })

def test_predictor(predictor, test_data):
    """
    Test the trained predictor on new data
    """
    # Preprocess the test data
    processed_test = predictor.preprocess_features(test_data)
    features = predictor.prepare_features(processed_test)
    
    # Get predictions from base models
    X_test = processed_test[features]
    rf_preds = predictor.rf_model.predict(X_test)
    xgb_preds = predictor.xgb_model.predict(X_test)
    dl_preds = predictor.dl_model.predict(X_test).flatten()
    
    # Combine predictions for meta-model
    X_test_meta = np.column_stack((rf_preds, xgb_preds, dl_preds))
    
    # Get final predictions
    final_predictions = predictor.meta_model.predict(X_test_meta).flatten()
    
    # Add predictions to original data for comparison
    results = test_data.copy()
    results['Predicted_Sales'] = final_predictions
    results['Prediction_Difference'] = results['Predicted_Sales'] - results['Sales']
    results['Prediction_Accuracy'] = (1 - abs(results['Prediction_Difference'] / results['Sales'])) * 100
    
    return results

# Generate and test new data
test_data = create_test_data()
results = test_predictor(predictor, test_data)

# Print summary statistics
print("\n📊 Prediction Results Summary:")
print(f"Average Prediction Accuracy: {results['Prediction_Accuracy'].mean():.2f}%")
print(f"Maximum Prediction Error: ${abs(results['Prediction_Difference']).max():.2f}")
print(f"Minimum Prediction Error: ${abs(results['Prediction_Difference']).min():.2f}")

# Display detailed results
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
print("\n📋 Detailed Predictions:")
print(results[['Category', 'Sub Category', 'Sales', 'Predicted_Sales', 'Prediction_Accuracy']])


📊 Prediction Results Summary:
Average Prediction Accuracy: 54.13%
Maximum Prediction Error: $2719.65
Minimum Prediction Error: $197.17

📋 Detailed Predictions:
           Category Sub Category   Sales  Predicted_Sales  Prediction_Accuracy
0         Furniture       Chairs 4672.11          1952.47                41.79
1   Office Supplies      Storage 1790.29          1203.57                67.23
2        Technology       Phones 3724.98          1633.82                43.86
3         Furniture       Tables 4552.76          2325.29                51.07
4   Office Supplies      Binders 1934.55          1134.44                58.64
5        Technology  Accessories 4247.90          1935.01                45.55
6         Furniture       Chairs 4137.47          2237.10                54.07
7   Office Supplies      Storage 1305.90           916.74                70.20
8        Technology       Phones 2506.18          1252.68                49.98
9         Furniture       Tables 4981.11         

In [4]:
from sklearn.model_selection import train_test_split

def sample_test_data(original_df, sample_size=10, random_state=42):
    """
    Randomly sample data points from the original dataset
    """
    # Create a holdout set that wasn't used in training
    _, test_holdout = train_test_split(original_df, test_size=0.1, random_state=random_state)
    
    # Randomly sample from the holdout set
    test_sample = test_holdout.sample(n=sample_size, random_state=random_state)
    return test_sample

def test_predictor(predictor, test_data):
    """
    Test the trained predictor on sampled data
    """
    # Store original sales for comparison
    original_sales = test_data['Sales'].copy()
    
    # Preprocess the test data
    processed_test = predictor.preprocess_features(test_data)
    features = predictor.prepare_features(processed_test)
    
    # Get predictions from base models
    X_test = processed_test[features]
    rf_preds = predictor.rf_model.predict(X_test)
    xgb_preds = predictor.xgb_model.predict(X_test)
    dl_preds = predictor.dl_model.predict(X_test).flatten()
    
    # Combine predictions for meta-model
    X_test_meta = np.column_stack((rf_preds, xgb_preds, dl_preds))
    
    # Get final predictions
    final_predictions = predictor.meta_model.predict(X_test_meta).flatten()
    
    # Create results DataFrame
    results = test_data[['Category', 'Sub Category', 'City', 'Region', 'State', 'Discount', 'Profit']].copy()
    results['Actual_Sales'] = original_sales
    results['Predicted_Sales'] = final_predictions
    results['Prediction_Difference'] = results['Predicted_Sales'] - results['Actual_Sales']
    results['Prediction_Accuracy'] = (1 - abs(results['Prediction_Difference'] / results['Actual_Sales'])) * 100
    
    return results

# Sample and test data
test_data = sample_test_data(processed_data, sample_size=10)
results = test_predictor(predictor, test_data)

# Calculate and print performance metrics
mse = np.mean(results['Prediction_Difference'] ** 2)
rmse = np.sqrt(mse)
mae = np.mean(abs(results['Prediction_Difference']))
mape = np.mean(abs(results['Prediction_Difference'] / results['Actual_Sales'])) * 100

print("\n📊 Model Performance Metrics:")
print(f"Root Mean Square Error: ${rmse:.2f}")
print(f"Mean Absolute Error: ${mae:.2f}")
print(f"Mean Absolute Percentage Error: {mape:.2f}%")
print(f"Average Prediction Accuracy: {results['Prediction_Accuracy'].mean():.2f}%")

# Display detailed results
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
print("\n📋 Detailed Predictions:")
print(results[['Category', 'Sub Category', 'City', 'Actual_Sales', 'Predicted_Sales', 'Prediction_Accuracy']])

# Print range of predictions
print("\n📈 Prediction Range Analysis:")
print(f"Minimum Actual Sales: ${results['Actual_Sales'].min():.2f}")
print(f"Maximum Actual Sales: ${results['Actual_Sales'].max():.2f}")
print(f"Minimum Predicted Sales: ${results['Predicted_Sales'].min():.2f}")
print(f"Maximum Predicted Sales: ${results['Predicted_Sales'].max():.2f}")


📊 Model Performance Metrics:
Root Mean Square Error: $694.40
Mean Absolute Error: $631.55
Mean Absolute Percentage Error: 66.07%
Average Prediction Accuracy: 33.93%

📋 Detailed Predictions:
      Category  Sub Category  City  Actual_Sales  Predicted_Sales  \
2275         0             3     6           611          1309.61   
1498         4            19     7          1422          1416.66   
5613         3            18     9          1419          2240.83   
7685         5             8     7          1966          2469.67   
7782         1            13    17          2234          1704.93   
8            0             1    19           791          1388.57   
9968         6            16     1          1981          2481.31   
9853         4            17     8           860          2066.84   
2055         3             0     8          1331           661.15   
251          1            13     5           531          1313.43   

      Prediction_Accuracy  
2275               -1