## Importing Libraries

In [1]:
# !pip install hyperopt lightgbm

In [1]:
import pandas as pd
import numpy as np

## Generating Training Data

In [3]:
# Generating a synthetic dataset based on the given schema
np.random.seed(42)

# Number of records to generate
n_records = 2000

# Product mapping to ensure logical category assignments
product_category_mapping = {
    'Smartphone': 'Electronics',
    'Headphones': 'Electronics',
    'Monitor': 'Electronics',
    'Dining Table': 'Furniture',
    'Board Game': 'Toys',
    'Laptop': 'Electronics',
    'Sofa': 'Furniture',
    'Television': 'Electronics',
    'Camera': 'Electronics',
    'Watch': 'Clothing',
    'Blender': 'Kitchen',
    'Microwave': 'Kitchen',
    'T-shirt': 'Clothing',
    'Sneakers': 'Clothing',
    'Action Figure': 'Toys',
    'Coffee Maker': 'Kitchen',
    'Wardrobe': 'Furniture',
    'Bookshelf': 'Furniture',
    'Toy Car': 'Toys'
}

# Generate the columns
data = {
    'Transaction ID': ['T' + str(i) for i in range(1000, 1000 + n_records)],
    'Date': pd.date_range(start='2023-01-01', periods=n_records, freq='D'),
    'Product Name': np.random.choice(list(product_category_mapping.keys()), n_records),
    'Quantity Sold': np.random.randint(1, 20, n_records),
    'Sales Price per Unit ($)': np.round(np.random.uniform(10, 1000, n_records), 2),
    'Discount (%)': np.random.choice([0, 5, 10, 15, 20], n_records),
    'Stock After Sale': np.random.randint(1, 500, n_records),
    'Returned': np.random.choice([0, 1], n_records),
    'Weather Condition': np.random.choice(['Sunny', 'Rainy', 'Cloudy', 'Snow', 'Stormy'], n_records),
    'Temperature (°C)': np.round(np.random.uniform(-5, 40, n_records), 2),
    'Humidity (%)': np.random.randint(10, 100, n_records),
    'Precipitation (mm)': np.round(np.random.uniform(0, 50, n_records), 2),
    'Wind Speed (km/h)': np.round(np.random.uniform(0, 100, n_records), 2),
    'Is Holiday': np.random.choice([0, 1], n_records),
    'Is Weekend': np.random.choice([0, 1], n_records),
    'Is Special Event': np.random.choice([0, 1], n_records),
    'Season': np.random.choice(['Winter', 'Spring', 'Summer', 'Fall'], n_records)
}

# Assign Category based on Product Name
data['Category'] = [product_category_mapping[product] for product in data['Product Name']]

# Convert to DataFrame
df = pd.DataFrame(data)

# Adjust Total Sales ($) based on quantity and price
df['Total Sales ($)'] = df['Quantity Sold'] * df['Sales Price per Unit ($)']
df['Total Sales After Discount ($)'] = df['Total Sales ($)'] * (1 - df['Discount (%)'] / 100)
df['Net Sales Value ($)'] = df['Total Sales After Discount ($)'] - df['Returned'] * df['Sales Price per Unit ($)']

# Creating lag features (for Quantity Sold) within each product
df['Sales_Lag_1'] = df.groupby('Product Name')['Quantity Sold'].shift(1)
df['Sales_Lag_7'] = df.groupby('Product Name')['Quantity Sold'].shift(7)
df['Sales_Lag_14'] = df.groupby('Product Name')['Quantity Sold'].shift(14)

# Create a future 7-day sales column as the target (simulating future sales)
df['Future_7_Day_Sales'] = df.groupby('Product Name')['Quantity Sold'].shift(-7)

# Drop the rows with NaN values due to lagging and future shifts
df.dropna(inplace=True)

# Calculate monthly sales
df['Month'] = df['Date'].dt.to_period('M')  # Create a 'Month' column
monthly_sales = df.groupby(['Product Name', 'Month'])['Quantity Sold'].sum().reset_index()  # Aggregate sales

# Rename the aggregated column for clarity
monthly_sales.rename(columns={'Quantity Sold': 'Monthly Sales'}, inplace=True)

# Merge the monthly sales back into the original dataframe
df = df.merge(monthly_sales, on=['Product Name', 'Month'], how='left')

# Save the dataset to CSV
file_path = 'synthetic_retail_sales_2000_train.csv'
df.to_csv(file_path, index=False)

file_path

'synthetic_retail_sales_2000_train.csv'

## Generating Testing Data

In [2]:
import numpy as np
import pandas as pd

# Setting a seed for reproducibility
np.random.seed(42)

# Number of records to generate
n_records = 10000

# Product mapping to ensure logical category assignments
product_category_mapping = {
    'Smartphone': 'Gadgets',
    'Wireless Earbuds': 'Gadgets',
    'Gaming Monitor': 'Gaming',
    'Coffee Table': 'Furniture',
    'Puzzle Set': 'Toys',
    'Gaming Laptop': 'Gaming',
    'Recliner Chair': 'Furniture',
    'Smart TV': 'Gadgets',
    'Digital Camera': 'Photography',
    'Luxury Watch': 'Accessories',
    'Juicer': 'Kitchen Appliances',
    'Oven': 'Kitchen Appliances',
    'Hoodie': 'Apparel',
    'Running Shoes': 'Apparel',
    'Collectible Figure': 'Toys',
    'Espresso Machine': 'Kitchen Appliances',
    'Closet Organizer': 'Furniture',
    'Corner Bookshelf': 'Furniture',
    'Remote-Controlled Car': 'Toys'
}

# Generate the columns
data = {
    'Transaction ID': ['T' + str(i) for i in range(1000, 1000 + n_records)],
    'Date': pd.date_range(start='2023-01-01', periods=n_records, freq='D'),
}

# Simulate product purchases with dependencies to create frequently bought products
frequent_product_combinations = [
    ['Smartphone', 'Wireless Earbuds'],
    ['Gaming Laptop', 'Gaming Monitor'],
    ['Coffee Table', 'Recliner Chair'],
    ['Espresso Machine', 'Juicer'],
    ['Smart TV', 'Digital Camera'],
    ['Hoodie', 'Running Shoes'],
    ['Puzzle Set', 'Collectible Figure'],
    ['Oven', 'Kitchen Appliances']
]

# Create a function to generate product purchases based on combinations
def generate_products():
    products = []
    # Randomly choose a combination of products
    for combo in frequent_product_combinations:
        if np.random.rand() < 0.3:  # 30% chance to pick a combination
            products.extend(combo)
    # Add some random products to ensure variety
    additional_products = np.random.choice(
        list(product_category_mapping.keys()), 
        size=np.random.randint(1, 3), 
        replace=False
    )
    products.extend(additional_products)
    return products

# Generate Product Names for each transaction
data['Product Name'] = [generate_products() for _ in range(n_records)]

# Explode the lists of products into individual rows
data = data.explode('Product Name')

# Remaining columns as before
data['Quantity Sold'] = np.random.randint(1, 20, len(data))
data['Sales Price per Unit ($)'] = np.round(np.random.uniform(10, 1000, len(data)), 2)
data['Discount (%)'] = np.random.choice([0, 5, 10, 15, 20], len(data))
data['Stock After Sale'] = np.random.randint(1, 500, len(data))
data['Returned'] = np.random.choice([0, 1], len(data))
data['Weather Condition'] = np.random.choice(['Sunny', 'Rainy', 'Cloudy', 'Snow', 'Stormy'], len(data))
data['Temperature (°C)'] = np.round(np.random.uniform(-5, 40, len(data)), 2)
data['Humidity (%)'] = np.random.randint(10, 100, len(data))
data['Precipitation (mm)'] = np.round(np.random.uniform(0, 50, len(data)), 2)
data['Wind Speed (km/h)'] = np.round(np.random.uniform(0, 100, len(data)), 2)
data['Is Holiday'] = np.random.choice([0, 1], len(data))
data['Is Weekend'] = np.random.choice([0, 1], len(data))
data['Is Special Event'] = np.random.choice([0, 1], len(data))
data['Season'] = np.random.choice(['Winter', 'Spring', 'Summer', 'Fall'], len(data))

# Assign Category based on Product Name
data['Category'] = [product_category_mapping[product] for product in data['Product Name']]

# Calculate Total Sales
data['Total Sales ($)'] = data['Quantity Sold'] * data['Sales Price per Unit ($)']
data['Total Sales After Discount ($)'] = data['Total Sales ($)'] * (1 - data['Discount (%)'] / 100)
data['Net Sales Value ($)'] = data['Total Sales After Discount ($)'] - data['Returned'] * data['Sales Price per Unit ($)']

# Creating lag features (for Quantity Sold) within each product
data['Sales_Lag_1'] = data.groupby('Product Name')['Quantity Sold'].shift(1)
data['Sales_Lag_7'] = data.groupby('Product Name')['Quantity Sold'].shift(7)
data['Sales_Lag_14'] = data.groupby('Product Name')['Quantity Sold'].shift(14)

# Create a future 7-day sales column as the target (simulating future sales)
data['Future_7_Day_Sales'] = data.groupby('Product Name')['Quantity Sold'].shift(-7)

# Drop the rows with NaN values due to lagging and future shifts
data.dropna(inplace=True)

# Calculate monthly sales
data['Month'] = data['Date'].dt.to_period('M')  # Create a 'Month' column
monthly_sales = data.groupby(['Product Name', 'Month'])['Quantity Sold'].sum().reset_index()  # Aggregate sales

# Rename the aggregated column for clarity
monthly_sales.rename(columns={'Quantity Sold': 'Monthly Sales'}, inplace=True)

# Merge the monthly sales back into the original dataframe
df = data.merge(monthly_sales, on=['Product Name', 'Month'], how='left')

# Save the dataset to CSV
file_path = 'synthetic_retail_sales_2000_test.csv'
df.to_csv(file_path, index=False)

file_path

AttributeError: 'dict' object has no attribute 'explode'

In [5]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error
# from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
# import joblib

# # Load your dataset
# data = pd.read_csv('synthetic_retail_sales_2000_with_lags_rational.csv')

# # Data cleaning (handle missing values, etc.)
# data = data.dropna()

# # Extract date-related features if needed (e.g., day, month)
# data['Day'] = pd.to_datetime(data['Date']).dt.day
# data['Month'] = pd.to_datetime(data['Date']).dt.month
# data['Year'] = pd.to_datetime(data['Date']).dt.year

# # One-hot encoding for categorical features
# data = pd.get_dummies(data, columns=['Weather Condition', 'Season', 'Category', 'Product Name'], drop_first=True)

# # Define features and target, retaining relevant sales metrics
# features = data.drop(columns=['Transaction ID', 'Future_7_Day_Sales', 'Date'])  # Consider keeping sales metrics
# target = data['Future_7_Day_Sales']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# # Define the search space for hyperparameter optimization
# space = {
#     'n_estimators': hp.choice('n_estimators', [50, 100, 200]),
#     'max_depth': hp.choice('max_depth', [None, 10, 20, 30]),  # None is acceptable, but no 0
#     'min_samples_split': hp.randint('min_samples_split', 2, 20),
#     'min_samples_leaf': hp.randint('min_samples_leaf', 1, 20),
#     'max_features': hp.choice('max_features', [0.5, 0.75, 'sqrt', 'log2', None])  # Valid options
# }

# # Define the objective function for hyperparameter tuning
# def objective(params):
#     model = RandomForestRegressor(**params)
#     model.fit(X_train, y_train)
#     predictions = model.predict(X_test)
#     mse = mean_squared_error(y_test, predictions)
#     return {'loss': mse, 'status': STATUS_OK}

# # Create a Trials object to keep track of progress
# trials = Trials()

# # Run the optimization
# best = fmin(fn=objective,
#             space=space,
#             algo=tpe.suggest,
#             max_evals=100,
#             trials=trials)

# print("Best hyperparameters:", best)

# # Train the final model with the best hyperparameters
# final_model = RandomForestRegressor(**best)
# final_model.fit(X_train, y_train)

# # Make predictions on the test set
# final_predictions = final_model.predict(X_test)
# final_mse = mean_squared_error(y_test, final_predictions)
# print(f'Final Mean Squared Error: {final_mse}')

# # Save the model and feature names using joblib
# joblib.dump(final_model, 'final_model.joblib')
# joblib.dump(features.columns, 'feature_names.joblib')

# # Load the model and feature names for future predictions
# loaded_model = joblib.load('final_model.joblib')
# loaded_feature_names = joblib.load('feature_names.joblib')

# # Create new data ensuring to include the necessary categorical columns
# new_data = pd.DataFrame({
#     'Temperature (°C)': [25],
#     'Humidity (%)': [70],
#     'Precipitation (mm)': [0],
#     'Wind Speed (km/h)': [10],
#     'Is Holiday': [0],
#     'Is Weekend': [0],
#     'Is Special Event': [1],
#     'Day': [15],  # Example day
#     'Month': [7],  # Example month
#     'Year': [2023],  # Example year
#     'Weather Condition': ['Sunny'],
#     'Season': ['Summer'],
#     'Category': ['Electronics'],
#     'Product Name': ['Smartphone'],
# })

# # One-hot encoding for categorical features in new data
# new_data = pd.get_dummies(new_data, columns=['Weather Condition', 'Season', 'Category', 'Product Name'], drop_first=True)

# # Align the new data with the trained model's features
# new_data = new_data.reindex(columns=loaded_feature_names, fill_value=0)

# # Make predictions using the loaded model
# predictions = loaded_model.predict(new_data)
# print("Predictions for new data:", predictions)

## Training 3 Models

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import xgboost as xgb

# Load your dataset
data = pd.read_csv('synthetic_retail_sales_2000_test.csv')

# Data cleaning (handle missing values, etc.)
data = data.dropna()

# Extract date-related features
data['Day'] = pd.to_datetime(data['Date']).dt.day
data['Month'] = pd.to_datetime(data['Date']).dt.month
data['Year'] = pd.to_datetime(data['Date']).dt.year

# One-hot encoding for categorical features
data = pd.get_dummies(data, columns=['Weather Condition', 'Season', 'Category', 'Product Name'], drop_first=True)

# Prepare features and targets for each prediction task
# Predict Next Week's Sales
features_week = data.drop(columns=['Transaction ID', 'Future_7_Day_Sales', 'Date', 'Monthly Sales'])
target_week = data['Future_7_Day_Sales']
X_train_week, X_test_week, y_train_week, y_test_week = train_test_split(features_week, target_week, test_size=0.2, random_state=42)

# Predict Next Month's Sales
features_month = data.drop(columns=['Transaction ID', 'Monthly Sales', 'Date', 'Future_7_Day_Sales'])
target_month = data['Monthly Sales']
X_train_month, X_test_month, y_train_month, y_test_month = train_test_split(features_month, target_month, test_size=0.2, random_state=42)

# Predict Discount Percentage
features_discount = data.drop(columns=['Transaction ID', 'Discount (%)', 'Date', 'Future_7_Day_Sales', 'Monthly Sales'])
target_discount = data['Discount (%)']
X_train_discount, X_test_discount, y_train_discount, y_test_discount = train_test_split(features_discount, target_discount, test_size=0.2, random_state=42)

# Save feature names for each model
loaded_feature_names = {
    'weekly': features_week.columns.tolist(),
    'monthly': features_month.columns.tolist(),
    'discount': features_discount.columns.tolist()
}
joblib.dump(loaded_feature_names, 'feature_names.joblib')

# Function to evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    return mse, r2

# Initialize models to compare
models = {
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'XGBoost': xgb.XGBRegressor()
}

# Store results for each model for each prediction task
results = {'Weekly Sales': {}, 'Monthly Sales': {}, 'Discount Percentage': {}}
best_models = {}

# Evaluate models for each prediction task
for task in ['Weekly Sales', 'Monthly Sales', 'Discount Percentage']:
    if task == 'Weekly Sales':
        X_train, y_train, X_test, y_test = X_train_week, y_train_week, X_test_week, y_test_week
    elif task == 'Monthly Sales':
        X_train, y_train, X_test, y_test = X_train_month, y_train_month, X_test_month, y_test_month
    else:  # Discount Percentage
        X_train, y_train, X_test, y_test = X_train_discount, y_train_discount, X_test_discount, y_test_discount
    
    for model_name, model in models.items():
        mse, r2 = evaluate_model(model, X_train, y_train, X_test, y_test)
        results[task][model_name] = {'MSE': mse, 'R²': r2}
        print(f"{task} - {model_name} - MSE: {mse:.4f}, R²: {r2:.4f}")

    # Select the best model based on MSE
    best_model_name = min(results[task], key=lambda k: results[task][k]['MSE'])
    best_models[task] = models[best_model_name]  # Store the best model

# Train and save the best models
for task, model in best_models.items():
    if task == 'Weekly Sales':
        model.fit(X_train_week, y_train_week)
        joblib.dump(model, 'final_model_weekly_sales.joblib')
    elif task == 'Monthly Sales':
        model.fit(X_train_month, y_train_month)
        joblib.dump(model, 'final_model_monthly_sales.joblib')
    elif task == 'Discount Percentage':
        model.fit(X_train_discount, y_train_discount)
        joblib.dump(model, 'final_model_discount_percentage.joblib')

# Load the feature names
loaded_feature_names = joblib.load('feature_names.joblib')

# Load the models for predictions
loaded_model_weekly = joblib.load('final_model_weekly_sales.joblib')
loaded_model_monthly = joblib.load('final_model_monthly_sales.joblib')
loaded_model_discount = joblib.load('final_model_discount_percentage.joblib')

# Create new data ensuring to include the necessary categorical columns
new_data = pd.DataFrame({
    'Temperature (°C)': [25],
    'Humidity (%)': [70],
    'Precipitation (mm)': [0],
    'Wind Speed (km/h)': [10],
    'Is Holiday': [0],
    'Is Weekend': [0],
    'Is Special Event': [1],
    'Day': [15],  # Example day
    'Month': [7],  # Example month
    'Year': [2023],  # Example year
    'Weather Condition': ['Sunny'],
    'Season': ['Summer'],
    'Category': ['Electronics'],
    'Product Name': ['Smartphone'],
})

# One-hot encoding for categorical features in new data
new_data = pd.get_dummies(new_data, columns=['Weather Condition', 'Season', 'Category', 'Product Name'], drop_first=True)

# Make sure to align new_data with the features used during training for each model
# For predicting weekly sales
new_data_weekly = new_data.reindex(columns=loaded_feature_names['weekly'], fill_value=0)

# For predicting monthly sales
new_data_monthly = new_data.reindex(columns=loaded_feature_names['monthly'], fill_value=0)

# For predicting discount percentage
new_data_discount = new_data.reindex(columns=loaded_feature_names['discount'], fill_value=0)

# Make predictions using the loaded models
predicted_weekly_sales = loaded_model_weekly.predict(new_data_weekly)
predicted_monthly_sales = loaded_model_monthly.predict(new_data_monthly)
predicted_discount_percentage = loaded_model_discount.predict(new_data_discount)

print("Predicted Next Week's Sales:", predicted_weekly_sales)
print("Predicted Next Month's Sales:", predicted_monthly_sales)
print("Predicted Discount Percentage:", predicted_discount_percentage)

Weekly Sales - Random Forest - MSE: 31.6137, R²: -0.0799
Weekly Sales - Gradient Boosting - MSE: 32.3608, R²: -0.1054
Weekly Sales - XGBoost - MSE: 39.6500, R²: -0.3544
Monthly Sales - Random Forest - MSE: 200.7641, R²: 0.0733
Monthly Sales - Gradient Boosting - MSE: 197.1162, R²: 0.0902
Monthly Sales - XGBoost - MSE: 227.4731, R²: -0.0499
Discount Percentage - Random Forest - MSE: 26.0438, R²: 0.4749
Discount Percentage - Gradient Boosting - MSE: 33.4112, R²: 0.3263
Discount Percentage - XGBoost - MSE: 21.9387, R²: 0.5576
Predicted Next Week's Sales: [10.12]
Predicted Next Month's Sales: [19.49455878]
Predicted Discount Percentage: [8.940439]
