Load and inspect the data

In [5]:
import pandas as pd
import numpy as np

# Loading the dataset from Excel
df = pd.read_excel('Data/XBrand_FB_Main_metrics_Anonymized_dataset.xlsx')

# Convert 'date' column to datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Clean and convert 'reach' column to numeric (remove commas and handle invalid entries)
# Replace invalid entries like ' -   ' with NaN
df['reach'] = df['reach'].astype(str).str.replace(',', '').str.strip()  # Remove commas and strip whitespace
df['reach'] = pd.to_numeric(df['reach'], errors='coerce')  # Convert to numeric, invalid entries become NaN

# Optionally, drop rows where 'reach' is NaN (since 'reach' is a target variable)
df = df.dropna(subset=['reach'])
print(f"Number of rows after dropping rows with NaN in 'reach': {len(df)}")

# Convert 'reach' to integer now that invalid entries are handled
df['reach'] = df['reach'].astype(int)

# Printing the exact column names to identify any discrepancies
print("Column names in the dataset:")
print(df.columns.tolist())

# Displaying basic information about the dataset
print("\nDataset Info:")
print(df.info())

print("\nFirst 5 rows:")
print(df.head())

print("\nSummary statistics:")
print(df.describe())

# Checking for missing values
print("\nMissing values:")
print(df[['spend', 'campaign', 'reach', 'engagement']].isna().sum())

# Checking for negative values (since these metrics should be non-negative)
print("\nNegative values:")
print((df[['spend', 'reach', 'engagement']] < 0).sum())

Number of rows after dropping rows with NaN in 'reach': 22634
Column names in the dataset:
['engagement', 'ad_name', 'campaign', 'date', 'reach', 'spend']

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 22634 entries, 0 to 22753
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   engagement  22634 non-null  int64         
 1   ad_name     22634 non-null  object        
 2   campaign    22634 non-null  object        
 3   date        22634 non-null  datetime64[ns]
 4   reach       22634 non-null  int64         
 5   spend       22634 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 1.2+ MB
None

First 5 rows:
   engagement                                            ad_name  \
0        3771  XBrand | Save More | Zero Interest | Easy Paym...   
1       12798  XBrand | Weekend Treat | Easy Payments | Trave...   
2          54  XBrand | Gift Vouchers 

Preprocess the data

In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import train_test_split

# # defining features and targets
# numerical_features = ['spend']
# categorical_features = ['campaign']
# targets = ['reach', 'engagement']

# # debug: checking for NaN in raw data
# print("NaN in raw df:")
# print(df[numerical_features + targets].isna().sum())

# # dropping rows with NaN values in numerical features or targets
# df = df.dropna(subset=numerical_features + targets)
# print(f"Number of rows after dropping NaN: {len(df)}")

# # applying log transformation to handle skewness (np.log1p handles zeros)
# df['spend'] = np.log1p(df['spend'])
# df['reach'] = np.log1p(df['reach'])
# df['engagement'] = np.log1p(df['engagement'])

# # debug: checking for NaN and inf values after log transformation
# print("NaN in df after log transformation:")
# print(df[numerical_features + targets].isna().sum())
# print("Inf in df after log transformation:")
# print(np.isinf(df[numerical_features + targets]).sum())

# # storing quantiles of spend in log-transformed space (before scaling)
# spend_log_lower = df['spend'].quantile(0.01)
# spend_log_upper = df['spend'].quantile(0.99)
# print(f"Spend log-transformed quantiles (before scaling): 1st percentile: {spend_log_lower:.4f}, 99th percentile: {spend_log_upper:.4f}")

# # clipping outliers at the 99th percentile
# df['spend'] = df['spend'].clip(lower=0, upper=df['spend'].quantile(0.99))
# df['reach'] = df['reach'].clip(lower=0, upper=df['reach'].quantile(0.99))
# df['engagement'] = df['engagement'].clip(lower=0, upper=df['engagement'].quantile(0.99))

# # additionally clipping to prevent extreme values
# df['spend'] = df['spend'].clip(lower=df['spend'].quantile(0.01), upper=df['spend'].quantile(0.99))
# df['reach'] = df['reach'].clip(lower=df['reach'].quantile(0.01), upper=df['reach'].quantile(0.99))
# df['engagement'] = df['engagement'].clip(lower=df['engagement'].quantile(0.01), upper=df['engagement'].quantile(0.99))

# # debug: checking for NaN and inf values after clipping
# print("NaN in df after clipping:")
# print(df[numerical_features + targets].isna().sum())
# print("Inf in df after clipping:")
# print(np.isinf(df[numerical_features + targets]).sum())

# # standardizing numerical features
# feature_scaler = StandardScaler()
# df[numerical_features] = feature_scaler.fit_transform(df[numerical_features])

# # standardizing targets
# target_scaler = StandardScaler()
# df[targets] = target_scaler.fit_transform(df[targets])

# # checking scaler parameters
# print("Scaler mean:", target_scaler.mean_)
# print("Scaler scale:", target_scaler.scale_)

# # checking for NaN and inf values after scaling
# print("NaN in df after scaling:")
# print(df[numerical_features + targets].isna().sum())
# print("Inf in df after scaling:")
# print(np.isinf(df[numerical_features + targets]).sum())

# # dropping rows with NaN or inf values (should be none at this point)
# df = df.replace([np.inf, -np.inf], np.nan).dropna()
# print(f"Number of rows after dropping NaN/inf: {len(df)}")

# # storing unique campaigns before one-hot encoding
# unique_campaigns = df['campaign'].unique()
# print("\nUnique campaigns:")
# print(unique_campaigns)

# # One-hot encode categorical features with numeric dtype
# df = pd.get_dummies(df, columns=categorical_features, dtype=np.float64)

# # printing statistics after scaling to confirm
# print("Feature stats after scaling:\n", df[numerical_features].describe())
# print("Target stats after scaling:\n", df[targets].describe())

# # shuffling the data before splitting
# df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# # splitting the data
# train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
# val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# # checking distribution of targets in each split
# print("\nTarget distribution in training set:")
# print(train_df[targets].describe())
# print("\nTarget distribution in validation set:")
# print(val_df[targets].describe())
# print("\nTarget distribution in test set:")
# print(test_df[targets].describe())

# print(f"\nTraining samples: {len(train_df)}, Validation samples: {len(val_df)}, Test samples: {len(test_df)}")

NaN in raw df:
spend           0
reach           1
engagement    906
dtype: int64
Number of rows after dropping NaN: 21848
NaN in df after log transformation:
spend         0
reach         0
engagement    0
dtype: int64
Inf in df after log transformation:
spend         0
reach         0
engagement    0
dtype: int64
Spend log-transformed quantiles (before scaling): 1st percentile: 2.5705, 99th percentile: 9.5649
NaN in df after clipping:
spend         0
reach         0
engagement    0
dtype: int64
Inf in df after clipping:
spend         0
reach         0
engagement    0
dtype: int64
Scaler mean: [9.0470371  5.01718797]
Scaler scale: [1.64824353 2.31993538]
NaN in df after scaling:
spend         0
reach         0
engagement    0
dtype: int64
Inf in df after scaling:
spend         0
reach         0
engagement    0
dtype: int64
Number of rows after dropping NaN/inf: 21848

Unique campaigns:
['HNB | Credit Cards | April Offers 2023 | Auction Reach | RO 64550 | March 2023'
 'HNB | Credit Car

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Define holiday periods
holiday_periods = {
    'April': 'Avurudu',
    'December': 'Christmas/New Year'
}

# Function to extract ad type from campaign name
def extract_ad_type(campaign):
    ad_types = ['Auction Reach', 'Auction Engagement', 'Lead Gen', 'Video Views', 
                'Auction Traffic', 'Auction Views', 'Link Clicks', 'Thruplays']
    for ad_type in ad_types:
        if ad_type in campaign:
            return ad_type
    return 'Other'

# Group rare campaigns
campaign_counts = df['campaign'].value_counts()
threshold = 10  # Campaigns with fewer than 10 occurrences are grouped as 'Other'
rare_campaigns = campaign_counts[campaign_counts < threshold].index
df['campaign_grouped'] = df['campaign'].apply(lambda x: 'Other' if x in rare_campaigns else x)

# Extract features from the date column
df['month'] = df['date'].dt.strftime('%B')  # Full month name (e.g., January)
df['year'] = df['date'].dt.year
df['day_of_week'] = df['date'].dt.day_name()  # e.g., 'Monday'

# Create is_holiday_period feature
df['is_holiday_period'] = df['month'].apply(lambda x: 1 if x in holiday_periods else 0)

# Extract ad_type from campaign
df['ad_type'] = df['campaign'].apply(extract_ad_type)

# Add text-based features from ad_name
df['ad_name_length'] = df['ad_name'].str.len()
df['has_sale'] = df['ad_name'].str.contains('Sale', case=False, na=False).astype(int)
df['has_offer'] = df['ad_name'].str.contains('Offer', case=False, na=False).astype(int)
df['has_win'] = df['ad_name'].str.contains('Win', case=False, na=False).astype(int)

# Define features and targets
numerical_features = ['spend', 'ad_name_length']
binary_features = ['is_holiday_period', 'has_sale', 'has_offer', 'has_win']
categorical_features = ['campaign_grouped', 'month', 'year', 'ad_type', 'day_of_week']
targets = ['reach', 'engagement']

# Debug: Check for NaN in new features
print("NaN in new features before filling:")
print(df[categorical_features + binary_features].isna().sum())

# Handle missing values in new features
df['month'] = df['month'].fillna('Unknown')
df['year'] = df['year'].fillna(df['year'].mode()[0] if not df['year'].isna().all() else 2023)
df['day_of_week'] = df['day_of_week'].fillna('Unknown')
df['ad_type'] = df['ad_type'].fillna('Unknown')
df['campaign_grouped'] = df['campaign_grouped'].fillna('Unknown')
df['ad_name_length'] = df['ad_name_length'].fillna(df['ad_name_length'].median())
df['has_sale'] = df['has_sale'].fillna(0)
df['has_offer'] = df['has_offer'].fillna(0)
df['has_win'] = df['has_win'].fillna(0)

# Debug: Check for NaN in new features after filling
print("\nNaN in new features after filling:")
print(df[categorical_features + binary_features].isna().sum())

# Debug: Check for NaN in raw data
print("\nNaN in raw df:")
print(df[numerical_features + targets].isna().sum())

# Drop rows with NaN values in numerical features or targets
df = df.dropna(subset=numerical_features + targets)
print(f"Number of rows after dropping NaN: {len(df)}")

# Store unique campaigns before one-hot encoding
unique_campaigns = df['campaign_grouped'].unique()
print("\nUnique campaigns (first 5 for brevity):")
print(unique_campaigns[:5])
print(f"Total unique campaigns: {len(unique_campaigns)}")

# Apply transformations to handle skewness
df['spend'] = np.log1p(df['spend'])
df['reach'] = np.log1p(df['reach'])
df['engagement'] = np.sqrt(df['engagement'])  # Use sqrt for engagement

# Store quantiles of spend in log-transformed space (before scaling)
spend_log_lower = df['spend'].quantile(0.01)
spend_log_upper = df['spend'].quantile(0.99)
print(f"Spend log-transformed quantiles (before scaling): 1st percentile: {spend_log_lower:.4f}, 99th percentile: {spend_log_upper:.4f}")

# Clip outliers
df['spend'] = df['spend'].clip(lower=spend_log_lower, upper=spend_log_upper)
df['reach'] = df['reach'].clip(lower=df['reach'].quantile(0.01), upper=df['reach'].quantile(0.99))
df['engagement'] = df['engagement'].clip(lower=df['engagement'].quantile(0.01), upper=df['engagement'].quantile(0.999))

# Standardize numerical features
feature_scaler = StandardScaler()
df[numerical_features] = feature_scaler.fit_transform(df[numerical_features])

# Standardize binary features
df[binary_features] = df[binary_features].astype(float)

# Standardize targets
target_scaler = StandardScaler()
df[targets] = target_scaler.fit_transform(df[targets])

# Debug: Check scaler parameters
print("Scaler mean:", target_scaler.mean_)
print("Scaler scale:", target_scaler.scale_)

# One-hot encode categorical features
try:
    df = pd.get_dummies(df, columns=categorical_features, dtype=np.float64)
    print("One-hot encoding successful.")
except Exception as e:
    print(f"Error during one-hot encoding: {e}")
    raise

# Print statistics after scaling
print("Feature stats after scaling:\n", df[numerical_features + binary_features].describe())

# Debug: Check column names after one-hot encoding
print("\nColumns after one-hot encoding (first 10 for brevity):")
print(df.columns[:10])
print(f"Total number of columns: {len(df.columns)}")

# Shuffle the data before splitting
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the data
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Debug: Check distribution of targets in each split
print("\nTarget distribution in training set:")
print(train_df[targets].describe())
print("\nTarget distribution in validation set:")
print(val_df[targets].describe())
print("\nTarget distribution in test set:")
print(test_df[targets].describe())

print(f"\nTraining samples: {len(train_df)}, Validation samples: {len(val_df)}, Test samples: {len(test_df)}")

NaN in new features before filling:
campaign_grouped     0
month                0
year                 0
ad_type              0
day_of_week          0
is_holiday_period    0
has_sale             0
has_offer            0
has_win              0
dtype: int64

NaN in new features after filling:
campaign_grouped     0
month                0
year                 0
ad_type              0
day_of_week          0
is_holiday_period    0
has_sale             0
has_offer            0
has_win              0
dtype: int64

NaN in raw df:
spend             0
ad_name_length    0
reach             0
engagement        0
dtype: int64
Number of rows after dropping NaN: 22634

Unique campaigns (first 5 for brevity):
['Other' 'XBrand | Lifestyle Rewards | Auto Loans Special | Flash Sale'
 'XBrand | Luxury Travel | Gold Loan Offers | Flash Sale'
 'XBrand | Holiday Cashback | Auto Loans Special | Gold Loan Offers'
 'XBrand | Diwali Deals | Lifestyle Rewards | Back to School']
Total unique campaigns: 97
Spend lo

Prepare data for MLP

In [None]:
# # preparing input features (numerical + one-hot encoded campaign)
# campaign_columns = [col for col in train_df.columns if col.startswith('campaign_')]
# feature_columns = numerical_features + campaign_columns

# # converting to NumPy arrays and ensure numeric type
# X_train = train_df[feature_columns].values.astype(np.float64)
# y_train = train_df[targets].values.astype(np.float64)
# X_val = val_df[feature_columns].values.astype(np.float64)
# y_val = val_df[targets].values.astype(np.float64)
# X_test = test_df[feature_columns].values.astype(np.float64)
# y_test = test_df[targets].values.astype(np.float64)

# # checking dtypes
# print("X_train dtype:", X_train.dtype)
# print("y_train dtype:", y_train.dtype)

# # printing shapes to confirm
# print("X_train shape:", X_train.shape)
# print("y_train shape:", y_train.shape)
# print("X_val shape:", X_val.shape)
# print("y_val shape:", y_val.shape)
# print("X_test shape:", X_test.shape)
# print("y_test shape:", y_test.shape)

X_train dtype: float64
y_train dtype: float64
X_train shape: (17478, 216)
y_train shape: (17478, 2)
X_val shape: (2185, 216)
y_val shape: (2185, 2)
X_test shape: (2185, 216)
y_test shape: (2185, 2)


In [7]:
# Prepare input features (numerical + binary + one-hot encoded categorical)
campaign_columns = [col for col in train_df.columns if col.startswith('campaign_grouped_')]
month_columns = [col for col in train_df.columns if col.startswith('month_')]
year_columns = [col for col in train_df.columns if col.startswith('year_')]
ad_type_columns = [col for col in train_df.columns if col.startswith('ad_type_')]
day_of_week_columns = [col for col in train_df.columns if col.startswith('day_of_week_')]
feature_columns = numerical_features + binary_features + campaign_columns + month_columns + year_columns + ad_type_columns + day_of_week_columns

# Convert to NumPy arrays and ensure numeric type
X_train = train_df[feature_columns].values.astype(np.float64)
y_train = train_df[targets].values.astype(np.float64)
X_val = val_df[feature_columns].values.astype(np.float64)
y_val = val_df[targets].values.astype(np.float64)
X_test = test_df[feature_columns].values.astype(np.float64)
y_test = test_df[targets].values.astype(np.float64)

# Check dtypes
print("X_train dtype:", X_train.dtype)
print("y_train dtype:", y_train.dtype)

# Print shapes to confirm
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train dtype: float64
y_train dtype: float64
X_train shape: (18107, 126)
y_train shape: (18107, 2)
X_val shape: (2263, 126)
y_val shape: (2263, 2)
X_test shape: (2264, 126)
y_test shape: (2264, 2)


Define and train the MLP model

In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim

# # defining the MLP model
# class MLP(nn.Module):
#     def __init__(self, input_dim):
#         super(MLP, self).__init__()
#         self.fc1 = nn.Linear(input_dim, 128)
#         self.fc2 = nn.Linear(128, 64)
#         self.fc3 = nn.Linear(64, 32)
#         self.fc4 = nn.Linear(32, 2)  # output will be reach and engagement
#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(0.1)  # reducing dropout rate

#     def forward(self, x):
#         x = self.relu(self.fc1(x))
#         x = self.dropout(x)
#         x = self.relu(self.fc2(x))
#         x = self.dropout(x)
#         x = self.relu(self.fc3(x))
#         x = self.fc4(x)
#         return x

# # initializing the model
# input_dim = X_train.shape[1]  # number of features (216)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = MLP(input_dim).to(device)

# # defining loss function and optimizer
# criterion = nn.MSELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# # converting data to PyTorch tensors
# X_train_tensor = torch.FloatTensor(X_train).to(device)
# y_train_tensor = torch.FloatTensor(y_train).to(device)
# X_val_tensor = torch.FloatTensor(X_val).to(device)
# y_val_tensor = torch.FloatTensor(y_val).to(device)

# # training loop with early stopping
# num_epochs = 200
# patience = 20
# best_val_loss = float('inf')
# epochs_no_improve = 0
# best_model_state = None

# for epoch in range(num_epochs):
#     model.train()
#     optimizer.zero_grad()
#     outputs = model(X_train_tensor)
#     loss = criterion(outputs, y_train_tensor)
#     loss.backward()
#     optimizer.step()

#     model.eval()
#     with torch.no_grad():
#         val_outputs = model(X_val_tensor)
#         val_loss = criterion(val_outputs, y_val_tensor)

#     if (epoch + 1) % 10 == 0:
#         print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')

#     # early stopping
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         epochs_no_improve = 0
#         best_model_state = model.state_dict()
#     else:
#         epochs_no_improve += 1

#     if epochs_no_improve >= patience:
#         print(f'Early stopping at epoch {epoch+1}, Best Val Loss: {best_val_loss.item():.4f}')
#         break

# # loading the best model
# model.load_state_dict(best_model_state)

# # saving the model
# torch.save(model.state_dict(), 'mlp_model.pth')
# print("Model saved to 'mlp_model.pth'")

Epoch [10/200], Train Loss: 0.9652, Val Loss: 0.9176
Epoch [20/200], Train Loss: 0.8431, Val Loss: 0.7984
Epoch [30/200], Train Loss: 0.6376, Val Loss: 0.6146
Epoch [40/200], Train Loss: 0.5033, Val Loss: 0.4950
Epoch [50/200], Train Loss: 0.4309, Val Loss: 0.4215
Epoch [60/200], Train Loss: 0.3671, Val Loss: 0.3567
Epoch [70/200], Train Loss: 0.2943, Val Loss: 0.2778
Epoch [80/200], Train Loss: 0.2200, Val Loss: 0.2012
Epoch [90/200], Train Loss: 0.2000, Val Loss: 0.1831
Epoch [100/200], Train Loss: 0.1854, Val Loss: 0.1710
Epoch [110/200], Train Loss: 0.1794, Val Loss: 0.1672
Epoch [120/200], Train Loss: 0.1755, Val Loss: 0.1635
Epoch [130/200], Train Loss: 0.1723, Val Loss: 0.1619
Epoch [140/200], Train Loss: 0.1705, Val Loss: 0.1606
Epoch [150/200], Train Loss: 0.1685, Val Loss: 0.1594
Epoch [160/200], Train Loss: 0.1674, Val Loss: 0.1584
Epoch [170/200], Train Loss: 0.1660, Val Loss: 0.1575
Epoch [180/200], Train Loss: 0.1647, Val Loss: 0.1568
Epoch [190/200], Train Loss: 0.1640, 

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim

# Defining the MLP model
class MLP(nn.Module):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 2)  # Output will be reach and engagement
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)  # Reducing dropout rate

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x

# Initializing the model
input_dim = X_train.shape[1]  # Number of features
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MLP(input_dim).to(device)

# Defining loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Converting data to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train).to(device)
y_train_tensor = torch.FloatTensor(y_train).to(device)
X_val_tensor = torch.FloatTensor(X_val).to(device)
y_val_tensor = torch.FloatTensor(y_val).to(device)

# Training loop with early stopping
num_epochs = 200
patience = 20
best_val_loss = float('inf')
epochs_no_improve = 0
best_model_state = None

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_loss = criterion(val_outputs, y_val_tensor)

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        best_model_state = model.state_dict()
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print(f'Early stopping at epoch {epoch+1}, Best Val Loss: {best_val_loss.item():.4f}')
        break

# Loading the best model
model.load_state_dict(best_model_state)

# Saving the model
torch.save(model.state_dict(), 'mlp_model.pth')
print("Model saved to 'mlp_model.pth'")

Epoch [10/200], Train Loss: 0.9666, Val Loss: 0.8959
Epoch [20/200], Train Loss: 0.8472, Val Loss: 0.7694
Epoch [30/200], Train Loss: 0.6161, Val Loss: 0.5538
Epoch [40/200], Train Loss: 0.5390, Val Loss: 0.5074
Epoch [50/200], Train Loss: 0.4898, Val Loss: 0.4645
Epoch [60/200], Train Loss: 0.4576, Val Loss: 0.4487
Epoch [70/200], Train Loss: 0.4405, Val Loss: 0.4368
Epoch [80/200], Train Loss: 0.4294, Val Loss: 0.4312
Epoch [90/200], Train Loss: 0.4221, Val Loss: 0.4267
Epoch [100/200], Train Loss: 0.4125, Val Loss: 0.4228
Epoch [110/200], Train Loss: 0.4047, Val Loss: 0.4189
Epoch [120/200], Train Loss: 0.3966, Val Loss: 0.4152
Epoch [130/200], Train Loss: 0.3941, Val Loss: 0.4121
Epoch [140/200], Train Loss: 0.3888, Val Loss: 0.4098
Epoch [150/200], Train Loss: 0.3821, Val Loss: 0.4069
Epoch [160/200], Train Loss: 0.3823, Val Loss: 0.4042
Epoch [170/200], Train Loss: 0.3757, Val Loss: 0.4023
Epoch [180/200], Train Loss: 0.3738, Val Loss: 0.4011
Epoch [190/200], Train Loss: 0.3690, 

Evaluate the MLP model

In [None]:
# import numpy as np
# from sklearn.metrics import mean_squared_error

# # loading the model
# model.load_state_dict(torch.load('mlp_model.pth'))
# model.eval()

# # converting test data to PyTorch tensors
# X_test_tensor = torch.FloatTensor(X_test).to(device)
# y_test_tensor = torch.FloatTensor(y_test).to(device)

# # predicting on test set
# with torch.no_grad():
#     y_pred = model(X_test_tensor).cpu().numpy()
#     y_test_np = y_test_tensor.cpu().numpy()

# # calculating RMSE in scaled space
# rmse_reach_scaled = np.sqrt(mean_squared_error(y_test_np[:, 0], y_pred[:, 0]))
# rmse_engagement_scaled = np.sqrt(mean_squared_error(y_test_np[:, 1], y_pred[:, 1]))
# print(f'RMSE for Reach (scaled): {rmse_reach_scaled:.4f}')
# print(f'RMSE for Engagement (scaled): {rmse_engagement_scaled:.4f}')

# # inverse transform to original scale
# y_test_original = target_scaler.inverse_transform(y_test_np)
# y_pred_original = target_scaler.inverse_transform(y_pred)

# # undo log transformation
# y_test_original = np.expm1(y_test_original)
# y_pred_original = np.expm1(y_pred_original)

# # calculating RMSE in original scale
# rmse_reach_original = np.sqrt(mean_squared_error(y_test_original[:, 0], y_pred_original[:, 0]))
# rmse_engagement_original = np.sqrt(mean_squared_error(y_test_original[:, 1], y_pred_original[:, 1]))
# print(f'RMSE for Reach (original scale): {rmse_reach_original:.2f}')
# print(f'RMSE for Engagement (original scale): {rmse_engagement_original:.2f}')

RMSE for Reach (scaled): 0.3185
RMSE for Engagement (scaled): 0.4608
RMSE for Reach (original scale): 11011.15
RMSE for Engagement (original scale): 2686.57


In [9]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Loading the model
model.load_state_dict(torch.load('mlp_model.pth'))
model.eval()

# Converting test data to PyTorch tensors
X_test_tensor = torch.FloatTensor(X_test).to(device)
y_test_tensor = torch.FloatTensor(y_test).to(device)

# Predicting on test set
with torch.no_grad():
    y_pred = model(X_test_tensor).cpu().numpy()
    y_test_np = y_test_tensor.cpu().numpy()

# Calculating RMSE in scaled space
rmse_reach_scaled = np.sqrt(mean_squared_error(y_test_np[:, 0], y_pred[:, 0]))
rmse_engagement_scaled = np.sqrt(mean_squared_error(y_test_np[:, 1], y_pred[:, 1]))
print(f'RMSE for Reach (scaled): {rmse_reach_scaled:.4f}')
print(f'RMSE for Engagement (scaled): {rmse_engagement_scaled:.4f}')

# Inverse transform to original scale
y_test_original = target_scaler.inverse_transform(y_test_np)
y_pred_original = target_scaler.inverse_transform(y_pred)

# Undo transformations
y_test_original[:, 0] = np.expm1(y_test_original[:, 0])  # Undo log for reach
y_test_original[:, 1] = y_test_original[:, 1] ** 2  # Undo sqrt for engagement
y_test_original = np.maximum(y_test_original, 0)

y_pred_original[:, 0] = np.expm1(y_pred_original[:, 0])  # Undo log for reach
y_pred_original[:, 1] = y_pred_original[:, 1] ** 2  # Undo sqrt for engagement
y_pred_original = np.maximum(y_pred_original, 0)

# Calculating RMSE in original scale
rmse_reach_original = np.sqrt(mean_squared_error(y_test_original[:, 0], y_pred_original[:, 0]))
rmse_engagement_original = np.sqrt(mean_squared_error(y_test_original[:, 1], y_pred_original[:, 1]))
print(f'RMSE for Reach (original scale): {rmse_reach_original:.2f}')
print(f'RMSE for Engagement (original scale): {rmse_engagement_original:.2f}')

RMSE for Reach (scaled): 0.5675
RMSE for Engagement (scaled): 0.6711
RMSE for Reach (original scale): 22188.72
RMSE for Engagement (original scale): 6182.10


Predict with MLP

In [10]:
import pandas as pd
import numpy as np

# Hardcode the log-transformed quantiles from Cell 2
spend_log_lower = 1.4142  # From Cell 2 output
spend_log_upper = 9.5700  # From Cell 2 output

# Create new data point
campaign_name = 'XBrand | Green Finance | Holiday Cashback | Savings Fiesta'
ad_name = 'XBrand | Save More | Zero Interest | Easy Payments'
date_str = '2023-05-01'
date = pd.to_datetime(date_str, errors='coerce')

# Extract features
month = date.strftime('%B')
year = date.year
day_of_week = date.day_name()
is_holiday_period = 1 if month in holiday_periods else 0

# Extract ad_type
def extract_ad_type(campaign):
    ad_types = ['Auction Reach', 'Auction Engagement', 'Lead Gen', 'Video Views', 
                'Auction Traffic', 'Auction Views', 'Link Clicks', 'Thruplays']
    for ad_type in ad_types:
        if ad_type in campaign:
            return ad_type
    return 'Other'

ad_type = extract_ad_type(campaign_name)

# Group campaign (same logic as in Cell 2)
campaign_grouped = 'Other' if campaign_name in rare_campaigns else campaign_name

# Extract ad_name features
ad_name_length = len(ad_name)
has_sale = 1 if 'Sale' in ad_name else 0
has_offer = 1 if 'Offer' in ad_name else 0
has_win = 1 if 'Win' in ad_name else 0

new_data = pd.DataFrame({
    'spend': [4000],
    'campaign_grouped': [campaign_grouped],
    'month': [month],
    'year': [year],
    'day_of_week': [day_of_week],
    'ad_type': [ad_type],
    'is_holiday_period': [is_holiday_period],
    'ad_name_length': [ad_name_length],
    'has_sale': [has_sale],
    'has_offer': [has_offer],
    'has_win': [has_win]
})

# Preprocess the new data
new_data['spend'] = np.log1p(new_data['spend'])
new_data['spend'] = new_data['spend'].clip(lower=spend_log_lower, upper=spend_log_upper)
new_data[['spend', 'ad_name_length']] = feature_scaler.transform(new_data[['spend', 'ad_name_length']])
new_data[['is_holiday_period', 'has_sale', 'has_offer', 'has_win']] = new_data[['is_holiday_period', 'has_sale', 'has_offer', 'has_win']].astype(float)
new_data = pd.get_dummies(new_data, columns=['campaign_grouped', 'month', 'year', 'ad_type', 'day_of_week'], dtype=np.float64)
new_data = new_data.reindex(columns=feature_columns, fill_value=0)

# Convert to PyTorch tensor
X_new_tensor = torch.FloatTensor(new_data.values).to(device)

# Predict
model.eval()
with torch.no_grad():
    predictions = model(X_new_tensor).cpu().numpy()

# Inverse transform predictions
predictions = target_scaler.inverse_transform(predictions)
predictions[:, 0] = np.expm1(predictions[:, 0])  # Undo log for reach
predictions[:, 1] = predictions[:, 1] ** 2  # Undo sqrt for engagement
predictions = np.maximum(predictions, 0)

print('Predicted Reach:', predictions[0, 0])
print('Predicted Engagement:', predictions[0, 1])

Predicted Reach: 46848.78
Predicted Engagement: 3506.437


In [None]:
# # Hardcode the log-transformed quantiles from Step 2
# spend_log_lower = 2.5705
# spend_log_upper = 9.5649

# # Create new data point
# # Use a different campaign name from the unique campaigns list
# campaign_name = 'HNB | May Monthly Boosting | Auction Reach | RO 63711 | May 2023'  # Different campaign
# print(f"Using campaign: {campaign_name}")

# new_data = pd.DataFrame({
#     'spend': [4000],
#     'campaign': [campaign_name]
# })

# # Preprocess the new data
# new_data['spend'] = np.log1p(new_data['spend'])
# # Clip using the log-transformed quantiles (before scaling)
# new_data['spend'] = new_data['spend'].clip(lower=spend_log_lower, upper=spend_log_upper)
# new_data[['spend']] = feature_scaler.transform(new_data[['spend']])
# new_data = pd.get_dummies(new_data, columns=['campaign'], dtype=np.float64)
# new_data = new_data.reindex(columns=feature_columns, fill_value=0)

# # Convert to PyTorch tensor
# X_new_tensor = torch.FloatTensor(new_data.values).to(device)

# # Predict
# model.eval()
# with torch.no_grad():
#     predictions = model(X_new_tensor).cpu().numpy()

# # Inverse transform predictions
# predictions = target_scaler.inverse_transform(predictions)
# predictions = np.expm1(predictions)

# # Ensure predictions are non-negative
# predictions = np.maximum(predictions, 0)

# print('Predicted Reach:', predictions[0, 0])
# print('Predicted Engagement:', predictions[0, 1])

Using campaign: HNB | May Monthly Boosting | Auction Reach | RO 63711 | May 2023
Predicted Reach: 100564.664
Predicted Engagement: 1516.6621


Evaluate Model predictions on the Test set

In [None]:
# import numpy as np
# import pandas as pd
# from sklearn.metrics import mean_absolute_error, r2_score

# # Convert test data to PyTorch tensor
# X_test_tensor = torch.FloatTensor(X_test).to(device)
# y_test_tensor = torch.FloatTensor(y_test).to(device)

# # Predict on the test set
# model.eval()
# with torch.no_grad():
#     y_pred = model(X_test_tensor).cpu().numpy()

# # Inverse transform predictions and actual values to original scale
# y_pred_original = target_scaler.inverse_transform(y_pred)
# y_pred_original = np.expm1(y_pred_original)
# y_pred_original = np.maximum(y_pred_original, 0)  # Ensure non-negative

# y_test_original = target_scaler.inverse_transform(y_test)
# y_test_original = np.expm1(y_test_original)
# y_test_original = np.maximum(y_test_original, 0)  # Ensure non-negative

# # Create a DataFrame to compare predictions and actual values
# comparison_df = pd.DataFrame({
#     'Actual Reach': y_test_original[:, 0],
#     'Predicted Reach': y_pred_original[:, 0],
#     'Actual Engagement': y_test_original[:, 1],
#     'Predicted Engagement': y_pred_original[:, 1]
# })

# # Compute additional metrics
# # Mean Absolute Error (MAE)
# mae_reach = mean_absolute_error(comparison_df['Actual Reach'], comparison_df['Predicted Reach'])
# mae_engagement = mean_absolute_error(comparison_df['Actual Engagement'], comparison_df['Predicted Engagement'])

# # Mean Absolute Percentage Error (MAPE)
# # Add small epsilon to avoid division by zero
# epsilon = 1e-10
# mape_reach = np.mean(np.abs((comparison_df['Actual Reach'] - comparison_df['Predicted Reach']) / (comparison_df['Actual Reach'] + epsilon))) * 100
# mape_engagement = np.mean(np.abs((comparison_df['Actual Engagement'] - comparison_df['Predicted Engagement']) / (comparison_df['Actual Engagement'] + epsilon))) * 100

# # R² Score
# r2_reach = r2_score(comparison_df['Actual Reach'], comparison_df['Predicted Reach'])
# r2_engagement = r2_score(comparison_df['Actual Engagement'], comparison_df['Predicted Engagement'])

# # Print evaluation metrics
# print("\nEvaluation Metrics on Test Set:")
# print(f"RMSE Reach (original scale): {np.sqrt(mean_squared_error(comparison_df['Actual Reach'], comparison_df['Predicted Reach'])):.2f}")
# print(f"RMSE Engagement (original scale): {np.sqrt(mean_squared_error(comparison_df['Actual Engagement'], comparison_df['Predicted Engagement'])):.2f}")
# print(f"MAE Reach: {mae_reach:.2f}")
# print(f"MAE Engagement: {mae_engagement:.2f}")
# print(f"MAPE Reach: {mape_reach:.2f}%")
# print(f"MAPE Engagement: {mape_engagement:.2f}%")
# print(f"R² Reach: {r2_reach:.4f}")
# print(f"R² Engagement: {r2_engagement:.4f}")

# # Display a sample of predictions vs actual values
# print("\nSample of Predictions vs Actual Values (first 10 rows):")
# print(comparison_df.head(10))

# # Summary statistics of errors
# comparison_df['Reach Error'] = comparison_df['Predicted Reach'] - comparison_df['Actual Reach']
# comparison_df['Engagement Error'] = comparison_df['Predicted Engagement'] - comparison_df['Actual Engagement']
# print("\nSummary of Errors:")
# print(comparison_df[['Reach Error', 'Engagement Error']].describe())


Evaluation Metrics on Test Set:
RMSE Reach (original scale): 11011.15
RMSE Engagement (original scale): 2686.57
MAE Reach: 4910.43
MAE Engagement: 813.58
MAPE Reach: 49.55%
MAPE Engagement: 159.68%
R² Reach: 0.8853
R² Engagement: 0.6357

Sample of Predictions vs Actual Values (first 10 rows):
   Actual Reach  Predicted Reach  Actual Engagement  Predicted Engagement
0       16764.0     15044.733398               13.0             19.662148
1       14478.0     13532.988281             1308.0           1238.735718
2         744.0      1523.711914               96.0            360.638489
3        6011.0      5187.545898               10.0             20.935696
4       19383.0     19902.220703              755.0             56.206776
5       52923.0     65795.703125            19190.0          27014.277344
6       68284.0     18862.894531             2379.0            709.759277
7        2693.0      5309.661621               14.0             14.991861
8       35089.0     21981.626953       

In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Convert test data to PyTorch tensor
X_test_tensor = torch.FloatTensor(X_test).to(device)
y_test_tensor = torch.FloatTensor(y_test).to(device)

# Predict on the test set
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor).cpu().numpy()

# Inverse transform predictions and actual values to original scale
y_pred_original = target_scaler.inverse_transform(y_pred)
y_pred_original[:, 0] = np.expm1(y_pred_original[:, 0])  # Undo log for reach
y_pred_original[:, 1] = y_pred_original[:, 1] ** 2  # Undo sqrt for engagement
y_pred_original = np.maximum(y_pred_original, 0)

y_test_original = target_scaler.inverse_transform(y_test)
y_test_original[:, 0] = np.expm1(y_test_original[:, 0])  # Undo log for reach
y_test_original[:, 1] = y_test_original[:, 1] ** 2  # Undo sqrt for engagement
y_test_original = np.maximum(y_test_original, 0)

# Create a DataFrame to compare predictions and actual values
comparison_df = pd.DataFrame({
    'Actual Reach': y_test_original[:, 0],
    'Predicted Reach': y_pred_original[:, 0],
    'Actual Engagement': y_test_original[:, 1],
    'Predicted Engagement': y_pred_original[:, 1]
})

# Compute additional metrics
# Mean Absolute Error (MAE)
mae_reach = mean_absolute_error(comparison_df['Actual Reach'], comparison_df['Predicted Reach'])
mae_engagement = mean_absolute_error(comparison_df['Actual Engagement'], comparison_df['Predicted Engagement'])

# Mean Absolute Percentage Error (MAPE)
epsilon = 1e-10
mape_reach = np.mean(np.abs((comparison_df['Actual Reach'] - comparison_df['Predicted Reach']) / (comparison_df['Actual Reach'] + epsilon))) * 100
mape_engagement = np.mean(np.abs((comparison_df['Actual Engagement'] - comparison_df['Predicted Engagement']) / (comparison_df['Actual Engagement'] + epsilon))) * 100

# R² Score
r2_reach = r2_score(comparison_df['Actual Reach'], comparison_df['Predicted Reach'])
r2_engagement = r2_score(comparison_df['Actual Engagement'], comparison_df['Predicted Engagement'])

# Print evaluation metrics
print("\nEvaluation Metrics on Test Set:")
print(f"RMSE Reach (original scale): {np.sqrt(mean_squared_error(comparison_df['Actual Reach'], comparison_df['Predicted Reach'])):.2f}")
print(f"RMSE Engagement (original scale): {np.sqrt(mean_squared_error(comparison_df['Actual Engagement'], comparison_df['Predicted Engagement'])):.2f}")
print(f"MAE Reach: {mae_reach:.2f}")
print(f"MAE Engagement: {mae_engagement:.2f}")
print(f"MAPE Reach: {mape_reach:.2f}%")
print(f"MAPE Engagement: {mape_engagement:.2f}%")
print(f"R² Reach: {r2_reach:.4f}")
print(f"R² Engagement: {r2_engagement:.4f}")

# Display a sample of predictions vs actual values
print("\nSample of Predictions vs Actual Values (first 10 rows):")
print(comparison_df.head(10))

# Summary statistics of errors
comparison_df['Reach Error'] = comparison_df['Predicted Reach'] - comparison_df['Actual Reach']
comparison_df['Engagement Error'] = comparison_df['Predicted Engagement'] - comparison_df['Actual Engagement']
print("\nSummary of Errors:")
print(comparison_df[['Reach Error', 'Engagement Error']].describe())


Evaluation Metrics on Test Set:
RMSE Reach (original scale): 22188.72
RMSE Engagement (original scale): 6182.10
MAE Reach: 11450.66
MAE Engagement: 1506.74
MAPE Reach: 147.93%
MAPE Engagement: 958714232710.47%
R² Reach: 0.5591
R² Engagement: 0.4346

Sample of Predictions vs Actual Values (first 10 rows):
    Actual Reach  Predicted Reach  Actual Engagement  Predicted Engagement
0    7540.000000      5361.107910               10.0            170.970901
1    1157.000000      2931.041504               50.0            116.877968
2   45575.000000     33812.152344               87.0            412.119781
3    6013.000000     12503.794922              270.0            466.499451
4   39235.000000     29855.482422               59.0            237.073441
5    5482.000000     20806.177734               41.0            630.929565
6   30641.000000     26691.087891                7.0            515.598328
7   43569.000000     57143.621094              433.0          17435.679688
8  197689.332763  

Reconstruct Test Set and Compare Predictions

In [None]:
# # Reconstruct the original test set features
# # Inverse transform the scaled spend
# spend_scaled = X_test[:, 0]  # First column is scaled spend
# spend_log = feature_scaler.inverse_transform(spend_scaled.reshape(-1, 1)).flatten()
# spend_original = np.expm1(spend_log)

# # Reconstruct the campaign names from one-hot encoded columns
# campaign_columns = [col for col in feature_columns if col.startswith('campaign_')]
# campaign_indices = np.argmax(X_test[:, 1:], axis=1)  # Index of the 1 in one-hot encoding
# campaign_names = [campaign_columns[idx].replace('campaign_', '') for idx in campaign_indices]

# # Create a DataFrame with the original test set features and actual/predicted values
# test_df_reconstructed = pd.DataFrame({
#     'Spend': spend_original,
#     'Campaign': campaign_names,
#     'Actual Reach': y_test_original[:, 0],
#     'Predicted Reach': y_pred_original[:, 0],
#     'Actual Engagement': y_test_original[:, 1],
#     'Predicted Engagement': y_pred_original[:, 1]
# })

# # Step 6 prediction for comparison
# step6_pred_reach = 108745.32
# step6_pred_engagement = 10683.112
# step6_spend = 4000
# step6_campaign = 'HNB | Credit Cards | April Offers 2023 | Auction Reach | RO 64550 | March 2023'

# # Find data points with the same campaign
# same_campaign_df = test_df_reconstructed[
#     test_df_reconstructed['Campaign'] == step6_campaign
# ]

# # Find data points with similar spend (within ±500 of 4000)
# similar_spend_df = test_df_reconstructed[
#     (test_df_reconstructed['Spend'] >= 3500) & (test_df_reconstructed['Spend'] <= 4500)
# ]

# # Print results
# print(f"\nStep 6 Prediction (Spend={step6_spend}, Campaign={step6_campaign}):")
# print(f"Predicted Reach: {step6_pred_reach:.2f}, Predicted Engagement: {step6_pred_engagement:.2f}")

# print("\nData Points with the Same Campaign:")
# if not same_campaign_df.empty:
#     print(same_campaign_df)
# else:
#     print("No data points found with the same campaign in the test set.")

# print("\nData Points with Similar Spend (3500 ≤ Spend ≤ 4500):")
# if not similar_spend_df.empty:
#     print(similar_spend_df)
#     print("\nSummary Statistics for Similar Spend Data Points:")
#     print(similar_spend_df[['Actual Reach', 'Actual Engagement']].describe())
# else:
#     print("No data points found with similar spend in the test set.")


Step 6 Prediction (Spend=4000, Campaign=HNB | Credit Cards | April Offers 2023 | Auction Reach | RO 64550 | March 2023):
Predicted Reach: 108745.32, Predicted Engagement: 10683.11

Data Points with the Same Campaign:
      Spend                                           Campaign  Actual Reach  \
104  689.16  HNB | Credit Cards | April Offers 2023 | Aucti...       33192.0   

     Predicted Reach  Actual Engagement  Predicted Engagement  
104     36044.199219             1996.0           2240.762695  

Data Points with Similar Spend (3500 ≤ Spend ≤ 4500):
        Spend                                           Campaign  \
8     3598.74  HNB | February Monthly Boosting 2025 | Digital...   
26    3851.34  HNB | Avrudu Meta Content Advertising 2024 | A...   
32    3943.09  HNB | Digital Banking Campaign Meta | Auction ...   
60    3737.63  HNB | January Monthly Boosting 2025 | Corporat...   
97    4089.66  HNB | October Monthly Boosting | Auction Reach...   
...       ...                 

In [12]:
import numpy as np
import pandas as pd

# Reconstruct the original test set features
# Inverse transform the scaled features
numerical_scaled = X_test[:, :2]  # First two columns are spend and ad_name_length
numerical_original = feature_scaler.inverse_transform(numerical_scaled)
spend_original = np.expm1(numerical_original[:, 0])  # Undo log for spend
ad_name_length_original = numerical_original[:, 1]

# Reconstruct binary features
binary_features_values = X_test[:, 2:6]  # Next four columns are binary features

# Reconstruct categorical features (campaign_grouped, month, year, ad_type, day_of_week)
campaign_cols = [col for col in feature_columns if col.startswith('campaign_grouped_')]
month_cols = [col for col in feature_columns if col.startswith('month_')]
year_cols = [col for col in feature_columns if col.startswith('year_')]
ad_type_cols = [col for col in feature_columns if col.startswith('ad_type_')]
day_of_week_cols = [col for col in feature_columns if col.startswith('day_of_week_')]

campaign_start_idx = 6
month_start_idx = campaign_start_idx + len(campaign_cols)
year_start_idx = month_start_idx + len(month_cols)
ad_type_start_idx = year_start_idx + len(year_cols)
day_of_week_start_idx = ad_type_start_idx + len(ad_type_cols)

campaign_indices = np.argmax(X_test[:, campaign_start_idx:month_start_idx], axis=1)
month_indices = np.argmax(X_test[:, month_start_idx:year_start_idx], axis=1)
year_indices = np.argmax(X_test[:, year_start_idx:ad_type_start_idx], axis=1)
ad_type_indices = np.argmax(X_test[:, ad_type_start_idx:day_of_week_start_idx], axis=1)
day_of_week_indices = np.argmax(X_test[:, day_of_week_start_idx:], axis=1)

campaign_names = [campaign_cols[idx].replace('campaign_grouped_', '') for idx in campaign_indices]
month_names = [month_cols[idx].replace('month_', '') for idx in month_indices]
year_names = [year_cols[idx].replace('year_', '') for idx in year_indices]
ad_type_names = [ad_type_cols[idx].replace('ad_type_', '') for idx in ad_type_indices]
day_of_week_names = [day_of_week_cols[idx].replace('day_of_week_', '') for idx in day_of_week_indices]

# Create a DataFrame with the original test set features and actual/predicted values
test_df_reconstructed = pd.DataFrame({
    'Spend': spend_original,
    'Campaign': campaign_names,
    'Month': month_names,
    'Year': year_names,
    'Ad Type': ad_type_names,
    'Day of Week': day_of_week_names,
    'Is Holiday Period': binary_features_values[:, 0],
    'Has Sale': binary_features_values[:, 1],
    'Has Offer': binary_features_values[:, 2],
    'Has Win': binary_features_values[:, 3],
    'Ad Name Length': ad_name_length_original,
    'Actual Reach': y_test_original[:, 0],
    'Predicted Reach': y_pred_original[:, 0],
    'Actual Engagement': y_test_original[:, 1],
    'Predicted Engagement': y_pred_original[:, 1]
})

# Step 6 prediction for comparison
step6_pred_reach = predictions[0, 0]  # Use the prediction from Cell 6
step6_pred_engagement = predictions[0, 1]
step6_spend = 4000
step6_campaign = campaign_name

# Find data points with the same campaign
same_campaign_df = test_df_reconstructed[
    test_df_reconstructed['Campaign'] == step6_campaign
]

# Find data points with similar spend (within ±500 of 4000)
similar_spend_df = test_df_reconstructed[
    (test_df_reconstructed['Spend'] >= 3500) & (test_df_reconstructed['Spend'] <= 4500)
]

# Print results
print(f"\nStep 6 Prediction (Spend={step6_spend}, Campaign={step6_campaign}):")
print(f"Predicted Reach: {step6_pred_reach:.2f}, Predicted Engagement: {step6_pred_engagement:.2f}")

print("\nData Points with the Same Campaign:")
if not same_campaign_df.empty:
    print(same_campaign_df)
else:
    print("No data points found with the same campaign in the test set.")

print("\nData Points with Similar Spend (3500 ≤ Spend ≤ 4500):")
if not similar_spend_df.empty:
    print(similar_spend_df)
    print("\nSummary Statistics for Similar Spend Data Points:")
    print(similar_spend_df[['Actual Reach', 'Actual Engagement']].describe())
else:
    print("No data points found with similar spend in the test set.")


Step 6 Prediction (Spend=4000, Campaign=XBrand | Green Finance | Holiday Cashback | Savings Fiesta):
Predicted Reach: 46848.78, Predicted Engagement: 3506.44

Data Points with the Same Campaign:
No data points found with the same campaign in the test set.

Data Points with Similar Spend (3500 ≤ Spend ≤ 4500):
        Spend Campaign      Month  Year Ad Type Day of Week  \
28    3710.10    Other    January  2025   Other     Tuesday   
85    3872.84    Other   December  2024   Other      Monday   
209   3855.60    Other      April  2025   Other    Thursday   
234   3725.74    Other   November  2024   Other      Friday   
254   4067.46    Other   February  2025   Other   Wednesday   
258   3995.20    Other   November  2023   Other    Thursday   
263   3656.65    Other      March  2025   Other      Friday   
265   3914.79    Other      April  2024   Other    Thursday   
277   4257.82    Other      March  2025   Other      Friday   
309   4296.27    Other        May  2023   Other      Sunda