In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from datetime import datetime
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
import logging


In [2]:
def mean_absolute_percentage_error(y_true, y_pred):
    """
    Compute the Mean Absolute Percentage Error (MAPE) on a 0-100 percentage scale.

    Thin wrapper around ``sklearn.metrics.mean_absolute_percentage_error``:
    sklearn's result is multiplied by 100.

    Parameters:
    - y_true: Array-like of shape (n_samples,) or (n_samples, n_outputs)
        True target values.
    - y_pred: Array-like of shape (n_samples,) or (n_samples, n_outputs)
        Predicted target values.

    Returns:
    - The MAPE as a number in percent (e.g. 32.73 means 32.73%). The value is
      returned as-is; it is not formatted as a string.
    """
    # Imported locally under an alias because this wrapper reuses sklearn's name.
    from sklearn.metrics import mean_absolute_percentage_error as sklearn_mape

    fraction = sklearn_mape(y_true, y_pred)
    return fraction * 100

In [3]:
# Directory prefix used when reading the input CSV in the next cell.
file_path = "/notebooks/hackathon/baseline_data"
# Run tag; the logging setup cell embeds it in the log file name.
baseline = "baseline_author_audio_features"

In [4]:
# Load the CSV under `file_path` into `df_merged`, the frame that later cells
# read from and modify in place (text filling, label encoding).
df_merged = pd.read_csv(f"{file_path}/baseline_author_audio_vlm_numerical_features.csv")

In [None]:
import os
from datetime import datetime
import logging

# Ensure the log directory exists
log_dir = '/notebooks/hackathon/logs/'
os.makedirs(log_dir, exist_ok=True)

# Create (or fetch, on re-run) the notebook's named logger
logger = logging.getLogger('model_processing_logger')
logger.setLevel(logging.INFO)

# Keep records away from ancestor loggers so nothing is emitted twice.
# Done before the handler check so the logger's own handlers are the only
# thing that decides whether output is produced.
logger.propagate = False

# Attach handlers only once, even if this cell is executed repeatedly.
# `logger.handlers` lists the handlers attached to THIS logger. The previous
# `logger.hasHandlers()` also consults ancestor loggers while propagation is
# enabled, so a handler on the root logger made it return True, skipped the
# setup below and (with propagation then disabled) left this logger silent.
if not logger.handlers:
    # File handler: one timestamped log file per kernel session
    current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    log_filename = f'{log_dir}final_model_processing_{baseline}_{current_time}.log'
    file_handler = logging.FileHandler(log_filename)
    file_handler.setLevel(logging.INFO)

    # Console handler: mirrors the same records in the notebook output
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)

    # Shared format for both destinations
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

# Smoke test for the custom logger
logger.info('Custom logger has been set up successfully!')


In [6]:

# Columns used as the model input matrix (`X = df_merged[numerical_features]` in
# the training cells). Several entries (e.g. 'Collaborations', 'industry', 'Tone')
# are also listed in `categorical_columns` in the next cell, so they only become
# numeric after label encoding.
# NOTE(review): both 'engagement_rate' and 'Engagement_Rate' are listed - confirm
# these are two distinct columns in the CSV.
numerical_features = ['author_follower_count',
       'author_following_count', 'author_total_heart_count',
       'author_total_video_count', 'follower_following_ratio',
       'avg_hearts_per_video', 'engagement_rate', 'videos_per_follower',
       'Video_Length', 'Hashtags', 'Collaborations', 'Series', 'Post_Day',
       'Post_Hour', 'Post_Month', 'Post_Season', 'Post_Quarter',
       'Post_Week_of_Year', 'Post_Part_of_Day', 'Is_Weekend',
       'Next_Day_Holiday', 'Is_Long_Weekend', 'Engagement_Rate',
       'Description_Sentiment', 'industry', 'Sentiment Score',
       'Speech vs. Action Focus', 'Setting', 'Interaction Type', 'Tone',
       'Dialogue/Monologue', 'Audience Engagement',
       'zero_crossing_rate_mean', 'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4',
       'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11',
       'mfcc_12', 'mfcc_13', 'spectral_contrast_1', 'spectral_contrast_2',
       'spectral_contrast_3', 'spectral_contrast_4', 'spectral_contrast_5',
       'spectral_contrast_6', 'spectral_contrast_7', 'chroma_1', 'chroma_2',
       'chroma_3', 'chroma_4', 'chroma_5', 'chroma_6', 'chroma_7', 'chroma_8',
       'chroma_9', 'chroma_10', 'chroma_11', 'chroma_12']


# Free-text columns; "combined_text" does not come from the CSV, it is built
# further down in this cell from the other three.
text_features = ['video_description', 'transcribe_text', 'generated_vlm_text', "combined_text"]

# Candidate regression targets; the training cells shown here use labels[0].
labels = ['video_comment_count', 'video_heart_count', 'video_play_count', 'video_share_count']

# Replace missing text with the literal "empty input" and force every value to str
raw_text_columns = ["video_description", "transcribe_text", "generated_vlm_text"]
for text_column in raw_text_columns:
    df_merged[text_column] = df_merged[text_column].fillna("empty input").astype(str)

# Join the three text columns row-wise, separated by single spaces
combined_source = df_merged[raw_text_columns].astype(str)
df_merged["combined_text"] = combined_source.agg(' '.join, axis=1)

In [7]:
from sklearn.preprocessing import LabelEncoder

# Selecting categorical columns to encode.
# Each column must appear exactly once: the previous list named
# 'Post_Part_of_Day' twice, so the column was encoded a second time from its
# integer codes and the stored encoder was overwritten by one that can no
# longer map codes back to the original labels.
categorical_columns = ['Collaborations', 'Series', 'Post_Day', 'Post_Month', 'Post_Season', 'Post_Part_of_Day', 'Is_Weekend',
       'Next_Day_Holiday', 'Is_Long_Weekend', 'industry', 'Sentiment Score', 'Speech vs. Action Focus', 'Setting', 'Interaction Type', 'Tone',
       'Dialogue/Monologue', 'Audience Engagement']
# Order-preserving de-duplication as a guard against future copy/paste repeats
categorical_columns = list(dict.fromkeys(categorical_columns))

# Encode each categorical column in place and keep its fitted encoder so the
# integer codes can be turned back into labels with `inverse_transform`.
# NOTE: this mutates `df_merged`; re-running the cell without reloading the CSV
# re-fits the encoders on already-encoded integers.
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    df_merged[column] = le.fit_transform(df_merged[column])
    label_encoders[column] = le


In [8]:
# Define MAPE Loss Function
class MAPELoss(nn.Module):
    """Mean absolute percentage error loss, returned on a 0-100 scale.

    Parameters:
    - epsilon: lower bound applied to ``|target|`` in the denominator to avoid
      division by zero (default 1e-10, the value previously hard-coded).
    """

    def __init__(self, epsilon=1e-10):
        super(MAPELoss, self).__init__()
        self.epsilon = epsilon

    def forward(self, preds, target):
        """Return ``100 * mean(|target - preds| / max(|target|, epsilon))``."""
        # Clamp the magnitude instead of adding epsilon to the signed target:
        # `target + epsilon` moves negative targets towards zero and is exactly
        # zero for target == -epsilon, which produced inf/nan losses.
        denominator = torch.clamp(torch.abs(target), min=self.epsilon)
        loss = torch.abs(target - preds) / denominator
        return torch.mean(loss) * 100
    
# Define SMAPE Loss Function
class SMAPELoss(nn.Module):
    """Symmetric mean absolute percentage error loss on a 0-200 scale."""

    def __init__(self):
        super().__init__()

    def forward(self, preds, targets):
        """Return ``200 * mean(|targets - preds| / (|targets| + |preds| + 1e-10))``."""
        tiny = 1e-10  # keeps the ratio defined when both values are zero
        abs_error = torch.abs(targets - preds)
        scale = torch.abs(targets) + torch.abs(preds) + tiny
        return 200 * torch.mean(abs_error / scale)

# Define MAE Loss Function
class MAELoss(nn.Module):
    """Mean absolute error loss: the average of ``|targets - preds|``."""

    def __init__(self):
        super().__init__()

    def forward(self, preds, targets):
        """Return the mean absolute difference between `targets` and `preds`."""
        residual = targets - preds
        return residual.abs().mean()

In [None]:
# Feature matrix (a DataFrame slice of `df_merged`) and target vector (NumPy array).
# The target is the first entry of `labels`; features are used unscaled in this cell.
X = df_merged[numerical_features]
y = df_merged[labels[0]].values


# 5-fold shuffled cross-validation over all rows. In the loop below, the held-out
# part of each fold is that fold's test set, and a further 10% of the remaining
# rows is split off as a validation set.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Custom Dataset for Regression Task
class RegressionDataset(torch.utils.data.Dataset):
    """In-memory dataset pairing feature rows with scalar regression targets.

    Parameters:
    - X: pandas DataFrame or array-like of features; a DataFrame is reduced to
      its underlying `.values` array first.
    - y: array-like of targets; stored as float32 with a trailing unit
      dimension added via `unsqueeze(1)`.
    """

    def __init__(self, X, y):
        features = X.values if isinstance(X, pd.DataFrame) else X
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        # Number of samples equals the number of target rows
        return self.y.shape[0]

    def __getitem__(self, idx):
        # (features, target) pair for the requested index
        sample = (self.X[idx], self.y[idx])
        return sample


# Transformer-based Model
class TransformerRegressionModel(nn.Module):
    """Regression net: linear embedding -> multi-head self-attention -> MLP head.

    `forward` feeds every sample to the attention layer as a sequence of
    length 1, so each sample attends only to itself.

    Parameters:
    - input_dim: number of input features per sample.
    - num_heads: number of attention heads; the embedding width is forced to
      be a positive multiple of it.
    - hidden_dim: width of the two hidden fully connected layers.
    - num_layers: accepted for backward compatibility; currently unused.
    """

    def __init__(self, input_dim, num_heads=8, hidden_dim=64, num_layers=2):
        super(TransformerRegressionModel, self).__init__()
        # Resolve the notebook's named logger here so the class does not depend
        # on whatever the global `logger` happens to be bound to.
        log = logging.getLogger('model_processing_logger')

        # Only the embedding width is adjusted. The real `input_dim` must stay
        # intact for `fc_embedding`; overwriting it (as before) made the layer
        # expect 8 features and fail on narrower inputs.
        embedding_dim = input_dim
        if embedding_dim < 8:
            embedding_dim = 8  # Set a minimum threshold
            log.info(f"Input_dim is too small, adjusting to minimum threshold: {embedding_dim}")

        if embedding_dim % num_heads != 0:
            # Round down to a multiple of num_heads, but never below one head's
            # worth (plain rounding yields 0 when num_heads > embedding_dim).
            rounded_dim = max(num_heads, (embedding_dim // num_heads) * num_heads)
            log.info(f"Projecting input_dim from {embedding_dim} to {rounded_dim} to be divisible by num_heads={num_heads}")
            embedding_dim = rounded_dim
        self.embedding_dim = embedding_dim

        self.fc_embedding = nn.Linear(input_dim, self.embedding_dim)
        self.multihead_attn = nn.MultiheadAttention(embed_dim=self.embedding_dim, num_heads=num_heads, batch_first=True)
        self.fc1 = nn.Linear(self.embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to (batch, 1) predictions."""
        x = self.fc_embedding(x)
        x = x.unsqueeze(1)  # (batch, 1, embedding_dim): sequence length 1
        attn_output, _ = self.multihead_attn(x, x, x)
        attn_output = attn_output.squeeze(1)  # back to (batch, embedding_dim)
        x = self.relu(self.fc1(attn_output))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Per-fold results: test MAPE and the per-epoch loss curves used by the plots below
test_mape_scores = []
fold_train_losses = []
fold_val_losses = []

# Cross-validation loop
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    logger.info(f"\nFold {fold + 1}")

    # KFold yields positional indices, hence iloc for the DataFrame.
    # The held-out part of each fold serves as this fold's test set.
    X_train, X_test = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_test = y[train_idx], y[val_idx]

    # Split 10% of the fold's training rows off as a validation set.
    # It is only monitored; it does not drive early stopping or model selection.
    X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

    # Create datasets and dataloaders
    train_dataset = RegressionDataset(X_train_val, y_train_val)
    val_dataset = RegressionDataset(X_val, y_val)
    test_dataset = RegressionDataset(X_test, y_test)  # Test set remains fixed

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Fresh model and optimizer for every fold
    input_dim = X.shape[1]  # Number of features
    model = TransformerRegressionModel(input_dim=input_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Loss selection. The "SMAPE" option is offered here as well so this cell
    # accepts the same loss_type values as the scaled-data cells further down.
    loss_type = "Huber"
    if loss_type == "MSE":
        loss_fn = nn.MSELoss()
    elif loss_type == "MAE":
        loss_fn = nn.L1Loss()
    elif loss_type == "Huber":
        loss_fn = nn.SmoothL1Loss()
    elif loss_type == "MAPE":
        loss_fn = MAPELoss()
    elif loss_type == "SMAPE":
        loss_fn = SMAPELoss()
    else:
        raise ValueError(f"Unknown loss_type: {loss_type}")

    # Store losses for plotting
    train_losses_epoch = []
    val_losses_epoch = []

    # Training loop
    num_epochs = 1000
    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = loss_fn(outputs, batch_y)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        # Mean of the per-batch losses for this epoch
        avg_train_loss = np.mean(train_losses)
        train_losses_epoch.append(avg_train_loss)

        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                outputs = model(batch_X)
                loss = loss_fn(outputs, batch_y)
                val_losses.append(loss.item())

        avg_val_loss = np.mean(val_losses)
        val_losses_epoch.append(avg_val_loss)

        # Log every 100th epoch only, to keep the log short
        if epoch % 100 == 0:
            logger.info(f'Epoch {epoch+1}, Fold {fold+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

    # Store fold-wise losses
    fold_train_losses.append(train_losses_epoch)
    fold_val_losses.append(val_losses_epoch)

    # Test the final-epoch model on the held-out part of the fold
    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            preds = model(batch_X)
            all_preds.extend(preds.squeeze(1).tolist())

    # Compute MAPE (in percent) on the test set
    test_mape = mean_absolute_percentage_error(y_test, all_preds)
    test_mape_scores.append(test_mape)

# One figure per fold: training vs. validation loss over the epochs
for fold in range(kf.get_n_splits()):
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(fold_train_losses[fold], label='Train Loss')
    ax.plot(fold_val_losses[fold], label='Validation Loss')
    ax.set_title(f'Fold {fold+1} Loss Curves')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()
    plt.show()

# Epoch-wise mean of the loss curves over the folds
avg_train_losses = np.mean(fold_train_losses, axis=0)
avg_val_losses = np.mean(fold_val_losses, axis=0)

# Single figure with the fold-averaged curves
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(avg_train_losses, label='Average Train Loss', color='blue')
ax.plot(avg_val_losses, label='Average Validation Loss', color='orange')
ax.set_title(f' label {labels[0]} - Average Loss Curves Across All Folds')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

# Test MAPE of each fold, plotted against the 1-based fold number
fold_numbers = range(1, kf.get_n_splits() + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fold_numbers, test_mape_scores, marker='o', label='Test MAPE')
ax.set_title('Test MAPE Scores Across Folds')
ax.set_xlabel('Fold')
ax.set_ylabel('MAPE')
ax.legend()
plt.show()


# Summary statistics of the per-fold test MAPE
mean_mape = np.mean(test_mape_scores)
std_mape = np.std(test_mape_scores)

for summary_line in (
    f'MAPE across all folds on the {labels[0]} test set: {mean_mape:.4f}',
    f'Average MAPE across all folds on the {labels[0]} test set: {mean_mape:.4f}',
    f'Standard deviation of MAPE across all folds on the {labels[0]} test set: {std_mape:.4f}',
):
    logger.info(summary_line)


In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import logging

# Reuse the notebook's configured logger. A bare `logging.getLogger()` returns
# the root logger, which this notebook never configures, so INFO records sent
# to it are filtered out by the root's default WARNING level.
logger = logging.getLogger('model_processing_logger')

# Feature matrix (DataFrame slice of `df_merged`) and target vector (NumPy array)
X = df_merged[numerical_features]
y = df_merged[labels[0]].values

# Step 1: Standardize features (X)
# NOTE(review): fitted on all rows, i.e. before the CV split - anything trained
# on `X_scaled` has already seen test-fold statistics.
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

# Step 2: Standardize target variable (y); same full-data caveat as above
scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()  # Standardizing y

# Step 3: Initialize KFold for cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Custom Dataset for Regression Task
class RegressionDataset(torch.utils.data.Dataset):
    """In-memory dataset pairing feature rows with scalar regression targets.

    Parameters:
    - X: pandas DataFrame or array-like of features; a DataFrame is reduced to
      its underlying `.values` array first.
    - y: array-like of targets; stored as float32 with a trailing unit
      dimension added via `unsqueeze(1)`.
    """

    def __init__(self, X, y):
        features = X.values if isinstance(X, pd.DataFrame) else X
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        # Number of samples equals the number of target rows
        return self.y.shape[0]

    def __getitem__(self, idx):
        # (features, target) pair for the requested index
        sample = (self.X[idx], self.y[idx])
        return sample


# Transformer-based Model
class TransformerRegressionModel(nn.Module):
    """Regression net: linear embedding -> multi-head self-attention -> MLP head.

    `forward` feeds every sample to the attention layer as a sequence of
    length 1, so each sample attends only to itself.

    Parameters:
    - input_dim: number of input features per sample.
    - num_heads: number of attention heads; the embedding width is forced to
      be a positive multiple of it.
    - hidden_dim: width of the two hidden fully connected layers.
    - num_layers: accepted for backward compatibility; currently unused.
    """

    def __init__(self, input_dim, num_heads=8, hidden_dim=64, num_layers=2):
        super(TransformerRegressionModel, self).__init__()
        # Resolve the notebook's named logger here so the class does not depend
        # on whatever the global `logger` happens to be bound to.
        log = logging.getLogger('model_processing_logger')

        # Only the embedding width is adjusted. The real `input_dim` must stay
        # intact for `fc_embedding`; overwriting it (as before) made the layer
        # expect 8 features and fail on narrower inputs.
        embedding_dim = input_dim
        if embedding_dim < 8:
            embedding_dim = 8  # Set a minimum threshold
            log.info(f"Input_dim is too small, adjusting to minimum threshold: {embedding_dim}")

        if embedding_dim % num_heads != 0:
            # Round down to a multiple of num_heads, but never below one head's
            # worth (plain rounding yields 0 when num_heads > embedding_dim).
            rounded_dim = max(num_heads, (embedding_dim // num_heads) * num_heads)
            log.info(f"Projecting input_dim from {embedding_dim} to {rounded_dim} to be divisible by num_heads={num_heads}")
            embedding_dim = rounded_dim
        self.embedding_dim = embedding_dim

        self.fc_embedding = nn.Linear(input_dim, self.embedding_dim)
        self.multihead_attn = nn.MultiheadAttention(embed_dim=self.embedding_dim, num_heads=num_heads, batch_first=True)
        self.fc1 = nn.Linear(self.embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to (batch, 1) predictions."""
        x = self.fc_embedding(x)
        x = x.unsqueeze(1)  # (batch, 1, embedding_dim): sequence length 1
        attn_output, _ = self.multihead_attn(x, x, x)
        attn_output = attn_output.squeeze(1)  # back to (batch, embedding_dim)
        x = self.relu(self.fc1(attn_output))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Per-fold results: test MAPE and the per-epoch loss curves used by the plots below
test_mape_scores = []

fold_train_losses = []
fold_val_losses = []

# Cross-validation loop (KFold only needs the number of rows, so the raw X is enough)
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    logger.info(f"\nFold {fold + 1}")

    # Split the UNSCALED data first. Scaling statistics are then learned from
    # this fold's training rows only; fitting the scalers on the full dataset
    # before splitting leaks validation/test information into training.
    X_train_raw, X_test_raw = X.iloc[train_idx], X.iloc[val_idx]
    y_train_raw, y_test_raw = y[train_idx], y[val_idx]

    X_fit_raw, X_val_raw, y_fit_raw, y_val_raw = train_test_split(
        X_train_raw, y_train_raw, test_size=0.1, random_state=42
    )

    # Fold-local scalers, fitted on the training partition only
    fold_scaler_X = StandardScaler().fit(X_fit_raw)
    fold_scaler_y = StandardScaler().fit(y_fit_raw.reshape(-1, 1))

    X_train_val = fold_scaler_X.transform(X_fit_raw)
    X_val = fold_scaler_X.transform(X_val_raw)
    X_test = fold_scaler_X.transform(X_test_raw)
    y_train_val = fold_scaler_y.transform(y_fit_raw.reshape(-1, 1)).flatten()
    y_val = fold_scaler_y.transform(y_val_raw.reshape(-1, 1)).flatten()
    y_test = fold_scaler_y.transform(y_test_raw.reshape(-1, 1)).flatten()

    # Create datasets and dataloaders
    train_dataset = RegressionDataset(X_train_val, y_train_val)
    val_dataset = RegressionDataset(X_val, y_val)
    test_dataset = RegressionDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Fresh model and optimizer for every fold
    input_dim = X.shape[1]  # Number of features
    model = TransformerRegressionModel(input_dim=input_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Choose loss function (example: MSE)
    loss_type = "MSE"
    if loss_type == "MSE":
        loss_fn = nn.MSELoss()
    elif loss_type == "MAE":
        loss_fn = nn.L1Loss()
    elif loss_type == "Huber":
        loss_fn = nn.SmoothL1Loss()
    elif loss_type == "MAPE":
        loss_fn = MAPELoss()
    elif loss_type == "SMAPE":
        loss_fn = SMAPELoss()
    else:
        raise ValueError(f"Unknown loss_type: {loss_type}")

    # Store losses for plotting
    train_losses_epoch = []
    val_losses_epoch = []

    # Training loop
    num_epochs = 1000
    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = loss_fn(outputs, batch_y)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        avg_train_loss = np.mean(train_losses)
        train_losses_epoch.append(avg_train_loss)

        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                outputs = model(batch_X)
                loss = loss_fn(outputs, batch_y)
                val_losses.append(loss.item())

        avg_val_loss = np.mean(val_losses)
        val_losses_epoch.append(avg_val_loss)

        # Log every 100th epoch only, to keep the log short
        if epoch % 100 == 0:
            logger.info(f'Epoch {epoch+1}, Fold {fold+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

    # Store fold-wise losses
    fold_train_losses.append(train_losses_epoch)
    fold_val_losses.append(val_losses_epoch)

    # Test the final-epoch model on the held-out part of the fold
    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            preds = model(batch_X)
            all_preds.extend(preds.tolist())  # Collect predictions

    # Map predictions back to the original target scale with this fold's scaler
    all_preds = fold_scaler_y.inverse_transform(np.array(all_preds).reshape(-1, 1)).flatten()

    # Compute MAPE (in percent) against the untouched raw targets
    test_mape = mean_absolute_percentage_error(y_test_raw, all_preds)
    test_mape_scores.append(test_mape)

# One figure per fold: training vs. validation loss over the epochs
for fold in range(kf.get_n_splits()):
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(fold_train_losses[fold], label='Train Loss')
    ax.plot(fold_val_losses[fold], label='Validation Loss')
    ax.set_title(f'Fold {fold+1} Loss Curves')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()
    plt.show()

# Epoch-wise mean of the loss curves over the folds
avg_train_losses = np.mean(fold_train_losses, axis=0)
avg_val_losses = np.mean(fold_val_losses, axis=0)

# Single figure with the fold-averaged curves
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(avg_train_losses, label='Average Train Loss', color='blue')
ax.plot(avg_val_losses, label='Average Validation Loss', color='orange')
ax.set_title(f' label {labels[0]} - Average Loss Curves Across All Folds')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

# Test MAPE of each fold, plotted against the 1-based fold number
fold_numbers = range(1, kf.get_n_splits() + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fold_numbers, test_mape_scores, marker='o', label='Test MAPE')
ax.set_title('Test MAPE Scores Across Folds')
ax.set_xlabel('Fold')
ax.set_ylabel('MAPE')
ax.legend()
plt.show()

# Summary statistics of the per-fold test MAPE
mean_mape = np.mean(test_mape_scores)
std_mape = np.std(test_mape_scores)

for summary_line in (
    f'MAPE across all folds on the {labels[0]} test set: {mean_mape:.4f}',
    f'Average MAPE across all folds on the {labels[0]} test set: {mean_mape:.4f}',
    f'Standard deviation of MAPE across all folds on the {labels[0]} test set: {std_mape:.4f}',
):
    logger.info(summary_line)


In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import logging

# Reuse the notebook's configured logger. A bare `logging.getLogger()` returns
# the root logger, which this notebook never configures, so INFO records sent
# to it are filtered out by the root's default WARNING level.
logger = logging.getLogger('model_processing_logger')

# Feature matrix (DataFrame slice of `df_merged`) and target vector (NumPy array)
X = df_merged[numerical_features]
y = df_merged[labels[0]].values

# Step 1: Apply Min-Max Scaling to features (X)
# NOTE(review): fitted on all rows, i.e. before the CV split - anything trained
# on `X_scaled` has already seen test-fold minima/maxima.
scaler_X = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler_X.fit_transform(X)

# Step 2: Apply Min-Max Scaling to target variable (y); same full-data caveat as above
scaler_y = MinMaxScaler(feature_range=(0, 1))
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Step 3: Initialize KFold for cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Custom Dataset for Regression Task
class RegressionDataset(torch.utils.data.Dataset):
    """In-memory dataset pairing feature rows with scalar regression targets.

    Parameters:
    - X: pandas DataFrame or array-like of features; a DataFrame is reduced to
      its underlying `.values` array first.
    - y: array-like of targets; stored as float32 with a trailing unit
      dimension added via `unsqueeze(1)`.
    """

    def __init__(self, X, y):
        features = X.values if isinstance(X, pd.DataFrame) else X
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        # Number of samples equals the number of target rows
        return self.y.shape[0]

    def __getitem__(self, idx):
        # (features, target) pair for the requested index
        sample = (self.X[idx], self.y[idx])
        return sample

# Transformer-based Model
class TransformerRegressionModel(nn.Module):
    """Regression net: linear embedding -> multi-head self-attention -> MLP head.

    `forward` feeds every sample to the attention layer as a sequence of
    length 1, so each sample attends only to itself.

    Parameters:
    - input_dim: number of input features per sample.
    - num_heads: number of attention heads; the embedding width is forced to
      be a positive multiple of it.
    - hidden_dim: width of the two hidden fully connected layers.
    - num_layers: accepted for backward compatibility; currently unused.
    """

    def __init__(self, input_dim, num_heads=8, hidden_dim=64, num_layers=2):
        super(TransformerRegressionModel, self).__init__()
        # Resolve the notebook's named logger here so the class does not depend
        # on whatever the global `logger` happens to be bound to.
        log = logging.getLogger('model_processing_logger')

        # Only the embedding width is adjusted. The real `input_dim` must stay
        # intact for `fc_embedding`; overwriting it (as before) made the layer
        # expect 8 features and fail on narrower inputs.
        embedding_dim = input_dim
        if embedding_dim < 8:
            embedding_dim = 8  # Set a minimum threshold
            log.info(f"Input_dim is too small, adjusting to minimum threshold: {embedding_dim}")

        if embedding_dim % num_heads != 0:
            # Round down to a multiple of num_heads, but never below one head's
            # worth (plain rounding yields 0 when num_heads > embedding_dim).
            rounded_dim = max(num_heads, (embedding_dim // num_heads) * num_heads)
            log.info(f"Projecting input_dim from {embedding_dim} to {rounded_dim} to be divisible by num_heads={num_heads}")
            embedding_dim = rounded_dim
        self.embedding_dim = embedding_dim

        self.fc_embedding = nn.Linear(input_dim, self.embedding_dim)
        self.multihead_attn = nn.MultiheadAttention(embed_dim=self.embedding_dim, num_heads=num_heads, batch_first=True)
        self.fc1 = nn.Linear(self.embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to (batch, 1) predictions."""
        x = self.fc_embedding(x)
        x = x.unsqueeze(1)  # (batch, 1, embedding_dim): sequence length 1
        attn_output, _ = self.multihead_attn(x, x, x)
        attn_output = attn_output.squeeze(1)  # back to (batch, embedding_dim)
        x = self.relu(self.fc1(attn_output))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Per-fold results: test MAPE and the per-epoch loss curves used by the plots below
test_mape_scores = []

fold_train_losses = []
fold_val_losses = []

# Cross-validation loop (KFold only needs the number of rows, so the raw X is enough)
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    logger.info(f"\nFold {fold + 1}")

    # Split the UNSCALED data first. Min/max statistics are then learned from
    # this fold's training rows only; fitting the scalers on the full dataset
    # before splitting leaks validation/test information into training.
    X_train_raw, X_test_raw = X.iloc[train_idx], X.iloc[val_idx]
    y_train_raw, y_test_raw = y[train_idx], y[val_idx]

    X_fit_raw, X_val_raw, y_fit_raw, y_val_raw = train_test_split(
        X_train_raw, y_train_raw, test_size=0.1, random_state=42
    )

    # Fold-local scalers, fitted on the training partition only. Validation and
    # test values may therefore fall slightly outside [0, 1]; that is expected.
    fold_scaler_X = MinMaxScaler(feature_range=(0, 1)).fit(X_fit_raw)
    fold_scaler_y = MinMaxScaler(feature_range=(0, 1)).fit(y_fit_raw.reshape(-1, 1))

    X_train_val = fold_scaler_X.transform(X_fit_raw)
    X_val = fold_scaler_X.transform(X_val_raw)
    X_test = fold_scaler_X.transform(X_test_raw)
    y_train_val = fold_scaler_y.transform(y_fit_raw.reshape(-1, 1)).flatten()
    y_val = fold_scaler_y.transform(y_val_raw.reshape(-1, 1)).flatten()
    y_test = fold_scaler_y.transform(y_test_raw.reshape(-1, 1)).flatten()

    # Create datasets and dataloaders
    train_dataset = RegressionDataset(X_train_val, y_train_val)
    val_dataset = RegressionDataset(X_val, y_val)
    test_dataset = RegressionDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Fresh model and optimizer for every fold
    input_dim = X.shape[1]  # Number of features
    model = TransformerRegressionModel(input_dim=input_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Choose loss function (e.g., MSE)
    loss_type = "MSE"
    if loss_type == "MSE":
        loss_fn = nn.MSELoss()
    elif loss_type == "MAE":
        loss_fn = nn.L1Loss()
    elif loss_type == "Huber":
        loss_fn = nn.SmoothL1Loss()
    elif loss_type == "MAPE":
        loss_fn = MAPELoss()
    elif loss_type == "SMAPE":
        loss_fn = SMAPELoss()
    else:
        raise ValueError(f"Unknown loss_type: {loss_type}")

    # Store losses for plotting
    train_losses_epoch = []
    val_losses_epoch = []

    # Training loop
    num_epochs = 1000
    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = loss_fn(outputs, batch_y)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        avg_train_loss = np.mean(train_losses)
        train_losses_epoch.append(avg_train_loss)

        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                outputs = model(batch_X)
                loss = loss_fn(outputs, batch_y)
                val_losses.append(loss.item())

        avg_val_loss = np.mean(val_losses)
        val_losses_epoch.append(avg_val_loss)

        # Log every 100th epoch only, to keep the log short
        if epoch % 100 == 0:
            logger.info(f'Epoch {epoch+1}, Fold {fold+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

    # Store fold-wise losses
    fold_train_losses.append(train_losses_epoch)
    fold_val_losses.append(val_losses_epoch)

    # Test the final-epoch model on the held-out part of the fold
    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            preds = model(batch_X)
            all_preds.extend(preds.tolist())  # Collect predictions

    # Map predictions back to the original target scale with this fold's scaler
    all_preds = fold_scaler_y.inverse_transform(np.array(all_preds).reshape(-1, 1)).flatten()

    # Compute MAPE (in percent) against the untouched raw targets
    test_mape = mean_absolute_percentage_error(y_test_raw, all_preds)
    test_mape_scores.append(test_mape)

# One figure per fold: training vs. validation loss over the epochs
for fold in range(kf.get_n_splits()):
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(fold_train_losses[fold], label='Train Loss')
    ax.plot(fold_val_losses[fold], label='Validation Loss')
    ax.set_title(f'Fold {fold+1} Loss Curves')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()
    plt.show()

# Epoch-wise mean of the loss curves over the folds
avg_train_losses = np.mean(fold_train_losses, axis=0)
avg_val_losses = np.mean(fold_val_losses, axis=0)

# Single figure with the fold-averaged curves
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(avg_train_losses, label='Average Train Loss', color='blue')
ax.plot(avg_val_losses, label='Average Validation Loss', color='orange')
ax.set_title(f' label {labels[0]} - Average Loss Curves Across All Folds')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

# Test MAPE of each fold, plotted against the 1-based fold number
fold_numbers = range(1, kf.get_n_splits() + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fold_numbers, test_mape_scores, marker='o', label='Test MAPE')
ax.set_title('Test MAPE Scores Across Folds')
ax.set_xlabel('Fold')
ax.set_ylabel('MAPE')
ax.legend()
plt.show()

# Summary statistics of the per-fold test MAPE
mean_mape = np.mean(test_mape_scores)
std_mape = np.std(test_mape_scores)

for summary_line in (
    f'MAPE across all folds on the {labels[0]} test set: {mean_mape:.4f}',
    f'Average MAPE across all folds on the {labels[0]} test set: {mean_mape:.4f}',
    f'Standard deviation of MAPE across all folds on the {labels[0]} test set: {std_mape:.4f}',
):
    logger.info(summary_line)


In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import logging

# Reuse the notebook's configured logger. A bare `logging.getLogger()` returns
# the root logger, which this notebook never configures, so INFO records sent
# to it are filtered out by the root's default WARNING level.
logger = logging.getLogger('model_processing_logger')

# Feature matrix (DataFrame slice of `df_merged`) and target vector (NumPy array)
X = df_merged[numerical_features]
y = df_merged[labels[0]].values

# Step 1: Apply Min-Max Scaling to features (X)
# NOTE(review): fitted on all rows, i.e. before the CV split - anything trained
# on `X_scaled` has already seen test-fold minima/maxima.
scaler_X = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler_X.fit_transform(X)

# Step 2: Apply Min-Max Scaling to target variable (y); same full-data caveat as above
scaler_y = MinMaxScaler(feature_range=(0, 1))
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Step 3: Initialize KFold for cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Custom Dataset for Regression Task
class RegressionDataset(torch.utils.data.Dataset):
    """In-memory dataset pairing feature rows with scalar regression targets.

    Parameters:
    - X: pandas DataFrame or array-like of features; a DataFrame is reduced to
      its underlying `.values` array first.
    - y: array-like of targets; stored as float32 with a trailing unit
      dimension added via `unsqueeze(1)`.
    """

    def __init__(self, X, y):
        features = X.values if isinstance(X, pd.DataFrame) else X
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        # Number of samples equals the number of target rows
        return self.y.shape[0]

    def __getitem__(self, idx):
        # (features, target) pair for the requested index
        sample = (self.X[idx], self.y[idx])
        return sample

# Transformer-based Model
class TransformerRegressionModel(nn.Module):
    """Attention-plus-MLP regressor for flat feature vectors.

    A linear layer projects the ``input_dim`` raw features to ``embedding_dim``;
    the projection is treated as a length-1 sequence and passed through
    multi-head self-attention; a three-layer MLP head then emits one value per
    sample.

    Args:
        input_dim: number of features in each input row (>= 1).
        num_heads: number of attention heads (>= 1).
        hidden_dim: width of the two hidden MLP layers.
        num_layers: accepted for interface compatibility; not used by this
            implementation.

    Attributes:
        embedding_dim: width fed to the attention layer. It is ``input_dim``
            raised to a minimum of 8, then rounded down to a multiple of
            ``num_heads`` (but never below ``num_heads`` itself).

    Raises:
        ValueError: if ``input_dim`` or ``num_heads`` is smaller than 1.
    """

    def __init__(self, input_dim, num_heads=8, hidden_dim=64, num_layers=2):
        super().__init__()

        if input_dim < 1:
            raise ValueError(f"input_dim must be a positive integer, got {input_dim}")
        if num_heads < 1:
            raise ValueError(f"num_heads must be a positive integer, got {num_heads}")

        # The minimum applies to the *embedding* width only. The real feature
        # count must be kept for the projection's input side, otherwise inputs
        # with fewer than 8 columns fail with a shape mismatch in forward().
        min_embedding_dim = 8
        target_dim = input_dim
        if target_dim < min_embedding_dim:
            target_dim = min_embedding_dim
            logger.info(f"Input_dim is too small, adjusting to minimum threshold: {target_dim}")

        if target_dim % num_heads != 0:
            # Round down to a multiple of num_heads, but keep at least one
            # dimension per head so the width can never collapse to zero.
            self.embedding_dim = max(num_heads, (target_dim // num_heads) * num_heads)
            logger.info(f"Projecting input_dim from {target_dim} to {self.embedding_dim} to be divisible by num_heads={num_heads}")
        else:
            self.embedding_dim = target_dim

        self.fc_embedding = nn.Linear(input_dim, self.embedding_dim)
        self.multihead_attn = nn.MultiheadAttention(embed_dim=self.embedding_dim, num_heads=num_heads, batch_first=True)
        self.fc1 = nn.Linear(self.embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Map a batch of feature rows to one prediction per row.

        Args:
            x: float tensor of shape (batch, input_dim).

        Returns:
            Tensor of shape (batch, 1).
        """
        x = self.fc_embedding(x)
        # Add a sequence axis of length 1 so the batch-first attention layer
        # accepts the input: (batch, emb) -> (batch, 1, emb). With a single
        # token there is only one key to attend to.
        x = x.unsqueeze(1)
        attn_output, _ = self.multihead_attn(x, x, x)
        attn_output = attn_output.squeeze(1)
        x = self.relu(self.fc1(attn_output))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Placeholder for storing test MAPE for each fold
test_mape_scores = []

fold_train_losses = []
fold_val_losses = []

# Cross-validation loop
for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
    logger.info(f"\nFold {fold + 1}")
    
    # Split the data based on index from the KFold split
    X_train, X_test = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_test = y_scaled[train_idx], y_scaled[val_idx]  # Use scaled y

    X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

    # Create datasets and dataloaders
    train_dataset = RegressionDataset(X_train_val, y_train_val)
    val_dataset = RegressionDataset(X_val, y_val)
    test_dataset = RegressionDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Initialize the model
    input_dim = X.shape[1]  # Number of features
    model = TransformerRegressionModel(input_dim=input_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Choose loss function (e.g., MSE)
    loss_type = "Huber"
    if loss_type == "MSE":
        loss_fn = nn.MSELoss()
    elif loss_type == "MAE":
        loss_fn = nn.L1Loss()
    elif loss_type == "Huber":
        loss_fn = nn.SmoothL1Loss()
    elif loss_type == "MAPE":
        loss_fn = MAPELoss()
    elif loss_type == "SMAPE":
        loss_fn = SMAPELoss()
    else:
        raise ValueError(f"Unknown loss_type: {loss_type}")

    # Store losses for plotting
    train_losses_epoch = []
    val_losses_epoch = []

    # Training loop
    num_epochs = 1000
    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = loss_fn(outputs, batch_y)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        
        avg_train_loss = np.mean(train_losses)
        train_losses_epoch.append(avg_train_loss)

        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                outputs = model(batch_X)
                loss = loss_fn(outputs, batch_y)
                val_losses.append(loss.item())
        
        avg_val_loss = np.mean(val_losses)
        val_losses_epoch.append(avg_val_loss)

        if epoch % 100 == 0:
            logger.info(f'Epoch {epoch+1}, Fold {fold+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')
    
    # Store fold-wise losses
    fold_train_losses.append(train_losses_epoch)
    fold_val_losses.append(val_losses_epoch)
    
    # Test the model on the hold-out test set
    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            preds = model(batch_X)
            all_preds.extend(preds.tolist())  # Collect predictions

    # Inverse scaling for predictions
    all_preds = scaler_y.inverse_transform(np.array(all_preds).reshape(-1, 1)).flatten()  # Reverse scaling

    # Compute MAPE on the test set
    test_mape = mean_absolute_percentage_error(scaler_y.inverse_transform(y_test.reshape(-1, 1)), all_preds)
    test_mape_scores.append(test_mape)

# One figure per fold: training vs. validation loss over the epochs
for fold in range(kf.get_n_splits()):
    plt.figure(figsize=(10, 6))
    plt.plot(fold_train_losses[fold], label='Train Loss')
    plt.plot(fold_val_losses[fold], label='Validation Loss')
    plt.title(f'Fold {fold+1} Loss Curves')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

# Element-wise mean over folds: axis 0 is the fold axis, leaving one value per epoch
avg_train_losses = np.mean(fold_train_losses, axis=0)
avg_val_losses = np.mean(fold_val_losses, axis=0)

# Plot the average training and validation losses across all folds
plt.figure(figsize=(10, 6))
plt.plot(avg_train_losses, label='Average Train Loss', color='blue')
plt.plot(avg_val_losses, label='Average Validation Loss', color='orange')
plt.title(f' label {labels[0]} - Average Loss Curves Across All Folds')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Plot test MAPE scores across all folds (x axis is the 1-based fold number)
plt.figure(figsize=(10, 6))
plt.plot(range(1, kf.get_n_splits() + 1), test_mape_scores, marker='o', label='Test MAPE')
plt.title('Test MAPE Scores Across Folds')
plt.xlabel('Fold')
plt.ylabel('MAPE')
plt.legend()
plt.show()

# Aggregate statistics over the folds (np.std is the population standard deviation, ddof=0)
mean_mape = np.mean(test_mape_scores)
std_mape = np.std(test_mape_scores)

# The first line used to print the mean a second time under a misleading label;
# it now reports the individual fold scores so the log carries the full picture.
per_fold_mape = ', '.join(f'{score:.4f}' for score in test_mape_scores)
logger.info(f'MAPE across all folds on the {labels[0]} test set: [{per_fold_mape}]')
logger.info(f'Average MAPE across all folds on the {labels[0]} test set: {mean_mape:.4f}')
logger.info(f'Standard deviation of MAPE across all folds on the {labels[0]} test set: {std_mape:.4f}')
