In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from collections import defaultdict

import os
# Walk the Kaggle input mount and print the full path of every file found,
# so the dataset paths used in later cells can be copied from the output.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/sri-lankan-youtube-video-data/FInalProcessedDataset1.csv
/kaggle/input/sri-lankan-youtube-video-data/Balanced_final_dataset_by_weekday.csv
/kaggle/input/sri-lankan-youtube-video-data/yt_data_merged_v2.csv
/kaggle/input/sri-lankan-youtube-video-data/FinalProcessedDataset_03.csv
/kaggle/input/sri-lankan-youtube-video-data/final_dataset_with_trending_unique_vids3.csv
/kaggle/input/sri-lankan-youtube-video-data/FinalProcessedDataset_04.csv
/kaggle/input/sri-lankan-youtube-video-data/yt_data_merged.csv


In [2]:
# Load the processed video-metadata table that every later cell works on.
df = pd.read_csv(
    '/kaggle/input/sri-lankan-youtube-video-data/FinalProcessedDataset_03.csv'
)

In [3]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
df.head()

Unnamed: 0,title,channel_name,video_id,channel_id,publish_date,snapshot_date,country,view_count,like_count,comment_count,...,is_holiday,publish_date_naive,title_len,desc_len,tag_count,has_hashtag,views_per_hour_norm,duration_timedelta,duration_minutes,category_name
0,Watch full short film!!👆@kaidoleeroberts429 #s...,Sofi Manassyan,o8rP66qVN-4,UCZFBnnCCO65xMXOdtFz8CfA,2025-09-06 18:14:11+00:00,2025-09-11 00:00:00+00:00,LK,286283,12778,277,...,False,2025-09-06 00:00:00,77,0.0,1,1,0.009381,0 days 00:01:48,1.8,Film & Animation
1,ToRung short film: 🙏 let's love each other ❤️,ToRung,07TTa5FHqz8,UCXbYlU08sOTBktOtjVsvR6w,2025-09-06 00:41:33+00:00,2025-09-11 00:00:00+00:00,LK,537527,6174,12,...,False,2025-09-06 00:00:00,45,139.0,13,0,0.017613,0 days 00:00:37,0.616667,Comedy
2,He messed with a wrong person #movieexplainedi...,CineClarity,6VfcnEBZ2dg,UCOHMJ8XnZL5TKZbY7Pj1yLA,2025-09-07 03:16:44+00:00,2025-09-11 00:00:00+00:00,LK,2693713,0,285,...,True,2025-09-07 00:00:00,60,396.0,17,1,0.088266,0 days 00:02:58,2.966667,Entertainment
3,সব কুলি চোর না। short film _natok3.2_bsngla na...,Natok3.2,BwR2GU0kovc,UCKkv7Lm0fprBm5LSaaKlZoA,2025-09-06 17:26:54+00:00,2025-09-11 00:00:00+00:00,LK,72606,723,1,...,False,2025-09-06 00:00:00,50,1591.0,1,0,0.002379,0 days 00:03:01,3.016667,Entertainment
4,😱Movie Recap Short Film Mystery Explained in H...,HowTo1C,EYBCNN_9E2M,UCtkFE68zgVgQimnkgJBSVmA,2025-09-07 09:30:08+00:00,2025-09-11 00:00:00+00:00,LK,22894,0,1,...,True,2025-09-07 00:00:00,98,2709.0,1,1,0.00075,0 days 00:00:37,0.616667,Entertainment


In [4]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97766 entries, 0 to 97765
Data columns (total 34 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   title                97766 non-null  object 
 1   channel_name         97766 non-null  object 
 2   video_id             97766 non-null  object 
 3   channel_id           97766 non-null  object 
 4   publish_date         97766 non-null  object 
 5   snapshot_date        97766 non-null  object 
 6   country              97766 non-null  object 
 7   view_count           97766 non-null  int64  
 8   like_count           97766 non-null  int64  
 9   comment_count        97766 non-null  int64  
 10  description          97766 non-null  object 
 11  thumbnail_url        97766 non-null  object 
 12  video_tags           97766 non-null  object 
 13  kind                 97766 non-null  object 
 14  language             97766 non-null  object 
 15  category_id          97766 non-null 

In [5]:
# Count missing values per column (all zeros means no imputation is needed).
df.isnull().sum()

title                  0
channel_name           0
video_id               0
channel_id             0
publish_date           0
snapshot_date          0
country                0
view_count             0
like_count             0
comment_count          0
description            0
thumbnail_url          0
video_tags             0
kind                   0
language               0
category_id            0
duration               0
days_since_publish     0
publish_weekday        0
publish_hour           0
is_weekend             0
slot_id                0
part_of_day            0
views_per_hour         0
is_holiday             0
publish_date_naive     0
title_len              0
desc_len               0
tag_count              0
has_hashtag            0
views_per_hour_norm    0
duration_timedelta     0
duration_minutes       0
category_name          0
dtype: int64

In [6]:
# Columns fed to the model as raw (later standardised) numbers.
numerical_features = ['view_count', 'like_count', 'comment_count', 'publish_hour', 'days_since_publish', 
                     'is_weekend', 'views_per_hour', 'duration_minutes', 'title_len', 'desc_len', 'tag_count']
# Columns fed to the model through learned embedding tables.
categorical_features = ['country', 'kind', 'language', 'category_id', 'part_of_day', 'is_holiday']

# Handle missing values
df[numerical_features] = df[numerical_features].fillna(0)
df[categorical_features] = df[categorical_features].fillna('unknown')

# Convert boolean to int (0/1) for is_weekend and is_holiday
# NOTE(review): if either column ever contained NaN, the fillna('unknown') above
# would make this cast fail; the null counts shown earlier are all zero.
df['is_weekend'] = df['is_weekend'].astype(int)
df['is_holiday'] = df['is_holiday'].astype(int)

# Create vocabulary mappings with padding index for unknown.
# Index 0 is reserved for 'unknown'; real values are numbered from 1 in order
# of first appearance. The 'unknown' filter is done on plain Python values
# rather than with a NumPy boolean mask: comparing a numeric ndarray to a
# string is NumPy-version dependent (older releases return a scalar instead of
# an element-wise mask), which breaks integer columns such as category_id.
vocab_maps = {}
for cat in categorical_features:
    known_vals = [val for val in df[cat].unique().tolist() if val != 'unknown']
    vocab_maps[cat] = {'unknown': 0}  # Padding/unknown index
    vocab_maps[cat].update({val: idx for idx, val in enumerate(known_vals, start=1)})

In [7]:
# Standardise numerical features to zero mean and unit variance
# (StandardScaler performs z-score scaling, not min-max scaling).
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split in the next cell, so test rows contribute to the scaling statistics.
scaler = StandardScaler().fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

In [8]:
# Hold out 20% of the rows as a test subset; the fixed seed keeps the
# partition reproducible across runs (used for inference on different subsets).
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Custom Dataset class for PyTorch
class MetadataDataset(Dataset):
    """Tensor-backed dataset of numerical and categorical metadata features.

    Args:
        df: DataFrame holding every column named in ``numerical_features`` and
            ``categorical_features``.
        numerical_features: Column names stacked, in order, into one float32
            tensor of shape ``[len(df), len(numerical_features)]``.
        categorical_features: Column names converted to integer index tensors.
        vocab_maps: ``{column: {value: index}}`` lookup tables. Values missing
            from a table map to that table's ``'unknown'`` index (0 if the
            table has no ``'unknown'`` entry) instead of raising ``KeyError``.

    Each item is a dict with key ``'numerical'`` (1-D float32 tensor) plus one
    key per categorical column (0-D long tensor).
    """

    def __init__(self, df, numerical_features, categorical_features, vocab_maps):
        # Coerce explicitly so bool/int columns cannot produce an object array.
        self.numerical = torch.tensor(
            df[numerical_features].to_numpy(dtype='float32'), dtype=torch.float32
        )

        self.categorical = {}
        for cat in categorical_features:
            mapping = vocab_maps[cat]
            unknown_idx = mapping.get('unknown', 0)
            # Unseen categories fall back to the reserved unknown/padding slot.
            indices = [mapping.get(val, unknown_idx) for val in df[cat]]
            self.categorical[cat] = torch.tensor(indices, dtype=torch.long)

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.numerical)

    def __getitem__(self, idx):
        """Return the feature dict for row ``idx``."""
        return {
            'numerical': self.numerical[idx],
            **{cat: self.categorical[cat][idx] for cat in self.categorical}
        }

In [10]:
# Loader settings shared by both splits
batch_size = 32

# Wrap each split in a MetadataDataset using the shared feature lists and vocabularies
train_dataset, test_dataset = [
    MetadataDataset(split_df, numerical_features, categorical_features, vocab_maps)
    for split_df in (train_df, test_df)
]

# Neither loader shuffles, so batches are drawn in each split's row order
train_loader, test_loader = [
    DataLoader(dataset, batch_size=batch_size, shuffle=False)
    for dataset in (train_dataset, test_dataset)
]

In [11]:
# MetadataTower Module
class MetadataTower(nn.Module):
    """Encode numerical and categorical metadata into an L2-normalised vector.

    Categorical inputs are embedded, concatenated with the numerical features,
    passed through an MLP (Linear -> LayerNorm -> ReLU per hidden layer, then a
    final Linear), fed through a multi-head attention layer and L2-normalised.

    Args:
        num_numerical: Number of numerical input features.
        embed_dims: ``{feature_name: (vocab_size, embed_dim)}``; one embedding
            table is created per entry, with index 0 as the padding index.
        ffnn_layers: Hidden layer widths of the MLP.
        output_dim: Size of the returned vector; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(self, num_numerical, embed_dims, ffnn_layers=(128, 256), output_dim=256, num_heads=4):
        super().__init__()

        # Categorical Embeddings with larger dims for semantic richness
        self.embeddings = nn.ModuleDict({
            cat: nn.Embedding(vocab_size, embed_dim, padding_idx=0)
            for cat, (vocab_size, embed_dim) in embed_dims.items()
        })

        # Attention mechanism for feature weighting.
        # batch_first=True makes the [batch, 1, output_dim] input in forward()
        # a batch of length-1 sequences. With the default layout the batch axis
        # is read as the sequence axis, so samples attend to each other and an
        # embedding changes depending on which rows share its batch.
        self.attention = nn.MultiheadAttention(
            embed_dim=output_dim, num_heads=num_heads, batch_first=True
        )

        # Calculate input dim for FFNN: numerical + sum of embed dims
        embed_total_dim = sum(embed_dim for _, embed_dim in embed_dims.values())
        input_dim = num_numerical + embed_total_dim

        # Feedforward Neural Network (MLP) whose output feeds the attention layer
        layers = []
        current_dim = input_dim
        for hidden_dim in ffnn_layers:
            layers.append(nn.Linear(current_dim, hidden_dim))
            layers.append(nn.LayerNorm(hidden_dim))  # Add normalization for stability
            layers.append(nn.ReLU())
            current_dim = hidden_dim
        layers.append(nn.Linear(current_dim, output_dim))
        self.ffnn = nn.Sequential(*layers)

    def forward(self, numerical, categorical):
        """Return one unit-norm metadata vector per sample.

        Args:
            numerical: Float tensor of shape ``[batch, num_numerical]``.
            categorical: Dict mapping every embedded feature name to a long
                tensor of shape ``[batch]`` holding vocabulary indices.

        Returns:
            Float tensor of shape ``[batch, output_dim]`` with unit L2 norm
            along dim 1.
        """
        # Embed categorical features in the module's own table order, so the
        # model does not depend on a notebook-level global feature list.
        embed_outs = [
            embedding(categorical[cat]) for cat, embedding in self.embeddings.items()
        ]

        # Concatenate embeddings and numerical features
        combined = torch.cat([numerical] + embed_outs, dim=1)  # Shape: [batch, input_dim]

        # Pass through FFNN to get initial embedding
        ffnn_out = self.ffnn(combined)  # Shape: [batch, output_dim]

        # Self-attention over a length-1 sequence per sample. With a single
        # key the softmax weight is 1, so this applies the value and output
        # projections independently to every sample.
        tokens = ffnn_out.unsqueeze(1)  # Shape: [batch, 1, output_dim]
        attn_out, _ = self.attention(tokens, tokens, tokens, need_weights=False)
        attn_out = attn_out.squeeze(1)  # Shape: [batch, output_dim]

        # Final metadata vector
        metadata_vec = nn.functional.normalize(attn_out, p=2, dim=1)  # L2 normalization
        return metadata_vec

In [12]:
# Model Instantiation
# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pair each categorical feature's vocabulary size with its embedding width
embed_dims = {
    cat: (len(vocab_maps[cat]), width)
    for cat, width in {
        'country': 32,
        'kind': 16,
        'language': 64,
        'category_id': 32,
        'part_of_day': 16,
        'is_holiday': 8,
    }.items()
}

# Two 256-wide hidden layers feeding a 256-dimensional output vector
model = MetadataTower(
    num_numerical=len(numerical_features),
    embed_dims=embed_dims,
    ffnn_layers=[256, 256],
    output_dim=256,
)

# Move to device (the returned module is displayed as the cell output)
model.to(device)

MetadataTower(
  (embeddings): ModuleDict(
    (country): Embedding(2, 32, padding_idx=0)
    (kind): Embedding(2, 16, padding_idx=0)
    (language): Embedding(65, 64, padding_idx=0)
    (category_id): Embedding(17, 32, padding_idx=0)
    (part_of_day): Embedding(5, 16, padding_idx=0)
    (is_holiday): Embedding(3, 8, padding_idx=0)
  )
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
  )
  (ffnn): Sequential(
    (0): Linear(in_features=179, out_features=256, bias=True)
    (1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (5): ReLU()
    (6): Linear(in_features=256, out_features=256, bias=True)
  )
)

In [13]:
# Generate Embeddings
# NOTE(review): no training step is visible before this cell, so the vectors
# presumably come from randomly initialised weights - confirm before relying
# on them.
model.eval()  # inference mode
batch_outputs = []
with torch.no_grad():  # gradients are not needed for a forward-only pass
    for batch in test_loader:
        categorical = {cat: batch[cat].to(device) for cat in categorical_features}
        numerical = batch['numerical'].to(device)
        metadata_vec = model(numerical, categorical)
        # Bring each batch back to host memory as a NumPy array
        batch_outputs.append(metadata_vec.cpu().numpy())

# Stack the per-batch arrays into a single [num_samples, 256] matrix
all_embeddings = np.concatenate(batch_outputs, axis=0)
print("Total Embeddings Shape:", all_embeddings.shape)  # Should be [num_samples, 256]
print("Sample Embedding (first 5 dims of first sample):", all_embeddings[0][:5])

print("Metadata Tower built with semantic embeddings!")

Total Embeddings Shape: (19554, 256)
Sample Embedding (first 5 dims of first sample): [-0.03823535 -0.04874016 -0.0790294  -0.02878784  0.03985287]
Metadata Tower built with semantic embeddings!
