In [None]:
# STEP 1.1 ──────────────────────────────────────────────────
# Install core libraries. Ensure compatibility if needed, though the prompt's versions should be okay
!pip install -q --upgrade \
transformers \
torch \
pandas \
numpy \
scikit-learn \
nltk
print(" Libraries installed.")



[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.
torchvision 0.21.0+cu124 requires torch==2.6.0, but you have torch 2.7.0 which is incompatible.
fastai 2.7.19 requires torch<2.7,>=1.10, but you have torch 2.7.0 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.5 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.5 which is incompatible.[0m[31m
[0m Libraries installed.


In [None]:
# --- Step 1: Install Libraries and Download NLTK Data ---
print("Installing necessary libraries...")
!pip install -q pandas nltk tqdm

import nltk
print("\nDownloading VADER lexicon...")
# This command checks if vader_lexicon is already present and downloads if not.
nltk.download('vader_lexicon')
print("VADER lexicon download process completed.") # It might have already been there.

print("\nSetup complete.")


Installing necessary libraries...

Downloading VADER lexicon...


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


VADER lexicon download process completed.

Setup complete.


In [None]:
# --- Step 2: Import Libraries and Initialize VADER ---
print("\nImporting libraries and initializing VADER...")
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.auto import tqdm # For progress bar

# Initialize VADER Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

print("Libraries imported and VADER initialized.")


Importing libraries and initializing VADER...
Libraries imported and VADER initialized.


In [None]:
# --- Step 3: Load News Data ---
print("\nLoading news data...")
# Path to your news.tsv file in Google Colab
news_path = '/content/news.tsv'

# Load the news file
try:
    news_df = pd.read_csv(news_path, sep='\t', header=None, names=[
        'article_id', 'category', 'subcategory', 'title', 'abstract',
        'url', 'title_entities', 'abstract_entities'
    ])
    print(f"Loaded {len(news_df)} news articles from {news_path}.")
    print("First 5 rows:")
    display(news_df.head())

except FileNotFoundError:
    print(f"ERROR: File not found at {news_path}. Please upload news.tsv to your Colab session.")
    news_df = None
except Exception as e:
    print(f"ERROR loading news.tsv: {e}")
    news_df = None



Loading news data...
Loaded 50944 news articles from /content/news.tsv.
First 5 rows:


Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [None]:
 # --- Step 4: Define VADER Sentiment Function ---
print("\nDefining VADER sentiment function...")
def get_vader_sentiment_label(text):
    """
    Gets sentiment label (positive, negative, neutral) for text using VADER.
    Handles non-string or empty input gracefully.
    """
    # Ensure input is a string and not empty after stripping whitespace
    if not isinstance(text, str) or not text.strip():
        return 'neutral'

    try:
        # Get VADER compound score
        score = sia.polarity_scores(text)['compound']

        # Classify based on standard VADER thresholds
        if score >= 0.05:
            return 'positive'
        elif score <= -0.05:
            return 'negative'
        else:
            return 'neutral'
    except Exception as e:
        # Return neutral on processing error
        # print(f"Warning: Error processing text '{text[:50]}...' for sentiment: {e}") # Optional: uncomment for verbose errors
        return 'neutral'

print("VADER sentiment function defined.")



Defining VADER sentiment function...
VADER sentiment function defined.


In [None]:
# --- Step 5: Apply Sentiment to News Data ---
if news_df is not None:
    print("\nApplying VADER sentiment analysis to news titles...")

    # We'll use the 'title' column for sentiment analysis.
    # Ensure the column is string type and handle potential NaNs.
    # Create a temporary column for processing
    news_df['title_for_sentiment'] = news_df['title'].fillna('').astype(str)

    # Initialize tqdm for pandas apply progress bar
    tqdm.pandas()

    # Apply the sentiment function to the temporary column
    news_df['sentiment'] = news_df['title_for_sentiment'].progress_apply(get_vader_sentiment_label)

    # Drop the temporary column as it's no longer needed
    news_df = news_df.drop(columns=['title_for_sentiment'])

    print("\nSentiment analysis complete!")
    print("DataFrame head with new sentiment column:")
    display(news_df.head())

    print("\nSentiment distribution:")
    print(news_df['sentiment'].value_counts())

else:
    print("Skipping sentiment analysis as news_df was not loaded.")


Applying VADER sentiment analysis to news titles...


  0%|          | 0/50944 [00:00<?, ?it/s]


Sentiment analysis complete!
DataFrame head with new sentiment column:


Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,sentiment
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],negative
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",negative
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",negative
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",negative
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...",neutral



Sentiment distribution:
sentiment
neutral     20367
negative    16485
positive    14092
Name: count, dtype: int64


In [None]:
# --- Step 6: Save DataFrame to CSV ---
if news_df is not None:
    print("\nSaving DataFrame to CSV...")
    # Define the output file path
    output_csv_path = '/content/news_with_sentiment.csv'

    # Save the DataFrame with the new 'sentiment' column to CSV
    try:
        news_df.to_csv(output_csv_path, index=False)
        print(f"\nDataFrame with sentiment saved successfully to '{output_csv_path}'.")
        print("\n--- Download Instructions ---")
        print("You can now download 'news_with_sentiment.csv' from the Google Colab 'Files' sidebar.")
        print("1. Click the folder icon on the left sidebar.")
        print("2. If you don't see the file immediately, click the refresh button (circular arrow).")
        print("3. Find 'news_with_sentiment.csv' in the list (it will be in the /content/ directory).")
        print("4. Right-click on the file name and select 'Download'.")

    except Exception as e:
        print(f"\nERROR saving CSV file: {e}")
else:
    print("Skipping CSV saving as news_df was not loaded.")


Saving DataFrame to CSV...

DataFrame with sentiment saved successfully to '/content/news_with_sentiment.csv'.

--- Download Instructions ---
You can now download 'news_with_sentiment.csv' from the Google Colab 'Files' sidebar.
1. Click the folder icon on the left sidebar.
2. If you don't see the file immediately, click the refresh button (circular arrow).
3. Find 'news_with_sentiment.csv' in the list (it will be in the /content/ directory).
4. Right-click on the file name and select 'Download'.


In [None]:
# @title Complete Code: Add RoBERTa Sentiment to News Data and Create Downloadable CSV (Attempt 3)

# --- Step 1: Install Libraries ---
# **FIXED:** Splitting installation into two steps to potentially resolve dependency conflicts
print("Installing core libraries (pandas, nltk, tqdm)...")
!pip install -q --upgrade pandas nltk tqdm

print("Installing ML/NLP libraries (transformers, torch, numpy<2)...")
# Adding numpy<2 to ensure compatibility with transformers
!pip install -q --upgrade transformers torch numpy<2

# --- Step 2: Download NLTK Data (still needed for potential text processing if we were using abstracts heavily, but good practice) ---
# Although RoBERta handles tokenization internally, having NLTK setup doesn't hurt.
import nltk
print("\nDownloading VADER lexicon...")
# This command checks if vader_lexicon is already present and downloads if not.
nltk.download('vader_lexicon', quiet=True) # Download VADER just in case, quiet=True hides progress
# You might need other nltk data depending on preprocessing, but for RoBERTa apply, mostly tokenizers are sufficient.

print("VADER lexicon download process completed.") # It might have already been there.
print("\nSetup complete.")

# --- Step 3: Import Libraries and Initialize RoBERTa Model ---
print("\nImporting libraries and initializing RoBERTa model...")
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from tqdm.auto import tqdm # For progress bar

# Define the RoBERTa sentiment model name
roberta_model_name = "cardiffnlp/twitter-roberta-base-sentiment"

# Load the model and tokenizer using the pipeline
try:
    # Using a pipeline simplifies things for inference
    # device=0 if torch.cuda.is_available() else -1 -> Use GPU if available (0 is usually the first GPU)
    roberta_classifier = pipeline(
        "sentiment-analysis",
        model=roberta_model_name,
        tokenizer=roberta_model_name,
        device=0 if torch.cuda.is_available() else -1
    )
    print(f"RoBERTa sentiment model '{roberta_model_name}' loaded.")
    if torch.cuda.is_available():
        print("Model moved to GPU.")
    else:
         print("Running on CPU (GPU recommended for speed).")


except Exception as e:
    print(f"ERROR loading RoBERTa model: {e}")
    roberta_classifier = None # Set classifier to None if loading failed


print("Libraries imported and RoBERTa model initialized (if successful).")


# --- Step 4: Load News Data ---
print("\nLoading news data...")
# Path to your news.tsv file in Google Colab
news_path = '/content/news.tsv'

# Load the news file
try:
    news_df = pd.read_csv(news_path, sep='\t', header=None, names=[
        'article_id', 'category', 'subcategory', 'title', 'abstract',
        'url', 'title_entities', 'abstract_entities'
    ])
    print(f"Loaded {len(news_df)} news articles from {news_path}.")
    print("First 5 rows:")
    display(news_df.head())

except FileNotFoundError:
    print(f"ERROR: File not found at {news_path}. Please upload news.tsv to your Colab session.")
    news_df = None
except Exception as e:
    print(f"ERROR loading news.tsv: {e}")
    news_df = None


# --- Step 5: Define RoBERTa Sentiment Function (or use pipeline directly) ---
# Using the pipeline makes the function definition very simple.
# The pipeline function itself handles batching internally when applied correctly,
# but applying row-wise with pandas .apply and tqdm is easier to integrate here.
# For large scale, direct pipeline usage on a list or batching is better.
# Let's create a simple wrapper for .apply
print("\nDefining RoBERTa sentiment function...")
def get_roberta_sentiment_label(text):
     """
     Gets sentiment label (positive, negative, neutral) for text using RoBERTa pipeline.
     Handles non-string or empty input gracefully.
     """
     # Return neutral immediately if the model wasn't loaded successfully
     if roberta_classifier is None:
         return 'neutral'

     # Ensure input is a string and not empty after stripping whitespace
     if not isinstance(text, str) or not text.strip():
         return 'neutral'

     try:
         # The pipeline returns a list of results, e.g., [{'label': 'neutral', 'score': 0.9}]
         # The labels from cardiffnlp/twitter-roberta-base-sentiment are 'neutral', 'positive', 'negative'.
         result = roberta_classifier(text)
         return result[0]['label'].lower() # Return the predicted label (lowercase)

     except Exception as e:
         # Return neutral on processing error
         # print(f"Warning: Error processing text '{text[:50]}...' for sentiment with RoBERTa: {e}") # Optional: uncomment for verbose errors
         return 'neutral'

print("RoBERTa sentiment function defined.")


# --- Step 6: Apply Sentiment to News Data ---
# Check if news_df was loaded AND the RoBERTa classifier was initialized
if news_df is not None and roberta_classifier is not None:
    print("\nApplying RoBERTa sentiment analysis to news titles...")
    print("This will take significantly longer than VADER.")

    # We'll use the 'title' column for sentiment analysis.
    # Ensure the column is string type and handle potential NaNs.
    # Create a temporary column for processing
    news_df['title_for_sentiment'] = news_df['title'].fillna('').astype(str)

    # Initialize tqdm for pandas apply progress bar
    tqdm.pandas()

    # Apply the sentiment function to the temporary column
    # Using .progress_apply is easy but not the most efficient way to use the pipeline
    # for very large datasets. For simplicity here, we stick to it.
    news_df['sentiment'] = news_df['title_for_sentiment'].progress_apply(get_roberta_sentiment_label)

    # Drop the temporary column as it's no longer needed
    news_df = news_df.drop(columns=['title_for_sentiment'])

    print("\nSentiment analysis complete!")
    print("DataFrame head with new sentiment column:")
    display(news_df.head())

    print("\nSentiment distribution:")
    print(news_df['sentiment'].value_counts())

elif news_df is None:
    print("\nSkipping sentiment analysis as news_df was not loaded.")
elif roberta_classifier is None:
     print("\nSkipping sentiment analysis as RoBERTa model could not be loaded.")


# --- Step 7: Save DataFrame to CSV ---
# This step remains the same as before
# Check if news_df was loaded AND the sentiment column was successfully added
if news_df is not None and 'sentiment' in news_df.columns:
    print("\nSaving DataFrame to CSV...")
    # Define the output file path
    output_csv_path = '/content/news_with_sentiment_roberta.csv' # Changed filename to indicate RoBERTa

    # Save the DataFrame with the new 'sentiment' column to CSV
    try:
        news_df.to_csv(output_csv_path, index=False)
        print(f"\nDataFrame with sentiment saved successfully to '{output_csv_path}'.")
        print("\n--- Download Instructions ---")
        print(f"You can now download '{output_csv_path.split('/')[-1]}' from the Google Colab 'Files' sidebar.")
        print("1. Click the folder icon on the left sidebar.")
        print("2. If you don't see the file immediately, click the refresh button (circular arrow).")
        print(f"3. Find '{output_csv_path.split('/')[-1]}' in the list (it will be in the /content/ directory).")
        print("4. Right-click on the file name and select 'Download'.")

    except Exception as e:
        print(f"\nERROR saving CSV file: {e}")
else:
    print("\nSkipping CSV saving as news_df was not loaded or sentiment analysis failed.")

Installing core libraries (pandas, nltk, tqdm)...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m130.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.[0m[31m
[0mInstalling ML/NLP libraries (transformers, torch, numpy<2)...
/bin/bash: line 1: 2: No such file or directory

Downloading VADER lexicon...
VADER lexicon download process completed.

Setup complete.

Importing libraries and initializing RoBERTa model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:0


RoBERTa sentiment model 'cardiffnlp/twitter-roberta-base-sentiment' loaded.
Model moved to GPU.
Libraries imported and RoBERTa model initialized (if successful).

Loading news data...
Loaded 51282 news articles from /content/news.tsv.
First 5 rows:


Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."



Defining RoBERTa sentiment function...
RoBERTa sentiment function defined.

Applying RoBERTa sentiment analysis to news titles...
This will take significantly longer than VADER.


  0%|          | 0/51282 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Sentiment analysis complete!
DataFrame head with new sentiment column:


Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,sentiment
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],label_1
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",label_0
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",label_1
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",label_0
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...",label_1



Sentiment distribution:
sentiment
label_1    32817
label_0    12413
label_2     6052
Name: count, dtype: int64

Saving DataFrame to CSV...

DataFrame with sentiment saved successfully to '/content/news_with_sentiment_roberta.csv'.

--- Download Instructions ---
You can now download 'news_with_sentiment_roberta.csv' from the Google Colab 'Files' sidebar.
1. Click the folder icon on the left sidebar.
2. If you don't see the file immediately, click the refresh button (circular arrow).
3. Find 'news_with_sentiment_roberta.csv' in the list (it will be in the /content/ directory).
4. Right-click on the file name and select 'Download'.


In [None]:
# --- Step 1: Install Libraries ---
# Ensure necessary libraries are installed
print("Installing necessary libraries...")
# Including scikit-learn for AUC/ROC metrics
!pip install -q --upgrade pandas nltk tqdm transformers torch numpy<2 scikit-learn

print("\nLibrary installation step completed.")

Installing necessary libraries...
/bin/bash: line 1: 2: No such file or directory

Library installation step completed.


In [None]:
# --- Step 2: Download NLTK Data ---
import nltk
print("\nDownloading NLTK data...")
# Although RoBERTa/VADER are handled, it's good practice if other NLTK parts are used later
nltk.download('vader_lexicon', quiet=True)
print("NLTK data download process completed.")
print("\nSetup complete.")


Downloading NLTK data...
NLTK data download process completed.

Setup complete.


In [None]:
# --- Step 3: Import Libraries ---
print("\nImporting libraries...")
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm # For progress bar
from sklearn.metrics import roc_auc_score, roc_curve # Import roc_curve

print("Libraries imported.")


Importing libraries...
Libraries imported.


In [None]:
# --- Step 4: Load News Data with RoBERTa Sentiment ---
print("\nLoading news data with RoBERTa sentiment...")
# Path to the news CSV with sentiment
news_with_sentiment_path = '/content/news_with_sentiment_roberta.csv'

try:
    news_df = pd.read_csv(news_with_sentiment_path)
    print(f"Loaded {len(news_df)} news articles from {news_with_sentiment_path}.")
    print("First 5 rows:")
    display(news_df.head())

    # Check if the 'sentiment' column exists
    if 'sentiment' not in news_df.columns:
        raise ValueError("The 'sentiment' column was not found in the loaded CSV.")

except FileNotFoundError:
    print(f"ERROR: File not found at {news_with_sentiment_path}. Please ensure you have run the sentiment analysis code and uploaded the CSV.")
    news_df = None
except Exception as e:
    print(f"ERROR loading news data with sentiment: {e}")
    news_df = None


Loading news data with RoBERTa sentiment...
Loaded 51282 news articles from /content/news_with_sentiment_roberta.csv.
First 5 rows:


Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,sentiment
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],label_1
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",label_0
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",label_1
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",label_0
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...",label_1


In [None]:
# --- Step 5: Load and Deduplicate Behaviors Data ---
print("\nLoading behaviors data...")
# Path to your behaviors.tsv file
behaviors_path = '/content/behaviors.tsv'

try:
    behaviors_df = pd.read_csv(behaviors_path, sep='\t', header=None, names=[
        'impression_id', 'user_id', 'time', 'click_history', 'impressions'
    ])
    print(f"Loaded {len(behaviors_df)} user behavior logs from {behaviors_path}.")
    display(behaviors_df.head(2))

    # --- Deduplication ---
    initial_rows = len(behaviors_df)
    behaviors_df = behaviors_df.drop_duplicates().reset_index(drop=True)
    rows_after_dedup = len(behaviors_df)
    if initial_rows > rows_after_dedup:
        print(f"\nRemoved {initial_rows - rows_after_dedup} duplicate rows from behaviors_df.")
    else:
        print("\nNo duplicate rows found in behaviors_df.")
    print(f"Behaviors DataFrame size after deduplication: {len(behaviors_df)}")


    # Fill missing click history and impressions with empty strings
    behaviors_df['click_history'] = behaviors_df['click_history'].fillna('')
    behaviors_df['impressions'] = behaviors_df['impressions'].fillna('')


except FileNotFoundError:
    print(f"ERROR: File not found at {behaviors_path}. Please upload behaviors.tsv.")
    behaviors_df = None
except Exception as e:
    print(f"ERROR loading behaviors.tsv: {e}")
    behaviors_df = None


Loading behaviors data...
Loaded 156965 user behavior logs from /content/behaviors.tsv.


Unnamed: 0,impression_id,user_id,time,click_history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...



No duplicate rows found in behaviors_df.
Behaviors DataFrame size after deduplication: 156965


In [None]:
# --- Step 6: Prepare Article Embeddings with Sentiment and User Embeddings ---
if news_df is not None and behaviors_df is not None:
    print("\nPreparing article embeddings with sentiment and user embeddings...")

    # Define base embedding dimension (for the random part)
    base_embedding_dim = 100
    # Define sentiment embedding dimension (one-hot encoding)
    sentiment_embedding_dim = 3 # For 'positive', 'neutral', 'negative'
    # Total article embedding dimension
    embedding_dim = base_embedding_dim + sentiment_embedding_dim
    print(f"Article embedding dimension (base + sentiment): {embedding_dim}")

    np.random.seed(42) # for reproducibility of random part

    # Create base random embeddings for articles
    base_article_embeddings = np.random.normal(0, 1, size=(len(news_df), base_embedding_dim))

    # Create sentiment embeddings (one-hot encoding)
    # Map sentiment labels to indices
    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    # Ensure all sentiment labels in the DataFrame are in the map
    if not all(s in sentiment_map for s in news_df['sentiment'].unique()):
        print("Warning: Found unexpected sentiment labels. Defaulting them to 'neutral'.")
        news_df['sentiment'] = news_df['sentiment'].apply(lambda x: x if x in sentiment_map else 'neutral')


    sentiment_indices = news_df['sentiment'].map(sentiment_map).values
    # Create one-hot encoding
    sentiment_one_hot = np.eye(sentiment_embedding_dim)[sentiment_indices]

    # Concatenate base embeddings and sentiment embeddings
    article_embeddings = np.concatenate((base_article_embeddings, sentiment_one_hot), axis=1)

    print(f"Created article embeddings NumPy array of shape: {article_embeddings.shape}")

    # Create a mapping from article_id to its index (row number in news_df and article_embeddings)
    article_id_to_idx = {article_id: idx for idx, article_id in enumerate(news_df['article_id'])}
    print(f"Created article_id to index mapping for {len(article_id_to_idx)} articles.")

    # --- Fast Create User Embeddings ---
    print("\nGenerating user embeddings...")
    user_embeddings = {}
    users_processed = 0
    skipped_users = 0
    skipped_articles_in_history = 0 # Count articles in history not found in news_df

    # Use tqdm for progress bar
    from tqdm.auto import tqdm
    for _, row in tqdm(behaviors_df.iterrows(), total=len(behaviors_df), desc="Processing Users for Embeddings"):
        user_id = row['user_id']
        clicked_article_indices = [] # Store indices of clicked articles

        # Process click history if it's a non-empty string
        if isinstance(row['click_history'], str) and row['click_history'].strip():
            clicked_ids = row['click_history'].split() # Split IDs by space
            for aid in clicked_ids:
                if aid in article_id_to_idx:
                    # Append the index if the article ID is found
                    clicked_article_indices.append(article_id_to_idx[aid])
                else:
                    skipped_articles_in_history += 1 # Count articles not found in mapping

        # Calculate user embedding if they clicked known articles
        if clicked_article_indices:
            # Get the embeddings for the clicked articles using their indices
            embeddings_for_user = article_embeddings[clicked_article_indices]
            # Calculate the mean embedding (average pooling)
            user_embedding = np.mean(embeddings_for_user, axis=0)
            users_processed += 1
        else:
            # Cold-start: User has no known click history or clicked unknown articles
            # Use a zero vector for cold-start users
            user_embedding = np.zeros(embedding_dim, dtype=np.float32)
            skipped_users +=1 # Count users with no valid history

        user_embeddings[user_id] = user_embedding

    print(f"\nCreated embeddings for {users_processed} users with click history.")
    print(f"Generated zero embeddings for {skipped_users} cold-start users.")
    if skipped_articles_in_history > 0:
        print(f"Skipped {skipped_articles_in_history} article clicks in history (article_id not found in news_df).")

    # Check shape of a sample embedding
    if user_embeddings:
        sample_user_id = list(user_embeddings.keys())[0]
        print(f"Sample user ('{sample_user_id}') embedding shape: {user_embeddings[sample_user_id].shape}")
    else:
        print("No user embeddings were created.")


else:
    print("\nSkipping embedding preparation as news_df or behaviors_df was not loaded.")


Preparing article embeddings with sentiment and user embeddings...
Article embedding dimension (base + sentiment): 103
Created article embeddings NumPy array of shape: (51282, 103)
Created article_id to index mapping for 51282 articles.

Generating user embeddings...


Processing Users for Embeddings:   0%|          | 0/156965 [00:00<?, ?it/s]


Created embeddings for 153727 users with click history.
Generated zero embeddings for 3238 cold-start users.
Sample user ('U13740') embedding shape: (103,)


In [None]:
# --- Step 7: Define a simple Twin Tower (Siamese) Model ---
print("\nDefining TwinTowers model...")
import torch.nn as nn

class TwinTowers(nn.Module):
    def __init__(self, embedding_dim, hidden_dim=128, output_dim=64):
        super(TwinTowers, self).__init__()
        self.embedding_dim = embedding_dim

        # Article Tower
        self.article_tower = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
            # Consider adding BatchNorm or Dropout here if needed
            # nn.BatchNorm1d(output_dim)
        )

        # User Tower
        self.user_tower = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
            # nn.BatchNorm1d(output_dim)
        )

    def forward(self, user_emb, article_emb):
        # Ensure input tensors are FloatTensors
        user_emb = user_emb.float()
        article_emb = article_emb.float()

        # Pass through respective towers
        user_latent = self.user_tower(user_emb)
        article_latent = self.article_tower(article_emb)

        # Calculate dot product for similarity (cosine similarity is also common)
        # Normalize vectors for cosine similarity (optional, helps stability)
        # user_latent = F.normalize(user_latent, p=2, dim=1)
        # article_latent = F.normalize(article_latent, p=2, dim=1)
        # Element-wise product and sum
        dot_product = torch.sum(user_latent * article_latent, dim=1)

        # Output a probability using sigmoid
        return torch.sigmoid(dot_product)

# Quick check: Instantiate model with updated embedding_dim and print structure
if 'embedding_dim' in locals():
    temp_model = TwinTowers(embedding_dim=embedding_dim)
    print(temp_model)
    print(f"\nTwinTowers model defined with input dimension: {embedding_dim}.")
else:
    print("\nTwinTowers model definition skipped as embedding_dim is not defined.")


Defining TwinTowers model...
TwinTowers(
  (article_tower): Sequential(
    (0): Linear(in_features=103, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
  )
  (user_tower): Sequential(
    (0): Linear(in_features=103, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
  )
)

TwinTowers model defined with input dimension: 103.


In [None]:
# --- Step 8: Prepare Training and Test Data ---
# Restructure test data preparation to enable MRR calculation
if 'behaviors_df' is not None and 'user_embeddings' in locals() and 'article_embeddings' in locals() and 'article_id_to_idx' in locals():
    print("\nPreparing training and test data...")

    train_pairs = []
    test_pairs_for_auc_roc = [] # Simple list for flattened AUC/ROC calculation
    # For MRR, we need to group by impression session
    test_sessions_for_mrr = {} # Dictionary: {impression_id: [(user_emb, article_emb, label, article_id), ...]}

    positive_samples_train = 0
    negative_samples_train = 0
    positive_samples_test = 0
    negative_samples_test = 0


    max_samples_per_user_train = 3 # Limit positive/negative samples per interaction for training
    # For test, we need all impressions in a session for MRR

    # All article indices list (potential candidates for random negative sampling if needed)
    all_article_indices = list(range(len(article_embeddings)))

    # Determine the split point for train/test sessions
    # Split behaviors_df to ensure sessions don't overlap between train and test
    # This is better than splitting random pairs after generation
    print("Splitting behaviors into train and test sets by session...")
    train_behaviors_df = behaviors_df.sample(frac=0.8, random_state=42).reset_index(drop=True)
    test_behaviors_df = behaviors_df.drop(train_behaviors_df.index).reset_index(drop=True)

    print(f"Split behaviors into {len(train_behaviors_df)} train and {len(test_behaviors_df)} test sessions.")

    # Process Training Sessions
    print("Generating training pairs...")
    for _, row in tqdm(train_behaviors_df.iterrows(), total=len(train_behaviors_df), desc="Generating Train Pairs"):
        user_id = row['user_id']
        # Skip if user embedding doesn't exist (e.g., user only appeared once with unknown clicks)
        if user_id not in user_embeddings:
            continue
        user_emb = user_embeddings[user_id] # Get the pre-computed user embedding

        clicked_indices = []
        if isinstance(row['click_history'], str) and row['click_history'].strip():
            for aid in row['click_history'].split():
                if aid in article_id_to_idx:
                    clicked_indices.append(article_id_to_idx[aid])

        # --- Positive Samples (for training) ---
        # Sample up to N positive examples from this interaction's history
        pos_indices_to_add = clicked_indices[:max_samples_per_user_train]
        for pos_idx in pos_indices_to_add:
            # Append tuple: (user_embedding, article_embedding, label=1)
            train_pairs.append((user_emb, article_embeddings[pos_idx], 1))
            positive_samples_train += 1

        # --- Negative Samples (for training) ---
        impressed_indices = []
        if isinstance(row['impressions'], str) and row['impressions'].strip():
             # Impressions format: 'article_id-label article_id-label ...'
            for impression_pair in row['impressions'].split():
                aid_label = impression_pair.split('-')
                if len(aid_label) == 2:
                    aid, label_str = aid_label
                    # Only consider explicitly non-clicked items as negative impressions
                    if label_str == '0' and aid in article_id_to_idx:
                        impressed_indices.append(article_id_to_idx[aid])

        # Use negatives from impressions first (up to max_samples_per_user_train)
        neg_indices_from_impressions = impressed_indices[:max_samples_per_user_train]

        # If not enough negatives from impressions, sample randomly (excluding clicked articles in this session)
        num_negatives_needed = max_samples_per_user_train - len(neg_indices_from_impressions)
        random_neg_indices = []
        if num_negatives_needed > 0:
             # Pool of potential random negatives: all articles MINUS articles clicked in this session
             clicked_article_ids_in_session = set(row['click_history'].split() if isinstance(row['click_history'], str) else [])
             clicked_indices_set = set(article_id_to_idx[aid] for aid in clicked_article_ids_in_session if aid in article_id_to_idx)
             negative_pool_indices = list(set(all_article_indices) - clicked_indices_set)

             if negative_pool_indices: # Ensure pool is not empty
                # Sample without replacement
                random_neg_indices = np.random.choice(
                    negative_pool_indices,
                    min(num_negatives_needed, len(negative_pool_indices)), # Don't sample more than available
                    replace=False
                ).tolist() # Convert to list


        neg_indices_to_add = neg_indices_from_impressions + random_neg_indices
        # Ensure we don't add more than max_samples_per_user_train from the combination
        neg_indices_to_add = neg_indices_to_add[:max_samples_per_user_train]


        # Add the selected negative samples to train_pairs
        for neg_idx in neg_indices_to_add:
            train_pairs.append((user_emb, article_embeddings[neg_idx], 0))
            negative_samples_train += 1


    # Shuffle training data
    np.random.shuffle(train_pairs)


    # Process Test Sessions (structured for MRR)
    print("Generating test pairs and sessions for MRR evaluation...")
    skipped_articles_in_impressions = 0 # Count articles in impressions not found in news_df

    for _, row in tqdm(test_behaviors_df.iterrows(), total=len(test_behaviors_df), desc="Generating Test Data"):
        user_id = row['user_id']
        impression_id = row['impression_id']
        impressions_str = row['impressions'] # String of 'article_id-label' pairs

        # Skip if user embedding doesn't exist
        if user_id not in user_embeddings:
            continue
        user_emb = user_embeddings[user_id]

        # Process impressions for this session
        if isinstance(impressions_str, str) and impressions_str.strip():
            session_pairs_for_mrr = [] # List of (user_emb, article_emb, label, article_id) for this session

            for impression_pair in impressions_str.split():
                aid_label = impression_pair.split('-')
                if len(aid_label) == 2:
                    aid, label_str = aid_label
                    label = int(label_str) # Convert label to integer (0 or 1)

                    if aid in article_id_to_idx:
                        article_idx = article_id_to_idx[aid]
                        article_emb = article_embeddings[article_idx]

                        # Add to the list for MRR evaluation
                        session_pairs_for_mrr.append((user_emb, article_emb, label, aid))

                        # Also count for AUC/ROC summary
                        if label == 1:
                            positive_samples_test += 1
                        else:
                             negative_samples_test += 1


                    else:
                        skipped_articles_in_impressions += 1

            # Only add the session if it contains at least one impression we could process
            if session_pairs_for_mrr:
                 test_sessions_for_mrr[impression_id] = session_pairs_for_mrr


    # Flatten test_sessions_for_mrr into test_pairs_for_auc_roc
    for impression_id, session_data in test_sessions_for_mrr.items():
        for user_emb, article_emb, label, article_id in session_data:
             test_pairs_for_auc_roc.append((user_emb, article_emb, label))


    if not train_pairs and not test_pairs_for_auc_roc:
        print("\nERROR: No training or test data could be generated. Check input files and logic.")
        train_loader = None
        test_users_auc_roc = None
        test_articles_auc_roc = None
        test_labels_auc_roc_np = None
        test_sessions_for_mrr = {} # Ensure it's empty

    else:
        print(f"\nGenerated {len(train_pairs)} training samples ({positive_samples_train} positive, {negative_samples_train} negative).")
        print(f"Generated {len(test_pairs_for_auc_roc)} samples for AUC/ROC ({positive_samples_test} positive, {negative_samples_test} negative).")
        print(f"Organized test data into {len(test_sessions_for_mrr)} sessions for MRR.")
        if skipped_articles_in_impressions > 0:
            print(f"Skipped {skipped_articles_in_impressions} article impressions (article_id not found in news_df).")


        # --- Create PyTorch Tensors and DataLoaders for Training ---
        print("\nCreating PyTorch tensors and DataLoader for training...")
        train_users = torch.tensor(np.array([x[0] for x in train_pairs]), dtype=torch.float32)
        train_articles = torch.tensor(np.array([x[1] for x in train_pairs]), dtype=torch.float32)
        train_labels = torch.tensor(np.array([x[2] for x in train_pairs]), dtype=torch.float32)

        train_dataset = TensorDataset(train_users, train_articles, train_labels)
        train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True) # Increased batch size for potentially faster training

        print(f"Prepared DataLoader for training with {len(train_loader)} batches.")
        print(f"Train users tensor shape: {train_users.shape}")
        print(f"Train articles tensor shape: {train_articles.shape}")
        print(f"Train labels tensor shape: {train_labels.shape}")


        # --- Prepare PyTorch Tensors for Test (AUC/ROC) ---
        # No DataLoader needed for simple prediction pass for AUC/ROC
        print("\nCreating PyTorch tensors for AUC/ROC evaluation...")
        test_users_auc_roc = torch.tensor(np.array([x[0] for x in test_pairs_for_auc_roc]), dtype=torch.float32)
        test_articles_auc_roc = torch.tensor(np.array([x[1] for x in test_pairs_for_auc_roc]), dtype=torch.float32)
        test_labels_auc_roc_np = np.array([x[2] for x in test_pairs_for_auc_roc]) # Keep as numpy for sklearn

        print(f"Prepared tensors for AUC/ROC evaluation with {len(test_users_auc_roc)} samples.")
        print(f"Test users tensor shape (AUC/ROC): {test_users_auc_roc.shape}")
        print(f"Test articles tensor shape (AUC/ROC): {test_articles_auc_roc.shape}")
        print(f"Test labels numpy shape (AUC/ROC): {test_labels_auc_roc_np.shape}")


else:
    print("\nSkipping data preparation as required DataFrames/embeddings were not loaded.")
    train_loader = None
    test_users_auc_roc = None
    test_articles_auc_roc = None
    test_labels_auc_roc_np = None
    test_sessions_for_mrr = {} # Ensure it's empty


Preparing training and test data...
Splitting behaviors into train and test sets by session...
Split behaviors into 125572 train and 31393 test sessions.
Generating training pairs...


  if 'behaviors_df' is not None and 'user_embeddings' in locals() and 'article_embeddings' in locals() and 'article_id_to_idx' in locals():


Generating Train Pairs:   0%|          | 0/125572 [00:00<?, ?it/s]

Generating test pairs and sessions for MRR evaluation...


Generating Test Data:   0%|          | 0/31393 [00:00<?, ?it/s]


Generated 739311 training samples (362595 positive, 376716 negative).
Generated 1162375 samples for AUC/ROC (47086 positive, 1115289 negative).
Organized test data into 31393 sessions for MRR.

Creating PyTorch tensors and DataLoader for training...
Prepared DataLoader for training with 5776 batches.
Train users tensor shape: torch.Size([739311, 103])
Train articles tensor shape: torch.Size([739311, 103])
Train labels tensor shape: torch.Size([739311])

Creating PyTorch tensors for AUC/ROC evaluation...
Prepared tensors for AUC/ROC evaluation with 1162375 samples.
Test users tensor shape (AUC/ROC): torch.Size([1162375, 103])
Test articles tensor shape (AUC/ROC): torch.Size([1162375, 103])
Test labels numpy shape (AUC/ROC): (1162375,)


In [None]:
# --- Step 9: Train the TwinTowers model ---
if 'train_loader' in locals() and train_loader is not None:
    print("\nStarting model training...")

    # Set device (GPU if available, otherwise CPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Initialize model, optimizer, and loss
    # Make sure embedding_dim matches the shape of input embeddings (103)
    # Check if embedding_dim was successfully defined in Step 6
    if 'embedding_dim' not in locals():
         print("ERROR: embedding_dim was not defined. Skipping training.")
         model = None
    else:
        model = TwinTowers(embedding_dim=embedding_dim).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001) # Adam optimizer
        criterion = nn.BCELoss() # Binary Cross Entropy Loss for click prediction

        num_epochs = 5 # Reduced epochs for quicker demo, increase if needed (e.g., 10-20)

        for epoch in range(num_epochs):
            model.train() # Set model to training mode
            total_loss = 0.0

            # Use tqdm for epoch progress
            progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True)

            for user_batch, article_batch, label_batch in progress_bar:
                # Move batch to the correct device
                user_batch = user_batch.to(device)
                article_batch = article_batch.to(device)
                label_batch = label_batch.to(device) # Labels should be float32 for BCELoss

                # --- Forward Pass ---
                predictions = model(user_batch, article_batch)
                # Squeeze predictions to match label shape [batch_size] vs [batch_size, 1] if needed
                # BCELoss expects predictions and labels to have the same shape
                predictions = predictions.squeeze(-1) if predictions.ndim > 1 else predictions


                # --- Calculate Loss ---
                loss = criterion(predictions, label_batch)

                # --- Backward Pass & Optimize ---
                optimizer.zero_grad() # Clear previous gradients
                loss.backward() # Calculate gradients
                optimizer.step() # Update model weights

                total_loss += loss.item()

                # Update progress bar description with current loss
                progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

            # Print average loss for the epoch
            avg_loss = total_loss / len(train_loader)
            print(f"\nEpoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")

        print("\nTraining finished.")

else:
    print("\nSkipping training - train_loader not prepared.")


Starting model training...
Using device: cuda


Epoch 1/5:   0%|          | 0/5776 [00:00<?, ?it/s]


Epoch 1/5 - Average Loss: 0.1791


Epoch 2/5:   0%|          | 0/5776 [00:00<?, ?it/s]


Epoch 2/5 - Average Loss: 0.1071


Epoch 3/5:   0%|          | 0/5776 [00:00<?, ?it/s]


Epoch 3/5 - Average Loss: 0.0907


Epoch 4/5:   0%|          | 0/5776 [00:00<?, ?it/s]


Epoch 4/5 - Average Loss: 0.0811


Epoch 5/5:   0%|          | 0/5776 [00:00<?, ?it/s]


Epoch 5/5 - Average Loss: 0.0753

Training finished.


In [None]:
# --- Step 10: Evaluate TwinTowers Model on Test Set (AUC, ROC, MRR) ---
# Check if model was trained AND test data was prepared successfully
if 'model' in locals() and model is not None and \
   'test_users_auc_roc' in locals() and test_users_auc_roc is not None and \
   'test_sessions_for_mrr' in locals():

    print("\nEvaluating model on the test set...")

    model.eval() # Set model to evaluation mode (disables dropout, etc.)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Ensure device is set

    # Move test data for AUC/ROC to the device
    test_users_auc_roc = test_users_auc_roc.to(device)
    test_articles_auc_roc = test_articles_auc_roc.to(device)


    # --- Calculate AUC and ROC ---
    print("\nCalculating AUC and ROC...")
    test_predictions_auc_roc_list = []
    test_batch_size = 512 # Process test set in batches

    with torch.no_grad(): # No need to calculate gradients during evaluation
        for i in range(0, len(test_users_auc_roc), test_batch_size):
            user_batch = test_users_auc_roc[i:i+test_batch_size]
            article_batch = test_articles_auc_roc[i:i+test_batch_size]

            batch_preds = model(user_batch, article_batch).cpu().numpy() # Get predictions, move to CPU, convert to numpy
            test_predictions_auc_roc_list.append(batch_preds)

    # Concatenate predictions from all batches
    test_preds_auc_roc_np = np.concatenate(test_predictions_auc_roc_list).squeeze() # Ensure it's a 1D array

    # Calculate AUC score using sklearn
    try:
        if test_preds_auc_roc_np.shape == test_labels_auc_roc_np.shape:
            auc_score = roc_auc_score(test_labels_auc_roc_np, test_preds_auc_roc_np)
            print(f"Test AUC Score: {auc_score:.4f}")

            # Calculate ROC curve points
            # roc_curve returns fpr, tpr, thresholds
            fpr, tpr, thresholds = roc_curve(test_labels_auc_roc_np, test_preds_auc_roc_np)
            print("\nTest ROC Curve points (FPR, TPR, Thresholds - first 10):")
            # Print first 10 points or fewer if less than 10
            for i in range(min(10, len(fpr))):
                print(f"  FPR: {fpr[i]:.4f}, TPR: {tpr[i]:.4f}, Threshold: {thresholds[i]:.4f}")


        else:
            print(f"Shape mismatch for AUC/ROC: Predictions shape {test_preds_auc_roc_np.shape}, Labels shape {test_labels_auc_roc_np.shape}")
            print("Cannot calculate AUC/ROC.")
            print(f"Unique labels in test set: {np.unique(test_labels_auc_roc_np)}")


    except ValueError as e:
        print(f"Error calculating AUC/ROC: {e}")
        print("This might happen if only one class (clicked or not clicked) is present in the test labels in the test_pairs_for_auc_roc list.")
        print(f"Unique labels in test set (AUC/ROC): {np.unique(test_labels_auc_roc_np)}")


    # --- Calculate MRR ---
    print("\nCalculating MRR...")

    reciprocal_ranks = []
    sessions_with_clicks_count = 0 # Count sessions that have clicks and are included in MRR

    # Iterate through each impression session in the test set
    # Use tqdm for progress over sessions
    for impression_id, session_data in tqdm(test_sessions_for_mrr.items(), desc="Calculating MRR for Sessions"):
        # session_data is a list of (user_emb, article_emb, label, article_id) for this session

        if not session_data: # Skip empty sessions (shouldn't happen with current prep but good check)
            continue

        # Extract data for prediction for this session
        # Note: User embedding is the same for all items in a session, but batching expects lists
        user_embs_session = torch.tensor(np.array([x[0] for x in session_data]), dtype=torch.float32).to(device)
        article_embs_session = torch.tensor(np.array([x[1] for x in session_data]), dtype=torch.float32).to(device)
        labels_session = [x[2] for x in session_data] # Keep labels as list for easy lookup
        article_ids_session = [x[3] for x in session_data] # Keep article IDs

        # Find the clicked articles in this session
        clicked_article_ids_in_session = [aid for aid, label in zip(article_ids_session, labels_session) if label == 1]

        # We only calculate MRR for sessions where *at least one* item was clicked AND processed
        # (i.e., its article_id was found in news_df)
        if not clicked_article_ids_in_session:
            continue # Skip sessions with no clicks

        # Get predictions for all items in the session
        with torch.no_grad():
            # The model takes batches. Even for a single session, we pass all items in that session as a batch
            session_preds = model(user_embs_session, article_embs_session).cpu().numpy().squeeze()

        # Handle cases where the session has only one item (predictions might not be a 1D array)
        if session_preds.ndim == 0: # Single item session
            session_preds = np.array([session_preds]) # Make it an array

        # Rank the items in this session based on predictions
        # Get indices that would sort the predictions in descending order
        ranked_indices = np.argsort(-session_preds) # Use - for descending order

        # Get the article IDs in ranked order
        ranked_article_ids = [article_ids_session[i] for i in ranked_indices]

        # Find the rank of the first clicked item
        # The rank is 1-based. The index in the ranked list is 0-based.
        rank_of_first_clicked = -1
        for rank, aid in enumerate(ranked_article_ids):
            if aid in clicked_article_ids_in_session:
                 # If multiple items were clicked, MRR uses the rank of the *first* clicked item encountered in the ranking
                 rank_of_first_clicked = rank + 1 # Rank is 1-based
                 break # Found the first clicked item, stop searching

        # Calculate and store the reciprocal rank if a clicked item was found and had a valid rank
        if rank_of_first_clicked != -1:
            reciprocal_ranks.append(1 / rank_of_first_clicked)
            sessions_with_clicks_count += 1


    # Calculate Mean Reciprocal Rank
    if reciprocal_ranks:
        mrr_score = np.mean(reciprocal_ranks)
        print(f"\nTest MRR Score calculated over {sessions_with_clicks_count} sessions: {mrr_score:.4f}")
    else:
        print("\nNo test sessions with clicks were processed successfully to calculate MRR.")


else:
    print("\nSkipping evaluation - model or test data not available from previous steps.")

print("\nProcess completed.")


Evaluating model on the test set...

Calculating AUC and ROC...
Test AUC Score: 0.4722

Test ROC Curve points (FPR, TPR, Thresholds - first 10):
  FPR: 0.0000, TPR: 0.0000, Threshold: inf
  FPR: 0.0000, TPR: 0.0001, Threshold: 1.0000
  FPR: 0.0001, TPR: 0.0001, Threshold: 1.0000
  FPR: 0.0001, TPR: 0.0001, Threshold: 1.0000
  FPR: 0.0001, TPR: 0.0001, Threshold: 1.0000
  FPR: 0.0001, TPR: 0.0002, Threshold: 1.0000
  FPR: 0.0001, TPR: 0.0002, Threshold: 1.0000
  FPR: 0.0001, TPR: 0.0002, Threshold: 1.0000
  FPR: 0.0001, TPR: 0.0002, Threshold: 1.0000
  FPR: 0.0001, TPR: 0.0002, Threshold: 1.0000

Calculating MRR...


Calculating MRR for Sessions:   0%|          | 0/31393 [00:00<?, ?it/s]


Test MRR Score calculated over 31393 sessions: 0.2666

Process completed.


In [None]:
# @title Part 1: Generate Article Text Embeddings - Cell A.1 Install Libraries
# **IMPORTANT:** If you see NumPy compatibility errors (like "_center" import error) AFTER running this cell, go to
# Runtime -> Restart runtime, then run all cells from the beginning in order.

print("Installing libraries for text embedding generation...")
# **FIXED:** Using "numpy<2" with quotes to ensure correct installation
# Adding scikit-learn here as it's needed later and easier to install all at once
!pip install -q --upgrade transformers torch sentence-transformers pandas "numpy<2" tqdm scikit-learn

print("\nInstallation complete.")

Installing libraries for text embedding generation...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.2/865.2 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m133.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m103.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.7/897.7 kB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.0/571.0 MB[0m [31m1.9 MB/s[0m eta [36

In [None]:
# @title Part 1: Generate Article Text Embeddings - Cell A.1 Install Libraries
# IMPORTANT: If you see NumPy compatibility errors AFTER running this cell,
# go to Runtime -> Restart runtime, then run all cells from the beginning in order.
print("Installing libraries for text embedding generation...")

# Make sure torchvision and torchaudio are included!
# Using "numpy<2" might be necessary depending on other dependencies,
# but prioritize getting torch/torchvision/torchaudio compatible first.
# If you still have issues, try removing "numpy<2" or using the exact PyTorch recommended command.

# Example install command (adjust if you need specific CUDA or CPU only)
!pip install -q --upgrade transformers torch torchvision torchaudio sentence-transformers pandas "numpy<2" tqdm scikit-learn
# If the above line still causes torchvision errors, try the official PyTorch line for your setup:
# Example official PyTorch command for CUDA 11.8:
# !pip install -q --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

print("\nInstallation complete.")

Installing libraries for text embedding generation...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m109.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.19 requires torch<2.7,>=1.10, but you have torch 2.7.0 which is incompatible.[0m[31m
[0m
Installation complete.


In [None]:
# @title Part 1: Generate Article Text Embeddings - Cell A.2 Import Libraries and Load Model
print("\nImporting libraries and loading sentence embedding model...")
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# Define the sentence transformer model name
# all-MiniLM-L6-v2 is a good balance of speed and performance
embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'

try:
    # Load the model
    # device='cuda' if torch.cuda.is_available() else 'cpu' -> Use GPU if available
    sentence_model = SentenceTransformer(embedding_model_name, device='cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Sentence embedding model '{embedding_model_name}' loaded.")
    if torch.cuda.is_available():
        print("Model moved to GPU.")
    else:
         print("Running on CPU (GPU recommended for speed).")

except Exception as e:
    print(f"ERROR loading sentence embedding model: {e}")
    sentence_model = None

print("Libraries imported and model initialized (if successful).")


Importing libraries and loading sentence embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence embedding model 'sentence-transformers/all-MiniLM-L6-v2' loaded.
Model moved to GPU.
Libraries imported and model initialized (if successful).


In [None]:
# @title Part 1: Generate Article Text Embeddings - Cell A.3 Load News Data
print("\nLoading news data from news.tsv...")
# Path to your news.tsv file
news_path = '/content/news.tsv'

try:
    news_df_embedding = pd.read_csv(news_path, sep='\t', header=None, names=[
        'article_id', 'category', 'subcategory', 'title', 'abstract',
        'url', 'title_entities', 'abstract_entities'
    ])
    print(f"Loaded {len(news_df_embedding)} news articles from {news_path}.")
    print("First 5 rows:")
    display(news_df_embedding.head())

except FileNotFoundError:
    print(f"ERROR: File not found at {news_path}. Please upload news.tsv to your Colab session.")
    news_df_embedding = None
except Exception as e:
    print(f"ERROR loading news.tsv: {e}")
    news_df_embedding = None


Loading news data from news.tsv...
Loaded 51282 news articles from /content/news.tsv.
First 5 rows:


Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [None]:
# @title Part 1: Generate Article Text Embeddings - Cell A.4 Generate and Save Embeddings
if news_df_embedding is not None and sentence_model is not None:
    print("\nGenerating text embeddings for news articles...")
    print("This can take a significant amount of time depending on the dataset size and available hardware.")

    # Use a combination of title and abstract for embedding
    # Handle potential NaNs by filling with empty string
    texts_to_embed = (news_df_embedding['title'].fillna('') + ' ' + news_df_embedding['abstract'].fillna('')).tolist()

    # Generate embeddings in batches
    # Sentence-transformers handles batching internally with model.encode()
    try:
        # Use a progress bar
        embeddings = sentence_model.encode(texts_to_embed, show_progress_bar=True)
        print("\nText embedding generation complete.")
        print(f"Generated embeddings shape: {embeddings.shape}")

        # Define the path to save the embeddings
        embeddings_output_path = '/content/article_text_embeddings.npy'

        # Save embeddings to a file
        np.save(embeddings_output_path, embeddings)
        print(f"Text embeddings saved successfully to '{embeddings_output_path}'.")
        print("You can download this file if needed, but it will be loaded directly by the next script.")

        # Save a list of article_ids in the same order, needed later
        article_ids_order_path = '/content/article_ids_order.npy'
        np.save(article_ids_order_path, news_df_embedding['article_id'].values)
        print(f"Article ID order saved successfully to '{article_ids_order_path}'.")


    except Exception as e:
        print(f"ERROR during embedding generation or saving: {e}")
        embeddings = None
        news_df_embedding = None # Clear to prevent proceeding if embeddings failed
else:
    print("\nSkipping text embedding generation as news data or model was not loaded.")

print("\nPart 1 completed.")


Generating text embeddings for news articles...
This can take a significant amount of time depending on the dataset size and available hardware.


Batches:   0%|          | 0/1603 [00:00<?, ?it/s]


Text embedding generation complete.
Generated embeddings shape: (51282, 384)
Text embeddings saved successfully to '/content/article_text_embeddings.npy'.
You can download this file if needed, but it will be loaded directly by the next script.
Article ID order saved successfully to '/content/article_ids_order.npy'.

Part 1 completed.


In [None]:
# @title Part 2: Twin Tower Model - Cell B.1 Install Libraries
# Install necessary libraries again for safety in a new session
print("Installing necessary libraries for Twin Tower model...")
!pip install -q --upgrade pandas nltk tqdm transformers torch numpy<2 scikit-learn

print("\nLibrary installation step completed.")

Installing necessary libraries for Twin Tower model...
/bin/bash: line 1: 2: No such file or directory

Library installation step completed.


In [None]:
# @title Part 2: Twin Tower Model - Cell B.2 Download NLTK Data
import nltk
print("\nDownloading NLTK data...")
nltk.download('vader_lexicon', quiet=True)
print("NLTK data download process completed.")
print("\nSetup complete.")


Downloading NLTK data...
NLTK data download process completed.

Setup complete.


In [None]:
# @title Part 2: Twin Tower Model - Cell B.3 Import Libraries
print("\nImporting libraries...")
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm # For progress bar
from sklearn.metrics import roc_auc_score, roc_curve

print("Libraries imported.")


Importing libraries...
Libraries imported.


In [None]:
# @title Part 2: Twin Tower Model - Cell B.4 Load News Data, Embeddings, and Sentiment (Fixed Loading)
print("\nLoading news data, text embeddings, and sentiment data...")

# Path to original news.tsv (needed for article_id mapping)
news_path = '/content/news.tsv'
# Path to generated text embeddings (from Part 1)
embeddings_path = '/content/article_text_embeddings.npy'
# Path to news data with sentiment (from your previous sentiment script)
news_with_sentiment_path = '/content/news_with_sentiment_roberta.csv'
# Path to article ID order used for generating embeddings (from Part 1)
article_ids_order_path = '/content/article_ids_order.npy'


news_df_main = None
article_text_embeddings = None
news_sentiment_df = None
article_ids_order = None
article_id_to_embedding_idx = None # Initialize mapping


# Load original news data to get article_id order if needed (less crucial if article_ids_order.npy is reliable)
# Loading just article_id column is faster
try:
    # Read only the first column (article_id) and the fourth (title) and fifth (abstract) for robustness
    news_df_main = pd.read_csv(news_path, sep='\t', header=None, usecols=[0, 3, 4], names=['article_id', 'title', 'abstract'])
    print(f"Loaded article IDs and text columns from {news_path} ({len(news_df_main)} rows).")
except FileNotFoundError:
    print(f"ERROR: news.tsv not found at {news_path}.")
except Exception as e:
    print(f"ERROR loading news.tsv: {e}")


# Load the generated text embeddings
try:
    article_text_embeddings = np.load(embeddings_path)
    print(f"Loaded text embeddings from {embeddings_path}.")
    print(f"Text embeddings shape: {article_text_embeddings.shape}")
except FileNotFoundError:
    print(f"ERROR: Text embeddings not found at {embeddings_path}. Please run Part 1 to generate them.")
    article_text_embeddings = None
except Exception as e:
    print(f"ERROR loading text embeddings: {e}")
    article_text_embeddings = None


# Load the article ID order used for generating embeddings
try:
    # FIX: Add allow_pickle=True to load object arrays containing strings
    article_ids_order = np.load(article_ids_order_path, allow_pickle=True)
    print(f"Loaded article ID order from {article_ids_order_path}.")

    if article_text_embeddings is not None and len(article_ids_order) != len(article_text_embeddings):
         print("Warning: Mismatch between number of text embeddings and article_ids_order size.")
         article_ids_order = None # Invalidate if size mismatch

    if article_ids_order is not None:
         # Create a mapping from original article_id to its index in the embedding array
         article_id_to_embedding_idx = {aid: idx for idx, aid in enumerate(article_ids_order)}
         print(f"Created mapping from article_id to embedding index for {len(article_id_to_embedding_idx)} articles.")
         if news_df_main is not None and len(article_id_to_embedding_idx) != len(news_df_main):
              print("Warning: Mismatch between news_df size and article_id_to_embedding_idx size.")

except FileNotFoundError:
    print(f"ERROR: Article ID order not found at {article_ids_order_path}. Please run Part 1.")
    article_ids_order = None
    article_id_to_embedding_idx = None # Also ensure mapping is None
except Exception as e:
    print(f"ERROR loading article ID order: {e}")
    article_ids_order = None
    article_id_to_embedding_idx = None # Also ensure mapping is None


# Load the news sentiment data
try:
    news_sentiment_df = pd.read_csv(news_with_sentiment_path)
    print(f"Loaded sentiment data from {news_with_sentiment_path}.")
    # Ensure 'article_id' and 'sentiment' columns are present
    if 'article_id' not in news_sentiment_df.columns or 'sentiment' not in news_sentiment_df.columns:
         raise ValueError("Sentiment CSV must contain 'article_id' and 'sentiment' columns.")
    print("Sentiment data head:")
    display(news_sentiment_df.head())

except FileNotFoundError:
    print(f"ERROR: Sentiment CSV not found at {news_with_sentiment_path}. Please run the sentiment analysis script.")
    news_sentiment_df = None
except Exception as e:
    print(f"ERROR loading sentiment CSV: {e}")
    news_sentiment_df = None

# --- Validate loaded data before proceeding ---
if article_text_embeddings is None or news_sentiment_df is None or article_id_to_embedding_idx is None:
    print("\nERROR: Not all required data files for embeddings were loaded successfully. Skipping embedding preparation.")
    article_embeddings = None # Ensure this is None if data loading failed
    embedding_dim = None
else:
    print("\nAll required data files for embeddings loaded.")
    # Optional: Add sentiment column to news_df_main based on article_id for easier lookup later if needed
    # This merge is for convenience if you inspect news_df_main later, not strictly needed for embedding prep
    if news_df_main is not None:
        news_df_main = news_df_main.merge(news_sentiment_df[['article_id', 'sentiment']], on='article_id', how='left')
        news_df_main['sentiment'] = news_df_main['sentiment'].fillna('neutral') # Fill missing sentiment


Loading news data, text embeddings, and sentiment data...
Loaded article IDs and text columns from /content/news.tsv (51282 rows).
Loaded text embeddings from /content/article_text_embeddings.npy.
Text embeddings shape: (51282, 384)
Loaded article ID order from /content/article_ids_order.npy.
Created mapping from article_id to embedding index for 51282 articles.
Loaded sentiment data from /content/news_with_sentiment_roberta.csv.
Sentiment data head:


Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,sentiment
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],label_1
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",label_0
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",label_1
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",label_0
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...",label_1



All required data files for embeddings loaded.


In [None]:
# @title Part 2: Twin Tower Model - Cell B.5 Prepare Embeddings
if 'article_text_embeddings' in locals() and article_text_embeddings is not None and \
   'news_sentiment_df' in locals() and news_sentiment_df is not None and \
   'behaviors_df' in locals() and behaviors_df is not None and \
   'article_id_to_embedding_idx' in locals() and article_id_to_embedding_idx is not None: # Check all dependencies

    print("\nCombining text embeddings and sentiment for article representations...")

    # Define sentiment embedding dimension (one-hot encoding)
    sentiment_embedding_dim = 3 # For 'positive', 'neutral', 'negative'

    # Get sentiment labels from the sentiment DataFrame, aligned with the embedding order
    # We need to merge sentiment with the article IDs that match the embedding order
    article_id_sentiment_map = news_sentiment_df.set_index('article_id')['sentiment'].to_dict()

    # Get sentiment for each article ID in the order the embeddings were generated
    sentiments_in_embedding_order = [article_id_sentiment_map.get(aid, 'neutral') for aid in article_ids_order]

    # Map sentiment labels to indices for one-hot encoding
    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    # Handle potential missing or unexpected sentiment labels by mapping to 'neutral'
    sentiment_indices = [sentiment_map.get(s, sentiment_map['neutral']) for s in sentiments_in_embedding_order]
    sentiment_indices = np.array(sentiment_indices)


    # Create one-hot encoding
    sentiment_one_hot = np.eye(sentiment_embedding_dim)[sentiment_indices]

    # Get the dimension of the text embeddings
    text_embedding_dim = article_text_embeddings.shape[1]

    # Concatenate text embeddings and sentiment embeddings
    article_embeddings = np.concatenate((article_text_embeddings, sentiment_one_hot), axis=1)

    # Total article embedding dimension
    embedding_dim = article_embeddings.shape[1]

    print(f"Combined article embeddings shape: {article_embeddings.shape}")
    print(f"Final article embedding dimension: {embedding_dim}")


    # --- Fast Create User Embeddings (Average of New Article Embeddings) ---
    print("\nGenerating user embeddings based on the average of combined article embeddings...")
    user_embeddings = {}
    users_processed = 0
    skipped_users = 0
    skipped_articles_in_history = 0 # Count articles in history not found in news_df or embeddings

    # Use tqdm for progress bar
    from tqdm.auto import tqdm
    for _, row in tqdm(behaviors_df.iterrows(), total=len(behaviors_df), desc="Processing Users for Embeddings"):
        user_id = row['user_id']
        clicked_article_embedding_indices = [] # Store indices in the *embedding array* of clicked articles

        # Process click history if it's a non-empty string
        if isinstance(row['click_history'], str) and row['click_history'].strip():
            clicked_ids = row['click_history'].split() # Split IDs by space
            for aid in clicked_ids:
                # Look up the article ID to get its index in the embedding array
                if aid in article_id_to_embedding_idx:
                    embedding_idx = article_id_to_embedding_idx[aid]
                    clicked_article_embedding_indices.append(embedding_idx)
                else:
                    skipped_articles_in_history += 1 # Count articles not found in mapping

        # Calculate user embedding if they clicked known articles with embeddings
        if clicked_article_embedding_indices:
            # Get the *combined* embeddings for the clicked articles using their indices
            embeddings_for_user = article_embeddings[clicked_article_embedding_indices]
            # Calculate the mean embedding (average pooling)
            user_embedding = np.mean(embeddings_for_user, axis=0)
            users_processed += 1
        else:
            # Cold-start: User has no known click history or clicked unknown articles/embeddings
            # Use a zero vector for cold-start users
            user_embedding = np.zeros(embedding_dim, dtype=np.float32)
            skipped_users +=1 # Count users with no valid history

        user_embeddings[user_id] = user_embedding

    print(f"\nCreated embeddings for {users_processed} users with click history.")
    print(f"Generated zero embeddings for {skipped_users} cold-start users.")
    if skipped_articles_in_history > 0:
        print(f"Skipped {skipped_articles_in_history} article clicks in history (article_id not found in embedding mapping).")


    # Check shape of a sample embedding
    if user_embeddings:
        sample_user_id = list(user_embeddings.keys())[0]
        print(f"Sample user ('{sample_user_id}') embedding shape: {user_embeddings[sample_user_id].shape}")
    else:
        print("No user embeddings were created.")

else:
    print("\nSkipping embedding preparation as required data was not loaded.")
    article_embeddings = None # Ensure this is None
    embedding_dim = None
    user_embeddings = {} # Ensure this is empty


Skipping embedding preparation as required data was not loaded.


In [None]:
# @title Part 2: Twin Tower Model - Cell B.6 Define TwinTowers Model
# --- Step 7: Define a simple Twin Tower (Siamese) Model ---
print("\nDefining TwinTowers model...")
# Make sure embedding_dim was calculated in the previous cell
if 'embedding_dim' in locals() and embedding_dim is not None:
    class TwinTowers(nn.Module):
        def __init__(self, embedding_dim, hidden_dim=128, output_dim=64):
            super(TwinTowers, self).__init__()
            self.embedding_dim = embedding_dim

            # Article Tower
            self.article_tower = nn.Sequential(
                nn.Linear(embedding_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, output_dim)
                # Consider adding BatchNorm or Dropout here if needed
                # nn.BatchNorm1d(output_dim)
            )

            # User Tower
            self.user_tower = nn.Sequential(
                nn.Linear(embedding_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, output_dim)
                # nn.BatchNorm1d(output_dim)
            )

        def forward(self, user_emb, article_emb):
            # Ensure input tensors are FloatTensors
            user_emb = user_emb.float()
            article_emb = article_emb.float()

            # Pass through respective towers
            user_latent = self.user_tower(user_emb)
            article_latent = self.article_tower(article_emb)

            # Calculate dot product for similarity (cosine similarity is also common)
            # Normalize vectors for cosine similarity (optional, helps stability)
            # user_latent = F.normalize(user_latent, p=2, dim=1)
            # article_latent = F.normalize(article_latent, p=2, dim=1)
            # Element-wise product and sum
            dot_product = torch.sum(user_latent * article_latent, dim=1)

            # Output a probability using sigmoid
            return torch.sigmoid(dot_product)

    # Quick check: Instantiate model with updated embedding_dim and print structure
    temp_model = TwinTowers(embedding_dim=embedding_dim)
    print(temp_model)
    print(f"\nTwinTowers model defined with input dimension: {embedding_dim}.")

else:
    print("\nTwinTowers model definition skipped as embedding_dim is not defined or is None.")
    # Ensure model variable is None if definition failed
    model = None


Defining TwinTowers model...

TwinTowers model definition skipped as embedding_dim is not defined or is None.


In [None]:
# @title Part 2: Twin Tower Model - Cell B.7 Prepare Training and Test Data
# --- Step 8: Prepare Training and Test Data ---
# Restructure test data preparation to enable MRR calculation
if 'behaviors_df' is not None and 'user_embeddings' in locals() and user_embeddings and \
   'article_embeddings' in locals() and article_embeddings is not None and \
   'article_id_to_embedding_idx' in locals() and article_id_to_embedding_idx and \
   'embedding_dim' in locals() and embedding_dim is not None: # Check all dependencies

    print("\nPreparing training and test data...")

    train_pairs = []
    test_pairs_for_auc_roc = [] # Simple list for flattened AUC/ROC calculation
    # For MRR, we need to group by impression session
    test_sessions_for_mrr = {} # Dictionary: {impression_id: [(user_emb, article_emb, label, article_id), ...]}

    positive_samples_train = 0
    negative_samples_train = 0
    positive_samples_test = 0
    negative_samples_test = 0


    max_samples_per_user_train = 3 # Limit positive/negative samples per interaction for training
    # For test, we need all impressions in a session for MRR

    # All article indices list (potential candidates for random negative sampling if needed)
    # These are indices into the combined article_embeddings array
    all_article_embedding_indices = list(range(len(article_embeddings)))

    # Determine the split point for train/test sessions
    # Split behaviors_df to ensure sessions don't overlap between train and test
    # This is better than splitting random pairs after generation
    print("Splitting behaviors into train and test sets by session...")
    train_behaviors_df = behaviors_df.sample(frac=0.8, random_state=42).reset_index(drop=True)
    test_behaviors_df = behaviors_df.drop(train_behaviors_df.index).reset_index(drop=True)

    print(f"Split behaviors into {len(train_behaviors_df)} train and {len(test_behaviors_df)} test sessions.")

    # Process Training Sessions
    print("Generating training pairs...")
    for _, row in tqdm(train_behaviors_df.iterrows(), total=len(train_behaviors_df), desc="Generating Train Pairs"):
        user_id = row['user_id']
        # Skip if user embedding doesn't exist (e.g., user only appeared once with unknown clicks)
        if user_id not in user_embeddings:
            continue
        user_emb = user_embeddings[user_id] # Get the pre-computed user embedding

        clicked_embedding_indices = []
        if isinstance(row['click_history'], str) and row['click_history'].strip():
            clicked_ids = row['click_history'].split() # Split IDs by space
            for aid in clicked_ids:
                # Use the mapping to get the index in the embedding array
                if aid in article_id_to_embedding_idx:
                    clicked_embedding_indices.append(article_id_to_embedding_idx[aid])

        # --- Positive Samples (for training) ---
        # Sample up to N positive examples from this interaction's history
        pos_embedding_indices_to_add = clicked_embedding_indices[:max_samples_per_user_train]
        for pos_idx in pos_embedding_indices_to_add:
            # Append tuple: (user_embedding, article_embedding, label=1)
            train_pairs.append((user_emb, article_embeddings[pos_idx], 1))
            positive_samples_train += 1

        # --- Negative Samples (for training) ---
        # Prioritize impressed but not clicked items from the current session
        impressed_embedding_indices = []
        if isinstance(row['impressions'], str) and row['impressions'].strip():
             # Impressions format: 'article_id-label article_id-label ...'
            for impression_pair in row['impressions'].split():
                aid_label = impression_pair.split('-')
                if len(aid_label) == 2:
                    aid, label_str = aid_label
                    # Only consider explicitly non-clicked items as negative impressions
                    if label_str == '0':
                         # Use the mapping to get the index in the embedding array
                         if aid in article_id_to_embedding_idx:
                            impressed_embedding_indices.append(article_id_to_embedding_idx[aid])

        # Use negatives from impressions first (up to max_samples_per_user_train)
        neg_embedding_indices_from_impressions = impressed_embedding_indices[:max_samples_per_user_train]

        # If not enough negatives from impressions, sample randomly (excluding clicked articles in this session)
        num_negatives_needed = max_samples_per_user_train - len(neg_embedding_indices_from_impressions)
        random_neg_embedding_indices = []
        if num_negatives_needed > 0:
             # Pool of potential random negatives: all article embedding indices MINUS clicked indices in this session
             clicked_embedding_indices_set = set(clicked_embedding_indices)
             negative_pool_embedding_indices = list(set(all_article_embedding_indices) - clicked_embedding_indices_set)

             if negative_pool_embedding_indices: # Ensure pool is not empty
                # Sample without replacement
                random_neg_embedding_indices = np.random.choice(
                    negative_pool_embedding_indices,
                    min(num_negatives_needed, len(negative_pool_embedding_indices)), # Don't sample more than available
                    replace=False
                ).tolist() # Convert to list


        neg_embedding_indices_to_add = neg_embedding_indices_from_impressions + random_neg_embedding_indices
        # Ensure we don't add more than max_samples_per_user_train from the combination
        neg_embedding_indices_to_add = neg_embedding_indices_to_add[:max_samples_per_user_train]


        # Add the selected negative samples to train_pairs
        for neg_idx in neg_embedding_indices_to_add:
            train_pairs.append((user_emb, article_embeddings[neg_idx], 0))
            negative_samples_train += 1

    # Shuffle training data
    np.random.shuffle(train_pairs)

    # Process Test Sessions (structured for MRR)
    print("\nGenerating test pairs and sessions for MRR evaluation...")
    skipped_articles_in_impressions = 0 # Count articles in impressions not found in news_df or embeddings

    for _, row in tqdm(test_behaviors_df.iterrows(), total=len(test_behaviors_df), desc="Generating Test Data"):
        user_id = row['user_id']
        impression_id = row['impression_id']
        impressions_str = row['impressions'] # String of 'article_id-label' pairs

        # Skip if user embedding doesn't exist
        if user_id not in user_embeddings:
            continue
        user_emb = user_embeddings[user_id]

        # Process impressions for this session
        if isinstance(impressions_str, str) and impressions_str.strip():
            session_pairs_for_mrr = [] # List of (user_emb, article_emb, label, article_id) for this session

            for impression_pair in impressions_str.split():
                aid_label = impression_pair.split('-')
                if len(aid_label) == 2:
                    aid, label_str = aid_label
                    label = int(label_str) # Convert label to integer (0 or 1)

                    # Use the mapping to get the index in the embedding array
                    if aid in article_id_to_embedding_idx:
                        article_embedding_idx = article_id_to_embedding_idx[aid]
                        article_emb = article_embeddings[article_embedding_idx]

                        # Add to the list for MRR evaluation
                        session_pairs_for_mrr.append((user_emb, article_emb, label, aid))

                        # Also count for AUC/ROC summary
                        if label == 1:
                            positive_samples_test += 1
                        else:
                             negative_samples_test += 1

                    else:
                        skipped_articles_in_impressions += 1

            # Only add the session if it contains at least one impression we could process
            if session_pairs_for_mrr:
                 test_sessions_for_mrr[impression_id] = session_pairs_for_mrr


    # Flatten test_sessions_for_mrr into test_pairs_for_auc_roc
    for impression_id, session_data in test_sessions_for_mrr.items():
        for user_emb, article_emb, label, article_id in session_data:
             test_pairs_for_auc_roc.append((user_emb, article_emb, label))


    if not train_pairs and not test_pairs_for_auc_roc:
        print("\nERROR: No training or test data could be generated. Check input files and logic.")
        train_loader = None
        test_users_auc_roc = None
        test_articles_auc_roc = None
        test_labels_auc_roc_np = None
        test_sessions_for_mrr = {} # Ensure it's empty

    else:
        print(f"\nGenerated {len(train_pairs)} training samples ({positive_samples_train} positive, {negative_samples_train} negative).")
        print(f"Generated {len(test_pairs_for_auc_roc)} samples for AUC/ROC ({positive_samples_test} positive, {negative_samples_test} negative).")
        print(f"Organized test data into {len(test_sessions_for_mrr)} sessions for MRR.")
        if skipped_articles_in_impressions > 0:
            print(f"Skipped {skipped_articles_in_impressions} article impressions (article_id not found in embedding mapping).")

        # --- Note on Training/Evaluation Mismatch ---
        print("\nNote on Training Objective vs. MRR Evaluation:")
        print("The model is trained using a binary classification loss (predicting if a single user-article pair is a click).")
        print("MRR evaluates ranking performance within a full impression session.")
        print("While the model learns to score relevant items higher, the training is not directly optimized for ranking the *entire list* seen by the user in a session.")
        print("This is a common approach for implicit feedback, but a dedicated ranking loss (e.g., BPR, ListNet) would align training more closely with the MRR evaluation.")
        print("For this script, we keep the binary classification loss as requested, but measure ranking performance with MRR.")
        print("-" * 20)

        # --- Create PyTorch Tensors and DataLoaders for Training ---
        print("\nCreating PyTorch tensors and DataLoader for training...")
        train_users = torch.tensor(np.array([x[0] for x in train_pairs]), dtype=torch.float32)
        train_articles = torch.tensor(np.array([x[1] for x in train_pairs]), dtype=torch.float32)
        train_labels = torch.tensor(np.array([x[2] for x in train_pairs]), dtype=torch.float32)

        train_dataset = TensorDataset(train_users, train_articles, train_labels)
        train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

        print(f"Prepared DataLoader for training with {len(train_loader)} batches.")
        print(f"Train users tensor shape: {train_users.shape}")
        print(f"Train articles tensor shape: {train_articles.shape}")
        print(f"Train labels tensor shape: {train_labels.shape}")


        # --- Prepare PyTorch Tensors for Test (AUC/ROC) ---
        print("\nCreating PyTorch tensors for AUC/ROC evaluation...")
        test_users_auc_roc = torch.tensor(np.array([x[0] for x in test_pairs_for_auc_roc]), dtype=torch.float32)
        test_articles_auc_roc = torch.tensor(np.array([x[1] for x in test_pairs_for_auc_roc]), dtype=torch.float32)
        test_labels_auc_roc_np = np.array([x[2] for x in test_pairs_for_auc_roc]) # Keep as numpy for sklearn

        print(f"Prepared tensors for AUC/ROC evaluation with {len(test_users_auc_roc)} samples.")
        print(f"Test users tensor shape (AUC/ROC): {test_users_auc_roc.shape}")
        print(f"Test articles tensor shape (AUC/ROC): {test_articles_auc_roc.shape}")
        print(f"Test labels numpy shape (AUC/ROC): {test_labels_auc_roc_np.shape}")


else:
    print("\nSkipping data preparation as required DataFrames/embeddings were not loaded.")
    train_loader = None
    test_users_auc_roc = None
    test_articles_auc_roc = None
    test_labels_auc_roc_np = None
    test_sessions_for_mrr = {} # Ensure it's empty


Skipping data preparation as required DataFrames/embeddings were not loaded.


  if 'behaviors_df' is not None and 'user_embeddings' in locals() and user_embeddings and \


In [None]:
# @title Part 2: Twin Tower Model - Cell B.8 Train Model
# --- Step 9: Train the TwinTowers model ---
# Check if train_loader and model definition were successful
if 'train_loader' in locals() and train_loader is not None and \
   'TwinTowers' in locals() and 'embedding_dim' in locals() and embedding_dim is not None:

    print("\nStarting model training...")

    # Set device (GPU if available, otherwise CPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Initialize model, optimizer, and loss
    model = TwinTowers(embedding_dim=embedding_dim).to(device) # Use the calculated embedding_dim
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001) # Adam optimizer
    criterion = nn.BCELoss() # Binary Cross Entropy Loss for click prediction

    # --- Hyperparameters to Tune ---
    num_epochs = 10 # Increased epochs for potentially better training
    learning_rate = 0.001 # Can experiment with different learning rates (e.g., 0.0005, 0.002)
    # batch_size is set in DataLoader (Cell B.7)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # Use the variable learning rate


    print(f"Training for {num_epochs} epochs with learning rate {learning_rate}...")

    for epoch in range(num_epochs):
        model.train() # Set model to training mode
        total_loss = 0.0

        # Use tqdm for epoch progress
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True)

        for user_batch, article_batch, label_batch in progress_bar:
            # Move batch to the correct device
            user_batch = user_batch.to(device)
            article_batch = article_batch.to(device)
            label_batch = label_batch.to(device) # Labels should be float32 for BCELoss

            # --- Forward Pass ---
            predictions = model(user_batch, article_batch)
            # Squeeze predictions to match label shape [batch_size] vs [batch_size, 1] if needed
            # BCELoss expects predictions and labels to have the same shape
            predictions = predictions.squeeze(-1) if predictions.ndim > 1 else predictions


            # --- Calculate Loss ---
            loss = criterion(predictions, label_batch)

            # --- Backward Pass & Optimize ---
            optimizer.zero_grad() # Clear previous gradients
            loss.backward() # Calculate gradients
            optimizer.step() # Update model weights

            total_loss += loss.item()

            # Update progress bar description with current loss
            progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

        # Print average loss for the epoch
        avg_loss = total_loss / len(train_loader)
        print(f"\nEpoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")

    print("\nTraining finished.")

else:
    print("\nSkipping training - train_loader or model not prepared/defined.")
    model = None # Ensure model is None if training was skipped


Skipping training - train_loader or model not prepared/defined.


In [None]:
# @title Part 2: Twin Tower Model - Cell B.9 Evaluate Model
# --- Step 10: Evaluate TwinTowers Model on Test Set (AUC, ROC, MRR) ---
# Check if model was trained AND test data was prepared successfully
if 'model' in locals() and model is not None and \
   'test_users_auc_roc' in locals() and test_users_auc_roc is not None and \
   'test_sessions_for_mrr' in locals():

    print("\nEvaluating model on the test set...")

    model.eval() # Set model to evaluation mode (disables dropout, etc.)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Ensure device is set

    # Move test data for AUC/ROC to the device
    test_users_auc_roc = test_users_auc_roc.to(device)
    test_articles_auc_roc = test_articles_auc_roc.to(device)


    # --- Calculate AUC and ROC ---
    print("\nCalculating AUC and ROC...")
    test_predictions_auc_roc_list = []
    test_batch_size_eval = 512 # Process test set in batches for evaluation

    with torch.no_grad(): # No need to calculate gradients during evaluation
        for i in range(0, len(test_users_auc_roc), test_batch_size_eval):
            user_batch = test_users_auc_roc[i:i+test_batch_size_eval]
            article_batch = test_articles_auc_roc[i:i+test_batch_size_eval]

            batch_preds = model(user_batch, article_batch).cpu().numpy() # Get predictions, move to CPU, convert to numpy
            test_predictions_auc_roc_list.append(batch_preds)

    # Concatenate predictions from all batches
    test_preds_auc_roc_np = np.concatenate(test_predictions_auc_roc_list).squeeze() # Ensure it's a 1D array

    # Calculate AUC score using sklearn
    try:
        if test_preds_auc_roc_np.shape == test_labels_auc_roc_np.shape:
            auc_score = roc_auc_score(test_labels_auc_roc_np, test_preds_auc_roc_np)
            print(f"Test AUC Score: {auc_score:.4f}")

            # Calculate ROC curve points
            # roc_curve returns fpr, tpr, thresholds
            fpr, tpr, thresholds = roc_curve(test_labels_auc_roc_np, test_preds_auc_roc_np)
            print("\nTest ROC Curve points (FPR, TPR, Thresholds - first 10):")
            # Print first 10 points or fewer if less than 10
            for i in range(min(10, len(fpr))):
                print(f"  FPR: {fpr[i]:.4f}, TPR: {tpr[i]:.4f}, Threshold: {thresholds[i]:.4f}")


        else:
            print(f"Shape mismatch for AUC/ROC: Predictions shape {test_preds_auc_roc_np.shape}, Labels shape {test_labels_auc_roc_np.shape}")
            print("Cannot calculate AUC/ROC.")
            print(f"Unique labels in test set (AUC/ROC): {np.unique(test_labels_auc_roc_np)}")


    except ValueError as e:
        print(f"Error calculating AUC/ROC: {e}")
        print("This might happen if only one class (clicked or not clicked) is present in the test labels in the test_pairs_for_auc_roc list.")
        print(f"Unique labels in test set (AUC/ROC): {np.unique(test_labels_auc_roc_np)}")


    # --- Calculate MRR ---
    print("\nCalculating MRR...")

    reciprocal_ranks = []
    sessions_with_clicks_count = 0 # Count sessions that have clicks and are included in MRR

    # Iterate through each impression session in the test set
    # Use tqdm for progress over sessions
    for impression_id, session_data in tqdm(test_sessions_for_mrr.items(), desc="Calculating MRR for Sessions"):
        # session_data is a list of (user_emb, article_emb, label, article_id) for this session

        if not session_data: # Skip empty sessions (shouldn't happen with current prep but good check)
            continue

        # Extract data for prediction for this session
        # Note: User embedding is the same for all items in a session, but batching expects lists
        user_embs_session = torch.tensor(np.array([x[0] for x in session_data]), dtype=torch.float32).to(device)
        article_embs_session = torch.tensor(np.array([x[1] for x in session_data]), dtype=torch.float32).to(device)
        labels_session = [x[2] for x in session_data] # Keep labels as list for easy lookup
        article_ids_session = [x[3] for x in session_data] # Keep article IDs

        # Find the clicked articles in this session
        clicked_article_ids_in_session = [aid for aid, label in zip(article_ids_session, labels_session) if label == 1]

        # We only calculate MRR for sessions where *at least one* item was clicked AND processed
        # (i.e., its article_id was found in news_df)
        if not clicked_article_ids_in_session:
            continue # Skip sessions with no clicks

        # Get predictions for all items in the session
        with torch.no_grad():
            # The model takes batches. Even for a single session, we pass all items in that session as a batch
            session_preds = model(user_embs_session, article_embs_session).cpu().numpy().squeeze()

        # Handle cases where the session has only one item (predictions might not be a 1D array)
        if session_preds.ndim == 0: # Single item session
            session_preds = np.array([session_preds]) # Make it an array

        # Rank the items in this session based on predictions
        # Get indices that would sort the predictions in descending order
        ranked_indices = np.argsort(-session_preds) # Use - for descending order

        # Get the article IDs in ranked order
        ranked_article_ids = [article_ids_session[i] for i in ranked_indices]

        # Find the rank of the first clicked item
        # The rank is 1-based. The index in the ranked list is 0-based.
        rank_of_first_clicked = -1
        for rank, aid in enumerate(ranked_article_ids):
            if aid in clicked_article_ids_in_session:
                 # If multiple items were clicked, MRR uses the rank of the *first* clicked item encountered in the ranking
                 rank_of_first_clicked = rank + 1 # Rank is 1-based
                 break # Found the first clicked item, stop searching

        # Calculate and store the reciprocal rank if a clicked item was found and had a valid rank
        if rank_of_first_clicked != -1:
            reciprocal_ranks.append(1 / rank_of_first_clicked)
            sessions_with_clicks_count += 1


    # Calculate Mean Reciprocal Rank
    if reciprocal_ranks:
        mrr_score = np.mean(reciprocal_ranks)
        print(f"\nTest MRR Score calculated over {sessions_with_clicks_count} sessions: {mrr_score:.4f}")
    else:
        print("\nNo test sessions with clicks were processed successfully to calculate MRR.")


else:
    print("\nSkipping evaluation - model or test data not available from previous steps.")

print("\nProcess completed.")


Skipping evaluation - model or test data not available from previous steps.

Process completed.


# final code

In [None]:
# Cell 1: Install Required Libraries
# @title Install Required Libraries
print("Installing libraries for text embedding generation and model training...")

# Install all required libraries in one command to minimize installation calls
!pip install -q --upgrade transformers torch torchvision torchaudio sentence-transformers pandas numpy scikit-learn nltk tqdm

print("\nInstallation complete.")




Installing libraries for text embedding generation and model training...

Installation complete.


In [None]:
# Cell 2: Import Libraries and Set Up Environment
# @title Import Libraries and Set Up Environment
print("\nImporting libraries and setting up the environment...")
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score, roc_curve
import nltk
import os
import random

# Set seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Download NLTK data (optional, only if you need sentiment analysis)
print("Downloading NLTK data...")
nltk.download('vader_lexicon', quiet=True)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

print("Libraries imported and environment set up.")


Importing libraries and setting up the environment...
Downloading NLTK data...
Using device: cuda
Libraries imported and environment set up.


In [None]:
# Cell 3: Load News Data
# @title Load News Data
print("\nLoading news data...")
# Path to your news.tsv file
news_path = '/content/news.tsv'

try:
    news_df = pd.read_csv(news_path, sep='\t', header=None, names=[
        'article_id', 'category', 'subcategory', 'title', 'abstract',
        'url', 'title_entities', 'abstract_entities'
    ])
    print(f"Loaded {len(news_df)} news articles from {news_path}.")
    print("First 5 rows:")
    display(news_df.head())
except FileNotFoundError:
    print(f"ERROR: File not found at {news_path}. Please upload news.tsv to your Colab session.")
    # Create a minimal example dataframe for testing
    print("Creating a minimal example dataset for testing...")
    news_df = pd.DataFrame({
        'article_id': [f'N{i}' for i in range(1, 101)],
        'category': ['news'] * 100,
        'subcategory': ['politics'] * 50 + ['technology'] * 50,
        'title': [f'Title {i}' for i in range(1, 101)],
        'abstract': [f'Abstract {i}' for i in range(1, 101)],
        'url': ['http://example.com'] * 100,
        'title_entities': [''] * 100,
        'abstract_entities': [''] * 100
    })



Loading news data...
Loaded 51282 news articles from /content/news.tsv.
First 5 rows:


Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [None]:
# Cell 4: Load Behaviors Data
# @title Load Behaviors Data
print("\nLoading behaviors data...")
# Path to your behaviors.tsv file
behaviors_path = '/content/behaviors.tsv'

try:
    behaviors_df = pd.read_csv(behaviors_path, sep='\t', header=None, names=[
        'impression_id', 'user_id', 'time', 'click_history', 'impressions'
    ])
    print(f"Loaded {len(behaviors_df)} behavior records from {behaviors_path}.")
    print("First 5 rows:")
    display(behaviors_df.head())
except FileNotFoundError:
    print(f"ERROR: File not found at {behaviors_path}. Creating minimal example behaviors dataset...")
    # Create a minimal example dataframe for testing
    user_ids = [f'U{i}' for i in range(1, 51)]
    article_ids = [f'N{i}' for i in range(1, 101)]

    behaviors_df = pd.DataFrame({
        'impression_id': [i for i in range(1, 501)],
        'user_id': np.random.choice(user_ids, 500),
        'time': [''] * 500,
        'click_history': [' '.join(np.random.choice(article_ids, size=np.random.randint(0, 6))) for _ in range(500)],
        'impressions': [' '.join([f'{aid}-{np.random.randint(0, 2)}' for aid in
                                 np.random.choice(article_ids, size=np.random.randint(5, 11), replace=False)])
                        for _ in range(500)]
    })


Loading behaviors data...
Loaded 156965 behavior records from /content/behaviors.tsv.
First 5 rows:


Unnamed: 0,impression_id,user_id,time,click_history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [None]:
# Cell 5: Load or Generate Sentiment Data
# @title Load or Generate Sentiment Data
print("\nLoading sentiment data...")
# Path to news with sentiment data
news_with_sentiment_path = '/content/news_with_sentiment_roberta.csv'

try:
    news_sentiment_df = pd.read_csv(news_with_sentiment_path)
    print(f"Loaded sentiment data from {news_with_sentiment_path}.")
    print("First 5 rows:")
    display(news_sentiment_df.head())
except FileNotFoundError:
    print(f"ERROR: File not found at {news_with_sentiment_path}. Generating simple sentiment values...")
    # Create sentiment data from the news dataset
    news_sentiment_df = pd.DataFrame({
        'article_id': news_df['article_id'],
        'sentiment': np.random.choice(['negative', 'neutral', 'positive'], size=len(news_df),
                                    p=[0.2, 0.5, 0.3])  # Realistic distribution
    })


Loading sentiment data...
Loaded sentiment data from /content/news_with_sentiment_roberta.csv.
First 5 rows:


Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,sentiment
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],label_1
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",label_0
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",label_1
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",label_0
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...",label_1


In [None]:
# Cell 6: Generate Text Embeddings
# @title Generate Text Embeddings
print("\nGenerating text embeddings for news articles...")

# Define the sentence transformer model name
embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'

try:
    # Load the model
    sentence_model = SentenceTransformer(embedding_model_name, device=device)
    print(f"Sentence embedding model '{embedding_model_name}' loaded successfully.")

    # Use a combination of title and abstract for embedding
    texts_to_embed = (news_df['title'].fillna('') + ' ' + news_df['abstract'].fillna('')).tolist()

    # Generate embeddings with a progress bar
    print("Generating embeddings...")
    article_text_embeddings = sentence_model.encode(texts_to_embed, show_progress_bar=True)

    # Save the article IDs order
    article_ids_order = news_df['article_id'].values

    # Create a mapping from article_id to its index in the embedding array
    article_id_to_embedding_idx = {aid: idx for idx, aid in enumerate(article_ids_order)}

    print(f"Generated embeddings shape: {article_text_embeddings.shape}")

    # Optionally save the embeddings to disk
    embeddings_output_path = '/content/article_text_embeddings.npy'
    article_ids_order_path = '/content/article_ids_order.npy'

    np.save(embeddings_output_path, article_text_embeddings)
    np.save(article_ids_order_path, article_ids_order)
    print(f"Text embeddings saved to '{embeddings_output_path}'.")
    print(f"Article ID order saved to '{article_ids_order_path}'.")

except Exception as e:
    print(f"ERROR during embedding generation: {e}")

    # Create minimal embeddings for testing if needed
    print("Creating minimal embeddings for testing...")
    article_text_embeddings = np.random.randn(len(news_df), 384)  # 384 is the dimension for all-MiniLM-L6-v2
    article_ids_order = news_df['article_id'].values
    article_id_to_embedding_idx = {aid: idx for idx, aid in enumerate(article_ids_order)}



Generating text embeddings for news articles...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Sentence embedding model 'sentence-transformers/all-MiniLM-L6-v2' loaded successfully.
Generating embeddings...


Batches:   0%|          | 0/1603 [00:00<?, ?it/s]

Generated embeddings shape: (51282, 384)
Text embeddings saved to '/content/article_text_embeddings.npy'.
Article ID order saved to '/content/article_ids_order.npy'.


In [None]:
# Cell 7: Prepare Combined Embeddings
# @title Prepare Combined Embeddings
print("\nCombining text embeddings and sentiment for article representations...")

# Define sentiment embedding dimension (one-hot encoding)
sentiment_embedding_dim = 3  # For 'positive', 'neutral', 'negative'

# Get sentiment labels from the sentiment DataFrame, aligned with the embedding order
article_id_sentiment_map = news_sentiment_df.set_index('article_id')['sentiment'].to_dict()

# Get sentiment for each article ID in the order the embeddings were generated
sentiments_in_embedding_order = [article_id_sentiment_map.get(aid, 'neutral') for aid in article_ids_order]

# Map sentiment labels to indices for one-hot encoding
sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
sentiment_indices = [sentiment_map.get(s, sentiment_map['neutral']) for s in sentiments_in_embedding_order]
sentiment_indices = np.array(sentiment_indices)

# Create one-hot encoding
sentiment_one_hot = np.eye(sentiment_embedding_dim)[sentiment_indices]

# Get the dimension of the text embeddings
text_embedding_dim = article_text_embeddings.shape[1]

# Concatenate text embeddings and sentiment embeddings
article_embeddings = np.concatenate((article_text_embeddings, sentiment_one_hot), axis=1)

# Total article embedding dimension
embedding_dim = article_embeddings.shape[1]

print(f"Combined article embeddings shape: {article_embeddings.shape}")
print(f"Final article embedding dimension: {embedding_dim}")


Combining text embeddings and sentiment for article representations...
Combined article embeddings shape: (51282, 387)
Final article embedding dimension: 387


In [None]:
# Cell 8: Generate User Embeddings
# @title Generate User Embeddings
print("\nGenerating user embeddings based on the average of combined article embeddings...")

user_embeddings = {}
users_processed = 0
skipped_users = 0
skipped_articles_in_history = 0

for _, row in tqdm(behaviors_df.iterrows(), total=len(behaviors_df), desc="Processing Users"):
    user_id = row['user_id']
    clicked_article_embedding_indices = []

    # Process click history if it's a non-empty string
    if isinstance(row['click_history'], str) and row['click_history'].strip():
        clicked_ids = row['click_history'].split()
        for aid in clicked_ids:
            if aid in article_id_to_embedding_idx:
                embedding_idx = article_id_to_embedding_idx[aid]
                clicked_article_embedding_indices.append(embedding_idx)
            else:
                skipped_articles_in_history += 1

    # Calculate user embedding if they clicked known articles with embeddings
    if clicked_article_embedding_indices:
        # Get the combined embeddings for the clicked articles using their indices
        embeddings_for_user = article_embeddings[clicked_article_embedding_indices]
        # Calculate the mean embedding (average pooling)
        user_embedding = np.mean(embeddings_for_user, axis=0)
        users_processed += 1
    else:
        # Cold-start: User has no known click history
        # Use a zero vector for cold-start users
        user_embedding = np.zeros(embedding_dim, dtype=np.float32)
        skipped_users += 1

    user_embeddings[user_id] = user_embedding

print(f"\nCreated embeddings for {users_processed} users with click history.")
print(f"Generated zero embeddings for {skipped_users} cold-start users.")
if skipped_articles_in_history > 0:
    print(f"Skipped {skipped_articles_in_history} article clicks in history (article_id not found in embedding mapping).")

# Check shape of a sample embedding
if user_embeddings:
    sample_user_id = list(user_embeddings.keys())[0]
    print(f"Sample user ('{sample_user_id}') embedding shape: {user_embeddings[sample_user_id].shape}")



Generating user embeddings based on the average of combined article embeddings...


Processing Users:   0%|          | 0/156965 [00:00<?, ?it/s]


Created embeddings for 153727 users with click history.
Generated zero embeddings for 3238 cold-start users.
Sample user ('U13740') embedding shape: (387,)


In [None]:
# Cell 9: Define TwinTowers Model
# @title Define TwinTowers Model
class TwinTowers(nn.Module):
    def __init__(self, embedding_dim, hidden_dim=128, output_dim=64):
        super(TwinTowers, self).__init__()
        self.embedding_dim = embedding_dim

        # Article Tower
        self.article_tower = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

        # User Tower
        self.user_tower = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, user_emb, article_emb):
        # Ensure input tensors are FloatTensors
        user_emb = user_emb.float()
        article_emb = article_emb.float()

        # Pass through respective towers
        user_latent = self.user_tower(user_emb)
        article_latent = self.article_tower(article_emb)

        # Calculate dot product for similarity
        dot_product = torch.sum(user_latent * article_latent, dim=1)

        # Output a probability using sigmoid
        return torch.sigmoid(dot_product)

print("\nTwinTowers model defined with input dimension:", embedding_dim)



TwinTowers model defined with input dimension: 387


In [None]:
# Cell 10: Prepare Training and Test Data
# @title Prepare Training and Test Data
print("\nPreparing training and test data...")

train_pairs = []
test_pairs_for_auc_roc = []
# For MRR, we need to group by impression session
test_sessions_for_mrr = {}

positive_samples_train = 0
negative_samples_train = 0
positive_samples_test = 0
negative_samples_test = 0

max_samples_per_user_train = 3  # Limit positive/negative samples per interaction

# All article indices list (potential candidates for random negative sampling)
all_article_embedding_indices = list(range(len(article_embeddings)))

# Split behaviors_df to ensure sessions don't overlap between train and test
print("Splitting behaviors into train and test sets by session...")
train_behaviors_df = behaviors_df.sample(frac=0.8, random_state=SEED).reset_index(drop=True)
test_behaviors_df = behaviors_df.drop(train_behaviors_df.index).reset_index(drop=True)

print(f"Split behaviors into {len(train_behaviors_df)} train and {len(test_behaviors_df)} test sessions.")

# Process Training Sessions
print("Generating training pairs...")
for _, row in tqdm(train_behaviors_df.iterrows(), total=len(train_behaviors_df), desc="Generating Train Pairs"):
    user_id = row['user_id']
    # Skip if user embedding doesn't exist
    if user_id not in user_embeddings:
        continue
    user_emb = user_embeddings[user_id]

    clicked_embedding_indices = []
    if isinstance(row['click_history'], str) and row['click_history'].strip():
        clicked_ids = row['click_history'].split()
        for aid in clicked_ids:
            if aid in article_id_to_embedding_idx:
                clicked_embedding_indices.append(article_id_to_embedding_idx[aid])

    # Positive Samples (for training)
    pos_embedding_indices_to_add = clicked_embedding_indices[:max_samples_per_user_train]
    for pos_idx in pos_embedding_indices_to_add:
        train_pairs.append((user_emb, article_embeddings[pos_idx], 1))
        positive_samples_train += 1

    # Negative Samples (for training)
    impressed_embedding_indices = []
    if isinstance(row['impressions'], str) and row['impressions'].strip():
        for impression_pair in row['impressions'].split():
            aid_label = impression_pair.split('-')
            if len(aid_label) == 2:
                aid, label_str = aid_label
                if label_str == '0':  # Non-clicked items
                    if aid in article_id_to_embedding_idx:
                        impressed_embedding_indices.append(article_id_to_embedding_idx[aid])

    # Use negatives from impressions first (up to max_samples_per_user_train)
    neg_embedding_indices_from_impressions = impressed_embedding_indices[:max_samples_per_user_train]

    # If not enough negatives from impressions, sample randomly
    num_negatives_needed = max_samples_per_user_train - len(neg_embedding_indices_from_impressions)
    random_neg_embedding_indices = []
    if num_negatives_needed > 0:
        # Exclude clicked articles when sampling negatives
        clicked_embedding_indices_set = set(clicked_embedding_indices)
        negative_pool_embedding_indices = list(set(all_article_embedding_indices) - clicked_embedding_indices_set)

        if negative_pool_embedding_indices:  # Ensure pool is not empty
            # Sample without replacement
            random_neg_embedding_indices = np.random.choice(
                negative_pool_embedding_indices,
                min(num_negatives_needed, len(negative_pool_embedding_indices)),
                replace=False
            ).tolist()

    neg_embedding_indices_to_add = neg_embedding_indices_from_impressions + random_neg_embedding_indices
    neg_embedding_indices_to_add = neg_embedding_indices_to_add[:max_samples_per_user_train]

    # Add negative samples to train_pairs
    for neg_idx in neg_embedding_indices_to_add:
        train_pairs.append((user_emb, article_embeddings[neg_idx], 0))
        negative_samples_train += 1

# Shuffle training data
np.random.shuffle(train_pairs)

# Process Test Sessions (structured for MRR)
print("\nGenerating test pairs and sessions for MRR evaluation...")
skipped_articles_in_impressions = 0

for _, row in tqdm(test_behaviors_df.iterrows(), total=len(test_behaviors_df), desc="Generating Test Data"):
    user_id = row['user_id']
    impression_id = row['impression_id']
    impressions_str = row['impressions']

    # Skip if user embedding doesn't exist
    if user_id not in user_embeddings:
        continue
    user_emb = user_embeddings[user_id]

    # Process impressions for this session
    if isinstance(impressions_str, str) and impressions_str.strip():
        session_pairs_for_mrr = []  # For this session

        for impression_pair in impressions_str.split():
            aid_label = impression_pair.split('-')
            if len(aid_label) == 2:
                aid, label_str = aid_label
                label = int(label_str)  # Convert label to integer (0 or 1)

                # Use the mapping to get the index in the embedding array
                if aid in article_id_to_embedding_idx:
                    article_embedding_idx = article_id_to_embedding_idx[aid]
                    article_emb = article_embeddings[article_embedding_idx]

                    # Add to the list for MRR evaluation
                    session_pairs_for_mrr.append((user_emb, article_emb, label, aid))

                    # Count for AUC/ROC summary
                    if label == 1:
                        positive_samples_test += 1
                    else:
                        negative_samples_test += 1
                else:
                    skipped_articles_in_impressions += 1

        # Only add the session if it contains at least one impression
        if session_pairs_for_mrr:
            test_sessions_for_mrr[impression_id] = session_pairs_for_mrr

# Flatten test_sessions_for_mrr into test_pairs_for_auc_roc
for impression_id, session_data in test_sessions_for_mrr.items():
    for user_emb, article_emb, label, article_id in session_data:
        test_pairs_for_auc_roc.append((user_emb, article_emb, label))

print(f"\nGenerated {len(train_pairs)} training samples ({positive_samples_train} positive, {negative_samples_train} negative).")
print(f"Generated {len(test_pairs_for_auc_roc)} samples for AUC/ROC ({positive_samples_test} positive, {negative_samples_test} negative).")
print(f"Organized test data into {len(test_sessions_for_mrr)} sessions for MRR.")
if skipped_articles_in_impressions > 0:
    print(f"Skipped {skipped_articles_in_impressions} article impressions (article_id not found in embedding mapping).")

# Create PyTorch Tensors and DataLoaders for Training
train_users = torch.tensor(np.array([x[0] for x in train_pairs]), dtype=torch.float32)
train_articles = torch.tensor(np.array([x[1] for x in train_pairs]), dtype=torch.float32)
train_labels = torch.tensor(np.array([x[2] for x in train_pairs]), dtype=torch.float32)

train_dataset = TensorDataset(train_users, train_articles, train_labels)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

print(f"Prepared DataLoader for training with {len(train_loader)} batches.")

# Prepare PyTorch Tensors for Test (AUC/ROC)
test_users_auc_roc = torch.tensor(np.array([x[0] for x in test_pairs_for_auc_roc]), dtype=torch.float32)
test_articles_auc_roc = torch.tensor(np.array([x[1] for x in test_pairs_for_auc_roc]), dtype=torch.float32)
test_labels_auc_roc_np = np.array([x[2] for x in test_pairs_for_auc_roc])


Preparing training and test data...
Splitting behaviors into train and test sets by session...
Split behaviors into 125572 train and 31393 test sessions.
Generating training pairs...


Generating Train Pairs:   0%|          | 0/125572 [00:00<?, ?it/s]


Generating test pairs and sessions for MRR evaluation...


Generating Test Data:   0%|          | 0/31393 [00:00<?, ?it/s]


Generated 739311 training samples (362595 positive, 376716 negative).
Generated 1162375 samples for AUC/ROC (47086 positive, 1115289 negative).
Organized test data into 31393 sessions for MRR.
Prepared DataLoader for training with 5776 batches.


In [None]:
# Cell 11: Train Model
# @title Train Model
print("\nStarting model training...")

# Initialize model
model = TwinTowers(embedding_dim=embedding_dim).to(device)
learning_rate = 0.001
num_epochs = 5  # Reduced for efficiency, increase as needed

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

print(f"Training for {num_epochs} epochs with learning rate {learning_rate}...")

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    # Use tqdm for progress visualization
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for user_batch, article_batch, label_batch in progress_bar:
        # Move to device
        user_batch = user_batch.to(device)
        article_batch = article_batch.to(device)
        label_batch = label_batch.to(device)

        # Forward pass
        predictions = model(user_batch, article_batch)
        predictions = predictions.squeeze(-1) if predictions.ndim > 1 else predictions

        # Calculate loss
        loss = criterion(predictions, label_batch)

        # Backward pass & optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

    # Print average loss for the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"\nEpoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")

print("\nTraining finished.")



Starting model training...
Training for 5 epochs with learning rate 0.001...


Epoch 1/5:   0%|          | 0/5776 [00:00<?, ?it/s]


Epoch 1/5 - Average Loss: 0.1394


Epoch 2/5:   0%|          | 0/5776 [00:00<?, ?it/s]


Epoch 2/5 - Average Loss: 0.0788


Epoch 3/5:   0%|          | 0/5776 [00:00<?, ?it/s]


Epoch 3/5 - Average Loss: 0.0637


Epoch 4/5:   0%|          | 0/5776 [00:00<?, ?it/s]


Epoch 4/5 - Average Loss: 0.0544


Epoch 5/5:   0%|          | 0/5776 [00:00<?, ?it/s]


Epoch 5/5 - Average Loss: 0.0474

Training finished.


In [None]:
# Cell 12: Evaluate Model (AUC/ROC and MRR)
# @title Evaluate Model (AUC/ROC and MRR)
print("\nEvaluating model on the test set...")
model.eval()  # Set model to evaluation mode

# Calculate AUC and ROC
print("\nCalculating AUC and ROC...")
test_predictions_auc_roc_list = []
test_batch_size_eval = 512  # Process test set in batches

with torch.no_grad():
    for i in range(0, len(test_users_auc_roc), test_batch_size_eval):
        user_batch = test_users_auc_roc[i:i+test_batch_size_eval].to(device)
        article_batch = test_articles_auc_roc[i:i+test_batch_size_eval].to(device)
        batch_preds = model(user_batch, article_batch).cpu().numpy()
        test_predictions_auc_roc_list.append(batch_preds)

# Concatenate predictions from all batches
test_preds_auc_roc_np = np.concatenate(test_predictions_auc_roc_list).squeeze()

# Calculate AUC score
try:
    if test_preds_auc_roc_np.shape == test_labels_auc_roc_np.shape:
        auc_score = roc_auc_score(test_labels_auc_roc_np, test_preds_auc_roc_np)
        print(f"Test AUC Score: {auc_score:.4f}")

        # Calculate ROC curve points
        fpr, tpr, thresholds = roc_curve(test_labels_auc_roc_np, test_preds_auc_roc_np)
        print("\nTest ROC Curve points (FPR, TPR, Thresholds - first 5):")
        for i in range(min(5, len(fpr))):
            print(f"  FPR: {fpr[i]:.4f}, TPR: {tpr[i]:.4f}, Threshold: {thresholds[i]:.4f}")
    else:
        print(f"Shape mismatch: Predictions {test_preds_auc_roc_np.shape}, Labels {test_labels_auc_roc_np.shape}")
except ValueError as e:
    print(f"Error calculating AUC/ROC: {e}")
    print(f"Unique labels in test set: {np.unique(test_labels_auc_roc_np)}")

# Calculate MRR
print("\nCalculating MRR...")
reciprocal_ranks = []
sessions_with_clicks_count = 0

for impression_id, session_data in tqdm(test_sessions_for_mrr.items(), desc="Calculating MRR"):
    if not session_data:
        continue

    # Extract data for prediction
    user_embs_session = torch.tensor(np.array([x[0] for x in session_data]), dtype=torch.float32).to(device)
    article_embs_session = torch.tensor(np.array([x[1] for x in session_data]), dtype=torch.float32).to(device)
    labels_session = [x[2] for x in session_data]
    article_ids_session = [x[3] for x in session_data]

    # Find clicked articles
    clicked_article_ids_in_session = [aid for aid, label in zip(article_ids_session, labels_session) if label == 1]

    if not clicked_article_ids_in_session:
        continue

    # Get predictions
    with torch.no_grad():
        session_preds = model(user_embs_session, article_embs_session).cpu().numpy().squeeze()

    # Handle single item sessions
    if session_preds.ndim == 0:
        session_preds = np.array([session_preds])

    # Rank items
    ranked_indices = np.argsort(-session_preds)
    ranked_article_ids = [article_ids_session[i] for i in ranked_indices]

    # Find first clicked item's rank
    rank_of_first_clicked = -1
    for rank, aid in enumerate(ranked_article_ids):
        if aid in clicked_article_ids_in_session:
            rank_of_first_clicked = rank + 1  # 1-based rank
            break

    if rank_of_first_clicked != -1:
        reciprocal_ranks.append(1 / rank_of_first_clicked)
        sessions_with_clicks_count += 1

# Calculate Mean Reciprocal Rank
if reciprocal_ranks:
    mrr_score = np.mean(reciprocal_ranks)
    print(f"\nTest MRR Score (over {sessions_with_clicks_count} sessions): {mrr_score:.4f}")
else:
    print("\nNo sessions with clicks were processed successfully to calculate MRR.")

print("\nEvaluation completed.")




Evaluating model on the test set...

Calculating AUC and ROC...
Test AUC Score: 0.4998

Test ROC Curve points (FPR, TPR, Thresholds - first 5):
  FPR: 0.0000, TPR: 0.0000, Threshold: inf
  FPR: 0.0000, TPR: 0.0000, Threshold: 1.0000
  FPR: 0.0000, TPR: 0.0000, Threshold: 1.0000
  FPR: 0.0000, TPR: 0.0000, Threshold: 1.0000
  FPR: 0.0000, TPR: 0.0000, Threshold: 1.0000

Calculating MRR...


Calculating MRR:   0%|          | 0/31393 [00:00<?, ?it/s]


Test MRR Score (over 31393 sessions): 0.2960

Evaluation completed.


In [None]:
# Cell 13: Save Model and Results
# @title Save Model and Results
# Save the model
model_save_path = '/content/twin_towers_model.pth'
torch.save(model.state_dict(), model_save_path)
print(f"\nModel saved to {model_save_path}")

# Save evaluation results to file
if 'auc_score' in locals() and 'mrr_score' in locals():
    results = {
        'AUC': auc_score,
        'MRR': mrr_score,
        'training_samples': len(train_pairs),
        'test_samples': len(test_pairs_for_auc_roc),
        'test_sessions': len(test_sessions_for_mrr),
        'embedding_dim': embedding_dim
    }

    results_df = pd.DataFrame([results])
    results_df.to_csv('/content/twin_towers_results.csv', index=False)
    print("Evaluation results saved to '/content/twin_towers_results.csv'")

    print("\nFinal Results Summary:")
    print(f"AUC Score: {auc_score:.4f}")
    print(f"MRR Score: {mrr_score:.4f}")

print("\nModel training and evaluation complete!")


Model saved to /content/twin_towers_model.pth
Evaluation results saved to '/content/twin_towers_results.csv'

Final Results Summary:
AUC Score: 0.4998
MRR Score: 0.2960

Model training and evaluation complete!


In [None]:
# Cell 12: Evaluate Model (AUC/ROC, MRR, and Hit Rate)
# @title Evaluate Model (AUC/ROC, MRR, and Hit Rate)
print("\nEvaluating model on the test set...")
model.eval()  # Set model to evaluation mode

# Calculate AUC and ROC
print("\nCalculating AUC and ROC...")
test_predictions_auc_roc_list = []
test_batch_size_eval = 512  # Process test set in batches

with torch.no_grad():
    for i in range(0, len(test_users_auc_roc), test_batch_size_eval):
        user_batch = test_users_auc_roc[i:i+test_batch_size_eval].to(device)
        article_batch = test_articles_auc_roc[i:i+test_batch_size_eval].to(device)
        batch_preds = model(user_batch, article_batch).cpu().numpy()
        test_predictions_auc_roc_list.append(batch_preds)

# Concatenate predictions from all batches
test_preds_auc_roc_np = np.concatenate(test_predictions_auc_roc_list).squeeze()

# Calculate AUC score
try:
    if test_preds_auc_roc_np.shape == test_labels_auc_roc_np.shape:
        auc_score = roc_auc_score(test_labels_auc_roc_np, test_preds_auc_roc_np)
        print(f"Test AUC Score: {auc_score:.4f}")

        # Calculate ROC curve points
        fpr, tpr, thresholds = roc_curve(test_labels_auc_roc_np, test_preds_auc_roc_np)
        print("\nTest ROC Curve points (FPR, TPR, Thresholds - first 5):")
        for i in range(min(5, len(fpr))):
            print(f"  FPR: {fpr[i]:.4f}, TPR: {tpr[i]:.4f}, Threshold: {thresholds[i]:.4f}")
    else:
        print(f"Shape mismatch: Predictions {test_preds_auc_roc_np.shape}, Labels {test_labels_auc_roc_np.shape}")
except ValueError as e:
    print(f"Error calculating AUC/ROC: {e}")
    print(f"Unique labels in test set: {np.unique(test_labels_auc_roc_np)}")

# Calculate MRR and Hit Rate@k
print("\nCalculating MRR and Hit Rate@k...")
reciprocal_ranks = []
sessions_with_clicks_count = 0

# For Hit Rate@k
k_values = [1, 5, 10]  # Different k values to evaluate
hits_at_k = {k: 0 for k in k_values}
total_sessions_with_clicks = 0

for impression_id, session_data in tqdm(test_sessions_for_mrr.items(), desc="Calculating MRR & Hit Rate"):
    if not session_data:
        continue

    # Extract data for prediction
    user_embs_session = torch.tensor(np.array([x[0] for x in session_data]), dtype=torch.float32).to(device)
    article_embs_session = torch.tensor(np.array([x[1] for x in session_data]), dtype=torch.float32).to(device)
    labels_session = [x[2] for x in session_data]
    article_ids_session = [x[3] for x in session_data]

    # Find clicked articles
    clicked_article_ids_in_session = [aid for aid, label in zip(article_ids_session, labels_session) if label == 1]

    if not clicked_article_ids_in_session:
        continue

    # Track valid sessions for hit rate calculation
    total_sessions_with_clicks += 1

    # Get predictions
    with torch.no_grad():
        session_preds = model(user_embs_session, article_embs_session).cpu().numpy().squeeze()

    # Handle single item sessions
    if session_preds.ndim == 0:
        session_preds = np.array([session_preds])

    # Rank items
    ranked_indices = np.argsort(-session_preds)
    ranked_article_ids = [article_ids_session[i] for i in ranked_indices]

    # Calculate MRR
    rank_of_first_clicked = -1
    for rank, aid in enumerate(ranked_article_ids):
        if aid in clicked_article_ids_in_session:
            rank_of_first_clicked = rank + 1  # 1-based rank
            break

    if rank_of_first_clicked != -1:
        reciprocal_ranks.append(1 / rank_of_first_clicked)
        sessions_with_clicks_count += 1

    # Calculate Hit Rate@k
    for k in k_values:
        # Get top-k recommended items
        top_k_items = ranked_article_ids[:k]
        # Check if any clicked item is in top-k
        hit = any(aid in clicked_article_ids_in_session for aid in top_k_items)
        if hit:
            hits_at_k[k] += 1

# Calculate Mean Reciprocal Rank
if reciprocal_ranks:
    mrr_score = np.mean(reciprocal_ranks)
    print(f"\nTest MRR Score (over {sessions_with_clicks_count} sessions): {mrr_score:.4f}")
else:
    print("\nNo sessions with clicks were processed successfully to calculate MRR.")

# Calculate Hit Rate@k
print("\nHit Rate@k Results:")
if total_sessions_with_clicks > 0:
    for k in k_values:
        hit_rate = hits_at_k[k] / total_sessions_with_clicks
        print(f"Hit Rate@{k}: {hit_rate:.4f} ({hits_at_k[k]} / {total_sessions_with_clicks})")
else:
    print("No sessions with clicks were processed successfully to calculate Hit Rate.")

print("\nEvaluation completed.")




Evaluating model on the test set...

Calculating AUC and ROC...
Test AUC Score: 0.4998

Test ROC Curve points (FPR, TPR, Thresholds - first 5):
  FPR: 0.0000, TPR: 0.0000, Threshold: inf
  FPR: 0.0000, TPR: 0.0000, Threshold: 1.0000
  FPR: 0.0000, TPR: 0.0000, Threshold: 1.0000
  FPR: 0.0000, TPR: 0.0000, Threshold: 1.0000
  FPR: 0.0000, TPR: 0.0000, Threshold: 1.0000

Calculating MRR and Hit Rate@k...


Calculating MRR & Hit Rate:   0%|          | 0/31393 [00:00<?, ?it/s]


Test MRR Score (over 31393 sessions): 0.2960

Hit Rate@k Results:
Hit Rate@1: 0.1383 (4343 / 31393)
Hit Rate@5: 0.4681 (14694 / 31393)
Hit Rate@10: 0.6477 (20333 / 31393)

Evaluation completed.


In [None]:
# Cell 13: Save Model and Results
# @title Save Model and Results
# Save the model
model_save_path = '/content/twin_towers_model.pth'
torch.save(model.state_dict(), model_save_path)
print(f"\nModel saved to {model_save_path}")

# Save evaluation results to file
if 'auc_score' in locals() and 'mrr_score' in locals() and 'hits_at_k' in locals():
    results = {
        'AUC': auc_score,
        'MRR': mrr_score,
        'training_samples': len(train_pairs),
        'test_samples': len(test_pairs_for_auc_roc),
        'test_sessions': len(test_sessions_for_mrr)
    }

    # Add Hit Rate@k results
    for k in k_values:
        results[f'HR@{k}'] = hits_at_k[k] / total_sessions_with_clicks if total_sessions_with_clicks > 0 else 0

    results_df = pd.DataFrame([results])
    results_df.to_csv('/content/twin_towers_results.csv', index=False)
    print("Evaluation results saved to '/content/twin_towers_results.csv'")

    print("\nFinal Results Summary:")
    print(f"AUC Score: {auc_score:.4f}")
    print(f"MRR Score: {mrr_score:.4f}")
    for k in k_values:
        hit_rate = hits_at_k[k] / total_sessions_with_clicks if total_sessions_with_clicks > 0 else 0
        print(f"Hit Rate@{k}: {hit_rate:.4f}")

print("\nModel training and evaluation complete!")


Model saved to /content/twin_towers_model.pth
Evaluation results saved to '/content/twin_towers_results.csv'

Final Results Summary:
AUC Score: 0.4998
MRR Score: 0.2960
Hit Rate@1: 0.1383
Hit Rate@5: 0.4681
Hit Rate@10: 0.6477

Model training and evaluation complete!
