In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics import roc_auc_score, ndcg_score
from tqdm import tqdm
import time
from typing import Dict, List, Tuple, Optional

class NewsRecommender:
    def __init__(self, sample_size=15000):
        self.searcher = SemanticSearch()
        self.fitted = False
        self.news_df = None
        self.behaviors_df = None
        self.user_history = {}
        self.news_embeddings = {}
        self.sample_size = sample_size
        self.faiss_index = None

    def load_data(self, news_path: str, behaviors_path: str) -> None:
        """Loading and processing the sampled MIND dataset files here."""
        self.news_df = pd.read_csv(news_path, sep='\t',
                                   names=['news_id', 'category', 'subcategory',
                                          'title', 'abstract', 'url', 'title_entities',
                                          'abstract_entities'])

        if len(self.news_df) > self.sample_size:
            self.news_df = self.news_df.sample(n=self.sample_size, random_state=42)

        self.behaviors_df = pd.read_csv(behaviors_path, sep='\t',
                                        names=['impression_id', 'user_id', 'time',
                                               'history', 'impressions'])

        if len(self.behaviors_df) > self.sample_size:
            self.behaviors_df = self.behaviors_df.sample(n=self.sample_size, random_state=42)

        relevant_news_ids = set()

        history_news = self.behaviors_df['history'].dropna().str.split().explode()
        relevant_news_ids.update(history_news)

        impression_news = self.behaviors_df['impressions'].dropna().str.split().explode()
        impression_news = impression_news.str.split('-').str[0]
        relevant_news_ids.update(impression_news)

        self.news_df = self.news_df[self.news_df['news_id'].isin(relevant_news_ids)]
        self.process_user_history()
        self.generate_news_embeddings()

        print(f"Data loaded successfully. Using {len(self.news_df)} news articles and {len(self.behaviors_df)} behavior records.")

    def process_user_history(self) -> None:
        """Processing the user history and impressions from behaviors data file."""
        for _, row in self.behaviors_df.iterrows():
            user_id = row['user_id']
            if pd.notna(row['history']):
                history = [h for h in row['history'].split() if h in self.news_df['news_id'].values]
            else:
                history = []

            if pd.notna(row['impressions']):
                impressions = [imp.split('-') for imp in row['impressions'].split()]
                clicked = [imp[0] for imp in impressions if imp[1] == '1' and imp[0] in self.news_df['news_id'].values]
                history.extend(clicked)

            if history:
                self.user_history[user_id] = list(set(history))

    def generate_news_embeddings(self) -> None:
        """Here generating the embeddings for all news articles and building the FAISS index."""
        print("Generating embeddings for news articles...")

        self.news_df['content'] = self.news_df['title'] + ' ' + self.news_df['abstract'].fillna('')
        contents = self.news_df['content'].tolist()

        batch_size = 128
        embeddings = []

        for i in range(0, len(contents), batch_size):
            batch = contents[i:i + batch_size]
            batch_embeddings = self.searcher.model.encode(batch, convert_to_numpy=True)
            embeddings.extend(batch_embeddings)

        embeddings = np.array(embeddings)

        news_ids = self.news_df['news_id'].values
        for idx, news_id in enumerate(news_ids):
            self.news_embeddings[news_id] = embeddings[idx]

        print("Building FAISS index...")
        self.faiss_index = faiss.IndexFlatIP(self.searcher.dim)
        faiss.normalize_L2(embeddings)
        self.faiss_index.add(embeddings)
        self.news_id_map = {idx: news_id for idx, news_id in enumerate(news_ids)}
        print("FAISS index built successfully.")

    def get_user_preferences(self, user_id: str) -> Optional[np.ndarray]:
        if user_id not in self.user_history or not self.user_history[user_id]:
            return None

        history_embeddings = [self.news_embeddings[news_id]
                              for news_id in self.user_history[user_id]
                              if news_id in self.news_embeddings]

        if not history_embeddings:
            return None

        return np.mean(history_embeddings, axis=0)

    def recommend_articles(self, user_id: str, top_n: int = 5) -> Tuple[List[Dict], float]:
        start_time = time.time()
        user_embedding = self.get_user_preferences(user_id)

        if user_embedding is None:
            recommendations = self.get_popular_diverse_articles(top_n)
            response_time = time.time() - start_time
            return recommendations, response_time

        faiss.normalize_L2(user_embedding.reshape(1, -1))
        distances, indices = self.faiss_index.search(user_embedding.reshape(1, -1), top_n)
        recommended_ids = [self.news_id_map[idx] for idx in indices[0]]

        recommendations = []
        for idx, news_id in enumerate(recommended_ids):
            article = self.news_df[self.news_df['news_id'] == news_id].iloc[0]
            recommendations.append({
                'news_id': news_id,
                'title': article['title'],
                'category': article['category'],
                'abstract': article['abstract'],
                'similarity_score': float(1 - distances[0][idx])
            })

        response_time = time.time() - start_time
        return recommendations, response_time

    def get_popular_diverse_articles(self, top_n: int = 5) -> List[Dict]:
        category_counts = self.news_df['category'].value_counts()
        recommendations = []
        for category in category_counts.index[:top_n]:
            article = self.news_df[self.news_df['category'] == category].sample(1).iloc[0]
            recommendations.append({
                'news_id': article['news_id'],
                'title': article['title'],
                'category': article['category'],
                'abstract': article['abstract'],
                'similarity_score': None
            })
        return recommendations

    def calculate_metrics(self, test_behaviors: pd.DataFrame) -> Dict[str, float]:
        """Here calculating multiple evaluation metrics: AUC, nDCG, MRR, and average response time."""
        print("Calculating evaluation metrics...")

        y_true = []
        y_scores = []
        ndcg_scores = []
        reciprocal_ranks = []
        response_times = []

        for _, row in tqdm(test_behaviors.iterrows(), desc="Processing test behaviors"):
            user_id = row['user_id']

            # Get user embedding and timing
            start_time = time.time()
            user_embedding = self.get_user_preferences(user_id)

            if user_embedding is None:
                continue

            # Process impressions
            if pd.notna(row['impressions']):
                impressions = [imp.split('-') for imp in row['impressions'].split()]

                # Prepare ground truth and predictions for this impression
                true_labels = []
                pred_scores = []

                for news_id, label in impressions:
                    if news_id not in self.news_embeddings:
                        continue

                    article_embedding = self.news_embeddings[news_id]
                    similarity = 1 - np.dot(user_embedding, article_embedding) / (
                        np.linalg.norm(user_embedding) * np.linalg.norm(article_embedding))

                    true_labels.append(int(label))
                    pred_scores.append(-similarity)  # Negative because higher similarity should predict clicks

                if len(true_labels) > 1:  # Only calculate NDCG if we have more than one document
                    try:
                        ndcg = ndcg_score([true_labels], [pred_scores])
                        ndcg_scores.append(ndcg)
                    except ValueError:
                        pass  # Skip if NDCG calculation fails

            # AUC
                if true_labels and pred_scores:
                    y_true.extend(true_labels)
                    y_scores.extend(pred_scores)

            # MRR
                if true_labels and pred_scores:
                    sorted_pairs = sorted(zip(pred_scores, true_labels), reverse=True)
                    for rank, (_, label) in enumerate(sorted_pairs, 1):
                        if label == 1:
                            reciprocal_ranks.append(1.0 / rank)
                            break
                    else:
                            reciprocal_ranks.append(0)

                response_times.append(time.time() - start_time)

        metrics = {}

        # Calculate AUC
        if y_true and y_scores:
            metrics['auc'] = roc_auc_score(y_true, y_scores)

        # Calculate mean NDCG
        if ndcg_scores:
            metrics['ndcg'] = np.mean(ndcg_scores)

        # Calculate MRR
        if reciprocal_ranks:
            metrics['mrr'] = np.mean(reciprocal_ranks)

        # Calculate response time metrics
        if response_times:
            metrics['avg_response_time'] = np.mean(response_times)
            metrics['max_response_time'] = np.max(response_times)
            metrics['min_response_time'] = np.min(response_times)

        return metrics
class SemanticSearch:
    def __init__(self):
        """Here we are loading the Sentence Transformer models"""
        self.model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
        self.dim = self.model.get_sentence_embedding_dimension()

def recommend_news(user_id: str, news_path: str, behaviors_path: str,
                  sample_size: int = 15000) -> List[Dict]:
    """This is the main function to get news recommendations for a user."""
    print(f"Generating recommendations for user {user_id}...")

    recommender = NewsRecommender(sample_size=sample_size)
    recommender.load_data(news_path, behaviors_path)

    recommendations, _ = recommender.recommend_articles(user_id, top_n=5)
    return recommendations

def evaluate_recommender(news_path: str, behaviors_path: str, test_behaviors_path: str,
                        sample_size: int = 15000) -> Dict[str, float]:
    """Evaluate the recommender system using multiple metrics."""
    print("Initializing recommender system...")
    recommender = NewsRecommender(sample_size=sample_size)
    recommender.load_data(news_path, behaviors_path)

    print("Loading test behaviors...")
    test_behaviors = pd.read_csv(test_behaviors_path, sep='\t',
                               names=['impression_id', 'user_id', 'time',
                                     'history', 'impressions'])

    if len(test_behaviors) > sample_size:
        test_behaviors = test_behaviors.sample(n=sample_size, random_state=42)

    metrics = recommender.calculate_metrics(test_behaviors)

    print("\nEvaluation Results:")
    print(f"AUC Score: {metrics.get('auc', 'N/A'):.4f}")
    print(f"nDCG Score: {metrics.get('ndcg', 'N/A'):.4f}")
    print(f"MRR Score: {metrics.get('mrr', 'N/A'):.4f}")
    print(f"Average Response Time: {metrics.get('avg_response_time', 'N/A'):.4f} seconds")
    print(f"Max Response Time: {metrics.get('max_response_time', 'N/A'):.4f} seconds")
    print(f"Min Response Time: {metrics.get('min_response_time', 'N/A'):.4f} seconds")

    return metrics



news_path = 'news_V.tsv'
behaviors_path = 'behaviors.tsv'
test_behaviors_path = 'behaviors_V.tsv'

metrics = evaluate_recommender(
    news_path=news_path,
    behaviors_path=behaviors_path,
    test_behaviors_path=test_behaviors_path,
    sample_size=15000
)

Initializing recommender system...
Generating embeddings for news articles...
Building FAISS index...
FAISS index built successfully.
Data loaded successfully. Using 9881 news articles and 15000 behavior records.
Loading test behaviors...
Calculating evaluation metrics...


Processing test behaviors: 15000it [00:16, 921.70it/s]



Evaluation Results:
AUC Score: 0.9332
nDCG Score: 0.4621
MRR Score: 0.4374
Average Response Time: 0.0010 seconds
Max Response Time: 0.0068 seconds
Min Response Time: 0.0000 seconds


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics import roc_auc_score, ndcg_score
from tqdm import tqdm
import time
from typing import Dict, List, Tuple, Optional

class NewsRecommender:
    def __init__(self, sample_size=15000):
        self.searcher = SemanticSearch()
        self.fitted = False
        self.news_df = None
        self.behaviors_df = None
        self.user_history = {}
        self.news_embeddings = {}
        self.sample_size = sample_size
        self.faiss_index = None

    def load_data(self, news_path: str, behaviors_path: str) -> None:
      
        self.news_df = pd.read_csv(news_path, sep='\t',
                                   names=['news_id', 'category', 'subcategory',
                                          'title', 'abstract', 'url', 'title_entities',
                                          'abstract_entities'])

        if len(self.news_df) > self.sample_size:
            self.news_df = self.news_df.sample(n=self.sample_size, random_state=42)

        self.behaviors_df = pd.read_csv(behaviors_path, sep='\t',
                                        names=['impression_id', 'user_id', 'time',
                                               'history', 'impressions'])

        if len(self.behaviors_df) > self.sample_size:
            self.behaviors_df = self.behaviors_df.sample(n=self.sample_size, random_state=42)

        relevant_news_ids = set()

        history_news = self.behaviors_df['history'].dropna().str.split().explode()
        relevant_news_ids.update(history_news)

        impression_news = self.behaviors_df['impressions'].dropna().str.split().explode()
        impression_news = impression_news.str.split('-').str[0]
        relevant_news_ids.update(impression_news)

        self.news_df = self.news_df[self.news_df['news_id'].isin(relevant_news_ids)]
        self.process_user_history()
        self.generate_news_embeddings()
        print(f"Data loaded successfully. Using {len(self.news_df)} news articles and {len(self.behaviors_df)} behavior records.")

    def process_user_history(self) -> None:
        for _, row in self.behaviors_df.iterrows():
            user_id = row['user_id']
            if pd.notna(row['history']):
                history = [h for h in row['history'].split() if h in self.news_df['news_id'].values]
            else:
                history = []

            if pd.notna(row['impressions']):
                impressions = [imp.split('-') for imp in row['impressions'].split()]
                clicked = [imp[0] for imp in impressions if imp[1] == '1' and imp[0] in self.news_df['news_id'].values]
                history.extend(clicked)

            if history:
                self.user_history[user_id] = list(set(history))

    def generate_news_embeddings(self) -> None:
        print("Generating embeddings for news articles...")
        self.news_df['content'] = self.news_df['title'] + ' ' + self.news_df['abstract'].fillna('')
        contents = self.news_df['content'].tolist()

        batch_size = 128
        embeddings = []

        for i in range(0, len(contents), batch_size):
            batch = contents[i:i + batch_size]
            batch_embeddings = self.searcher.model.encode(batch, convert_to_numpy=True)
            embeddings.extend(batch_embeddings)

        embeddings = np.array(embeddings)

        news_ids = self.news_df['news_id'].values
        for idx, news_id in enumerate(news_ids):
            self.news_embeddings[news_id] = embeddings[idx]

        print("Building FAISS index...")
        self.faiss_index = faiss.IndexFlatIP(self.searcher.dim)
        faiss.normalize_L2(embeddings)
        self.faiss_index.add(embeddings)
        self.news_id_map = {idx: news_id for idx, news_id in enumerate(news_ids)}
        print("FAISS index built successfully.")

    def get_user_preferences(self, user_id: str) -> Optional[np.ndarray]:
        if user_id not in self.user_history or not self.user_history[user_id]:
            return None

        history_embeddings = [self.news_embeddings[news_id]
                              for news_id in self.user_history[user_id]
                              if news_id in self.news_embeddings]

        if not history_embeddings:
            return None

        return np.mean(history_embeddings, axis=0)

    def recommend_articles(self, user_id: str, top_n: int = 5) -> Tuple[List[Dict], float]:
        start_time = time.time()
        user_embedding = self.get_user_preferences(user_id)

        if user_embedding is None:
            recommendations = self.get_popular_diverse_articles(top_n)
            response_time = time.time() - start_time
            return recommendations, response_time

        faiss.normalize_L2(user_embedding.reshape(1, -1))
        distances, indices = self.faiss_index.search(user_embedding.reshape(1, -1), top_n)
        recommended_ids = [self.news_id_map[idx] for idx in indices[0]]

        recommendations = []
        for idx, news_id in enumerate(recommended_ids):
            article = self.news_df[self.news_df['news_id'] == news_id].iloc[0]
            recommendations.append({
                'news_id': news_id,
                'title': article['title'],
                'category': article['category'],
                'abstract': article['abstract'],
                'similarity_score': float(1 - distances[0][idx])
            })

        response_time = time.time() - start_time
        return recommendations, response_time

    def get_popular_diverse_articles(self, top_n: int = 5) -> List[Dict]:
        category_counts = self.news_df['category'].value_counts()
        recommendations = []
        for category in category_counts.index[:top_n]:
            article = self.news_df[self.news_df['category'] == category].sample(1).iloc[0]
            recommendations.append({
                'news_id': article['news_id'],
                'title': article['title'],
                'category': article['category'],
                'abstract': article['abstract'],
                'similarity_score': None
            })
        return recommendations

    def calculate_metrics(self, test_behaviors: pd.DataFrame) -> Dict[str, float]:
        print("Calculating evaluation metrics...")
        y_true = []
        y_scores = []
        ndcg_scores = []
        reciprocal_ranks = []
        response_times = []

        for _, row in tqdm(test_behaviors.iterrows(), desc="Processing test behaviors"):
            user_id = row['user_id']

            # Get user embedding and timing
            start_time = time.time()
            user_embedding = self.get_user_preferences(user_id)

            if user_embedding is None:
                continue

            # Process impressions
            if pd.notna(row['impressions']):
                impressions = [imp.split('-') for imp in row['impressions'].split()]

                # Prepare ground truth and predictions for this impression
                true_labels = []
                pred_scores = []

                for news_id, label in impressions:
                    if news_id not in self.news_embeddings:
                        continue

                    article_embedding = self.news_embeddings[news_id]
                    similarity = 1 - np.dot(user_embedding, article_embedding) / (
                        np.linalg.norm(user_embedding) * np.linalg.norm(article_embedding))

                    true_labels.append(int(label))
                    pred_scores.append(-similarity)  # Negative because higher similarity should predict clicks

                if len(true_labels) > 1:  # Only calculate NDCG if we have more than one document
                    try:
                        ndcg = ndcg_score([true_labels], [pred_scores])
                        ndcg_scores.append(ndcg)
                    except ValueError:
                        pass  # Skip if NDCG calculation fails

            # AUC
                if true_labels and pred_scores:
                    y_true.extend(true_labels)
                    y_scores.extend(pred_scores)

            # MRR
                if true_labels and pred_scores:
                    sorted_pairs = sorted(zip(pred_scores, true_labels), reverse=True)
                    for rank, (_, label) in enumerate(sorted_pairs, 1):
                        if label == 1:
                            reciprocal_ranks.append(1.0 / rank)
                            break
                    else:
                            reciprocal_ranks.append(0)

                response_times.append(time.time() - start_time)

        metrics = {}

        # Calculate AUC 
        if y_true and y_scores:
            metrics['auc'] = roc_auc_score(y_true, y_scores)

        # Calculate mean NDCG 
        if ndcg_scores:
            metrics['ndcg'] = np.mean(ndcg_scores)

        # Calculate MRR 
        if reciprocal_ranks:
            metrics['mrr'] = np.mean(reciprocal_ranks)

        # Calculate response time metrics
        if response_times:
            metrics['avg_response_time'] = np.mean(response_times)
            metrics['max_response_time'] = np.max(response_times)
            metrics['min_response_time'] = np.min(response_times)

        return metrics
class SemanticSearch:
    def __init__(self):
        self.model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        self.dim = self.model.get_sentence_embedding_dimension()

def recommend_news(user_id: str, news_path: str, behaviors_path: str,
                  sample_size: int = 15000) -> List[Dict]:
    print(f"Generating recommendations for user {user_id}...")

    recommender = NewsRecommender(sample_size=sample_size)
    recommender.load_data(news_path, behaviors_path)

    recommendations, _ = recommender.recommend_articles(user_id, top_n=5)
    return recommendations

def evaluate_recommender(news_path: str, behaviors_path: str, test_behaviors_path: str,
                        sample_size: int = 15000) -> Dict[str, float]:
    print("Initializing recommender system...")
    recommender = NewsRecommender(sample_size=sample_size)
    recommender.load_data(news_path, behaviors_path)

    print("Loading test behaviors...")
    test_behaviors = pd.read_csv(test_behaviors_path, sep='\t',
                               names=['impression_id', 'user_id', 'time',
                                     'history', 'impressions'])

    if len(test_behaviors) > sample_size:
        test_behaviors = test_behaviors.sample(n=sample_size, random_state=42)

    metrics = recommender.calculate_metrics(test_behaviors)

    print("\nEvaluation Results:")
    print(f"AUC Score: {metrics.get('auc', 'N/A'):.4f}")
    print(f"nDCG Score: {metrics.get('ndcg', 'N/A'):.4f}")
    print(f"MRR Score: {metrics.get('mrr', 'N/A'):.4f}")
    print(f"Average Response Time: {metrics.get('avg_response_time', 'N/A'):.4f} seconds")
    print(f"Max Response Time: {metrics.get('max_response_time', 'N/A'):.4f} seconds")
    print(f"Min Response Time: {metrics.get('min_response_time', 'N/A'):.4f} seconds")

    return metrics



news_path = 'news_V.tsv'
behaviors_path = 'behaviors.tsv'
test_behaviors_path = 'behaviors_V.tsv'

metrics = evaluate_recommender(
    news_path=news_path,
    behaviors_path=behaviors_path,
    test_behaviors_path=test_behaviors_path,
    sample_size=15000
)

Initializing recommender system...
Generating embeddings for news articles...
Building FAISS index...
FAISS index built successfully.
Data loaded successfully. Using 9881 news articles and 15000 behavior records.
Loading test behaviors...
Calculating evaluation metrics...


Processing test behaviors: 15000it [00:16, 923.18it/s] 



Evaluation Results:
AUC Score: 0.9414
nDCG Score: 0.4638
MRR Score: 0.4396
Average Response Time: 0.0010 seconds
Max Response Time: 0.0065 seconds
Min Response Time: 0.0000 seconds


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics import roc_auc_score, ndcg_score
from tqdm import tqdm
import time
from typing import Dict, List, Tuple, Optional

class NewsRecommender:
    def __init__(self, sample_size=15000):
        self.searcher = SemanticSearch()
        self.fitted = False
        self.news_df = None
        self.behaviors_df = None
        self.user_history = {}
        self.news_embeddings = {}
        self.sample_size = sample_size
        self.faiss_index = None

    def load_data(self, news_path: str, behaviors_path: str) -> None:
        self.news_df = pd.read_csv(news_path, sep='\t',
                                   names=['news_id', 'category', 'subcategory',
                                          'title', 'abstract', 'url', 'title_entities',
                                          'abstract_entities'])

        if len(self.news_df) > self.sample_size:
            self.news_df = self.news_df.sample(n=self.sample_size, random_state=42)

        self.behaviors_df = pd.read_csv(behaviors_path, sep='\t',
                                        names=['impression_id', 'user_id', 'time',
                                               'history', 'impressions'])

        if len(self.behaviors_df) > self.sample_size:
            self.behaviors_df = self.behaviors_df.sample(n=self.sample_size, random_state=42)

        relevant_news_ids = set()

        history_news = self.behaviors_df['history'].dropna().str.split().explode()
        relevant_news_ids.update(history_news)

        impression_news = self.behaviors_df['impressions'].dropna().str.split().explode()
        impression_news = impression_news.str.split('-').str[0]
        relevant_news_ids.update(impression_news)

        self.news_df = self.news_df[self.news_df['news_id'].isin(relevant_news_ids)]
        self.process_user_history()
        self.generate_news_embeddings()

        print(f"Data loaded successfully. Using {len(self.news_df)} news articles and {len(self.behaviors_df)} behavior records.")

    def process_user_history(self) -> None:
        for _, row in self.behaviors_df.iterrows():
            user_id = row['user_id']
            if pd.notna(row['history']):
                history = [h for h in row['history'].split() if h in self.news_df['news_id'].values]
            else:
                history = []

            if pd.notna(row['impressions']):
                impressions = [imp.split('-') for imp in row['impressions'].split()]
                clicked = [imp[0] for imp in impressions if imp[1] == '1' and imp[0] in self.news_df['news_id'].values]
                history.extend(clicked)

            if history:
                self.user_history[user_id] = list(set(history))

    def generate_news_embeddings(self) -> None:
        print("Generating embeddings for news articles...")
        self.news_df['content'] = self.news_df['title'] + ' ' + self.news_df['abstract'].fillna('')
        contents = self.news_df['content'].tolist()

        batch_size = 128
        embeddings = []

        for i in range(0, len(contents), batch_size):
            batch = contents[i:i + batch_size]
            batch_embeddings = self.searcher.model.encode(batch, convert_to_numpy=True)
            embeddings.extend(batch_embeddings)

        embeddings = np.array(embeddings)

        news_ids = self.news_df['news_id'].values
        for idx, news_id in enumerate(news_ids):
            self.news_embeddings[news_id] = embeddings[idx]

        print("Building FAISS index...")
        self.faiss_index = faiss.IndexFlatIP(self.searcher.dim)
        faiss.normalize_L2(embeddings)
        self.faiss_index.add(embeddings)
        self.news_id_map = {idx: news_id for idx, news_id in enumerate(news_ids)}
        print("FAISS index built successfully.")

    def get_user_preferences(self, user_id: str) -> Optional[np.ndarray]:
        if user_id not in self.user_history or not self.user_history[user_id]:
            return None

        history_embeddings = [self.news_embeddings[news_id]
                              for news_id in self.user_history[user_id]
                              if news_id in self.news_embeddings]

        if not history_embeddings:
            return None

        return np.mean(history_embeddings, axis=0)

    def recommend_articles(self, user_id: str, top_n: int = 5) -> Tuple[List[Dict], float]:
        start_time = time.time()
        user_embedding = self.get_user_preferences(user_id)

        if user_embedding is None:
            recommendations = self.get_popular_diverse_articles(top_n)
            response_time = time.time() - start_time
            return recommendations, response_time

        faiss.normalize_L2(user_embedding.reshape(1, -1))
        distances, indices = self.faiss_index.search(user_embedding.reshape(1, -1), top_n)
        recommended_ids = [self.news_id_map[idx] for idx in indices[0]]

        recommendations = []
        for idx, news_id in enumerate(recommended_ids):
            article = self.news_df[self.news_df['news_id'] == news_id].iloc[0]
            recommendations.append({
                'news_id': news_id,
                'title': article['title'],
                'category': article['category'],
                'abstract': article['abstract'],
                'similarity_score': float(1 - distances[0][idx])
            })

        response_time = time.time() - start_time
        return recommendations, response_time

    def get_popular_diverse_articles(self, top_n: int = 5) -> List[Dict]:
        category_counts = self.news_df['category'].value_counts()
        recommendations = []
        for category in category_counts.index[:top_n]:
            article = self.news_df[self.news_df['category'] == category].sample(1).iloc[0]
            recommendations.append({
                'news_id': article['news_id'],
                'title': article['title'],
                'category': article['category'],
                'abstract': article['abstract'],
                'similarity_score': None
            })
        return recommendations

    def calculate_metrics(self, test_behaviors: pd.DataFrame) -> Dict[str, float]:
        print("Calculating evaluation metrics...")

        y_true = []
        y_scores = []
        ndcg_scores = []
        reciprocal_ranks = []
        response_times = []

        for _, row in tqdm(test_behaviors.iterrows(), desc="Processing test behaviors"):
            user_id = row['user_id']

            # Get user embedding and timing
            start_time = time.time()
            user_embedding = self.get_user_preferences(user_id)

            if user_embedding is None:
                continue

            # Process impressions
            if pd.notna(row['impressions']):
                impressions = [imp.split('-') for imp in row['impressions'].split()]

                # Prepare ground truth and predictions for this impression
                true_labels = []
                pred_scores = []

                for news_id, label in impressions:
                    if news_id not in self.news_embeddings:
                        continue

                    article_embedding = self.news_embeddings[news_id]
                    similarity = 1 - np.dot(user_embedding, article_embedding) / (
                        np.linalg.norm(user_embedding) * np.linalg.norm(article_embedding))

                    true_labels.append(int(label))
                    pred_scores.append(-similarity)  # Negative because higher similarity should predict clicks

                if len(true_labels) > 1:  # Only calculate NDCG if we have more than one document
                    try:
                        ndcg = ndcg_score([true_labels], [pred_scores])
                        ndcg_scores.append(ndcg)
                    except ValueError:
                        pass  # Skip if NDCG calculation fails

            # AUC
                if true_labels and pred_scores:
                    y_true.extend(true_labels)
                    y_scores.extend(pred_scores)

            # MRR
                if true_labels and pred_scores:
                    sorted_pairs = sorted(zip(pred_scores, true_labels), reverse=True)
                    for rank, (_, label) in enumerate(sorted_pairs, 1):
                        if label == 1:
                            reciprocal_ranks.append(1.0 / rank)
                            break
                    else:
                            reciprocal_ranks.append(0)

                response_times.append(time.time() - start_time)

        metrics = {}

        # Calculate AUC
        if y_true and y_scores:
            metrics['auc'] = roc_auc_score(y_true, y_scores)

        # Calculate mean NDCG
        if ndcg_scores:
            metrics['ndcg'] = np.mean(ndcg_scores)

        # Calculate MRR 
        if reciprocal_ranks:
            metrics['mrr'] = np.mean(reciprocal_ranks)

        # Calculate response time metrics
        if response_times:
            metrics['avg_response_time'] = np.mean(response_times)
            metrics['max_response_time'] = np.max(response_times)
            metrics['min_response_time'] = np.min(response_times)

        return metrics
class SemanticSearch:
    def __init__(self):
        self.model = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-tas-b")
        self.dim = self.model.get_sentence_embedding_dimension()

def recommend_news(user_id: str, news_path: str, behaviors_path: str,
                  sample_size: int = 15000) -> List[Dict]:
    print(f"Generating recommendations for user {user_id}...")

    recommender = NewsRecommender(sample_size=sample_size)
    recommender.load_data(news_path, behaviors_path)

    recommendations, _ = recommender.recommend_articles(user_id, top_n=5)
    return recommendations

def evaluate_recommender(news_path: str, behaviors_path: str, test_behaviors_path: str,
                        sample_size: int = 15000) -> Dict[str, float]:
    print("Initializing recommender system...")
    recommender = NewsRecommender(sample_size=sample_size)
    recommender.load_data(news_path, behaviors_path)

    print("Loading test behaviors...")
    test_behaviors = pd.read_csv(test_behaviors_path, sep='\t',
                               names=['impression_id', 'user_id', 'time',
                                     'history', 'impressions'])

    if len(test_behaviors) > sample_size:
        test_behaviors = test_behaviors.sample(n=sample_size, random_state=42)

    metrics = recommender.calculate_metrics(test_behaviors)

    print("\nEvaluation Results:")
    print(f"AUC Score: {metrics.get('auc', 'N/A'):.4f}")
    print(f"nDCG Score: {metrics.get('ndcg', 'N/A'):.4f}")
    print(f"MRR Score: {metrics.get('mrr', 'N/A'):.4f}")
    print(f"Average Response Time: {metrics.get('avg_response_time', 'N/A'):.4f} seconds")
    print(f"Max Response Time: {metrics.get('max_response_time', 'N/A'):.4f} seconds")
    print(f"Min Response Time: {metrics.get('min_response_time', 'N/A'):.4f} seconds")

    return metrics



news_path = 'news_V.tsv'
behaviors_path = 'behaviors.tsv'
test_behaviors_path = 'behaviors_V.tsv'

metrics = evaluate_recommender(
    news_path=news_path,
    behaviors_path=behaviors_path,
    test_behaviors_path=test_behaviors_path,
    sample_size=15000
)

Initializing recommender system...
Generating embeddings for news articles...
Building FAISS index...
FAISS index built successfully.
Data loaded successfully. Using 9881 news articles and 15000 behavior records.
Loading test behaviors...
Calculating evaluation metrics...


Processing test behaviors: 15000it [00:16, 895.55it/s]



Evaluation Results:
AUC Score: 0.8744
nDCG Score: 0.4051
MRR Score: 0.3656
Average Response Time: 0.0010 seconds
Max Response Time: 0.0070 seconds
Min Response Time: 0.0000 seconds


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics import roc_auc_score, ndcg_score
from tqdm import tqdm
import time
from typing import Dict, List, Tuple, Optional

class NewsRecommender:
    def __init__(self, sample_size=15000):
        self.searcher = SemanticSearch()
        self.fitted = False
        self.news_df = None
        self.behaviors_df = None
        self.user_history = {}
        self.news_embeddings = {}
        self.sample_size = sample_size
        self.faiss_index = None

    def load_data(self, news_path: str, behaviors_path: str) -> None:
        self.news_df = pd.read_csv(news_path, sep='\t',
                                   names=['news_id', 'category', 'subcategory',
                                          'title', 'abstract', 'url', 'title_entities',
                                          'abstract_entities'])

        if len(self.news_df) > self.sample_size:
            self.news_df = self.news_df.sample(n=self.sample_size, random_state=42)

        self.behaviors_df = pd.read_csv(behaviors_path, sep='\t',
                                        names=['impression_id', 'user_id', 'time',
                                               'history', 'impressions'])

        if len(self.behaviors_df) > self.sample_size:
            self.behaviors_df = self.behaviors_df.sample(n=self.sample_size, random_state=42)

        relevant_news_ids = set()

        history_news = self.behaviors_df['history'].dropna().str.split().explode()
        relevant_news_ids.update(history_news)

        impression_news = self.behaviors_df['impressions'].dropna().str.split().explode()
        impression_news = impression_news.str.split('-').str[0]
        relevant_news_ids.update(impression_news)

        self.news_df = self.news_df[self.news_df['news_id'].isin(relevant_news_ids)]
        self.process_user_history()
        self.generate_news_embeddings()

        print(f"Data loaded successfully. Using {len(self.news_df)} news articles and {len(self.behaviors_df)} behavior records.")

    def process_user_history(self) -> None:
        for _, row in self.behaviors_df.iterrows():
            user_id = row['user_id']
            if pd.notna(row['history']):
                history = [h for h in row['history'].split() if h in self.news_df['news_id'].values]
            else:
                history = []

            if pd.notna(row['impressions']):
                impressions = [imp.split('-') for imp in row['impressions'].split()]
                clicked = [imp[0] for imp in impressions if imp[1] == '1' and imp[0] in self.news_df['news_id'].values]
                history.extend(clicked)

            if history:
                self.user_history[user_id] = list(set(history))

    def generate_news_embeddings(self) -> None:
        print("Generating embeddings for news articles...")

        self.news_df['content'] = self.news_df['title'] + ' ' + self.news_df['abstract'].fillna('')
        contents = self.news_df['content'].tolist()

        batch_size = 128
        embeddings = []

        for i in range(0, len(contents), batch_size):
            batch = contents[i:i + batch_size]
            batch_embeddings = self.searcher.model.encode(batch, convert_to_numpy=True)
            embeddings.extend(batch_embeddings)

        embeddings = np.array(embeddings)

        news_ids = self.news_df['news_id'].values
        for idx, news_id in enumerate(news_ids):
            self.news_embeddings[news_id] = embeddings[idx]

        print("Building FAISS index...")
        self.faiss_index = faiss.IndexFlatIP(self.searcher.dim)
        faiss.normalize_L2(embeddings)
        self.faiss_index.add(embeddings)
        self.news_id_map = {idx: news_id for idx, news_id in enumerate(news_ids)}
        print("FAISS index built successfully.")

    def get_user_preferences(self, user_id: str) -> Optional[np.ndarray]:
        if user_id not in self.user_history or not self.user_history[user_id]:
            return None

        history_embeddings = [self.news_embeddings[news_id]
                              for news_id in self.user_history[user_id]
                              if news_id in self.news_embeddings]

        if not history_embeddings:
            return None

        return np.mean(history_embeddings, axis=0)

    def recommend_articles(self, user_id: str, top_n: int = 5) -> Tuple[List[Dict], float]:
        start_time = time.time()
        user_embedding = self.get_user_preferences(user_id)

        if user_embedding is None:
            recommendations = self.get_popular_diverse_articles(top_n)
            response_time = time.time() - start_time
            return recommendations, response_time

        faiss.normalize_L2(user_embedding.reshape(1, -1))
        distances, indices = self.faiss_index.search(user_embedding.reshape(1, -1), top_n)
        recommended_ids = [self.news_id_map[idx] for idx in indices[0]]

        recommendations = []
        for idx, news_id in enumerate(recommended_ids):
            article = self.news_df[self.news_df['news_id'] == news_id].iloc[0]
            recommendations.append({
                'news_id': news_id,
                'title': article['title'],
                'category': article['category'],
                'abstract': article['abstract'],
                'similarity_score': float(1 - distances[0][idx])
            })

        response_time = time.time() - start_time
        return recommendations, response_time

    def get_popular_diverse_articles(self, top_n: int = 5) -> List[Dict]:
        category_counts = self.news_df['category'].value_counts()
        recommendations = []
        for category in category_counts.index[:top_n]:
            article = self.news_df[self.news_df['category'] == category].sample(1).iloc[0]
            recommendations.append({
                'news_id': article['news_id'],
                'title': article['title'],
                'category': article['category'],
                'abstract': article['abstract'],
                'similarity_score': None
            })
        return recommendations

    def calculate_metrics(self, test_behaviors: pd.DataFrame) -> Dict[str, float]:
        print("Calculating evaluation metrics...")

        y_true = []
        y_scores = []
        ndcg_scores = []
        reciprocal_ranks = []
        response_times = []

        for _, row in tqdm(test_behaviors.iterrows(), desc="Processing test behaviors"):
            user_id = row['user_id']

            # Get user embedding and timing
            start_time = time.time()
            user_embedding = self.get_user_preferences(user_id)

            if user_embedding is None:
                continue

            # Process impressions
            if pd.notna(row['impressions']):
                impressions = [imp.split('-') for imp in row['impressions'].split()]

                # Prepare ground truth and predictions for this impression
                true_labels = []
                pred_scores = []

                for news_id, label in impressions:
                    if news_id not in self.news_embeddings:
                        continue

                    article_embedding = self.news_embeddings[news_id]
                    similarity = 1 - np.dot(user_embedding, article_embedding) / (
                        np.linalg.norm(user_embedding) * np.linalg.norm(article_embedding))

                    true_labels.append(int(label))
                    pred_scores.append(-similarity)  # Negative because higher similarity should predict clicks

                if len(true_labels) > 1:  # Only calculate NDCG if we have more than one document
                    try:
                        ndcg = ndcg_score([true_labels], [pred_scores])
                        ndcg_scores.append(ndcg)
                    except ValueError:
                        pass  # Skip if NDCG calculation fails

            # AUC
                if true_labels and pred_scores:
                    y_true.extend(true_labels)
                    y_scores.extend(pred_scores)

            # MRR
                if true_labels and pred_scores:
                    sorted_pairs = sorted(zip(pred_scores, true_labels), reverse=True)
                    for rank, (_, label) in enumerate(sorted_pairs, 1):
                        if label == 1:
                            reciprocal_ranks.append(1.0 / rank)
                            break
                    else:
                            reciprocal_ranks.append(0)

                response_times.append(time.time() - start_time)

        metrics = {}

        # Calculate AUC
        if y_true and y_scores:
            metrics['auc'] = roc_auc_score(y_true, y_scores)

        # Calculate mean NDCG 
        if ndcg_scores:
            metrics['ndcg'] = np.mean(ndcg_scores)

        # Calculate MRR
        if reciprocal_ranks:
            metrics['mrr'] = np.mean(reciprocal_ranks)

        # Calculate response time metrics
        if response_times:
            metrics['avg_response_time'] = np.mean(response_times)
            metrics['max_response_time'] = np.max(response_times)
            metrics['min_response_time'] = np.min(response_times)

        return metrics
class SemanticSearch:
    def __init__(self):
        self.model = SentenceTransformer("sentence-transformers/stsb-roberta-base-v2")
        self.dim = self.model.get_sentence_embedding_dimension()

def recommend_news(user_id: str, news_path: str, behaviors_path: str,
                  sample_size: int = 15000) -> List[Dict]:
    print(f"Generating recommendations for user {user_id}...")

    recommender = NewsRecommender(sample_size=sample_size)
    recommender.load_data(news_path, behaviors_path)

    recommendations, _ = recommender.recommend_articles(user_id, top_n=5)
    return recommendations

def evaluate_recommender(news_path: str, behaviors_path: str, test_behaviors_path: str,
                        sample_size: int = 15000) -> Dict[str, float]:

    print("Initializing recommender system...")
    recommender = NewsRecommender(sample_size=sample_size)
    recommender.load_data(news_path, behaviors_path)

    print("Loading test behaviors...")
    test_behaviors = pd.read_csv(test_behaviors_path, sep='\t',
                               names=['impression_id', 'user_id', 'time',
                                     'history', 'impressions'])

    if len(test_behaviors) > sample_size:
        test_behaviors = test_behaviors.sample(n=sample_size, random_state=42)

    metrics = recommender.calculate_metrics(test_behaviors)

    print("\nEvaluation Results:")
    print(f"AUC Score: {metrics.get('auc', 'N/A'):.4f}")
    print(f"nDCG Score: {metrics.get('ndcg', 'N/A'):.4f}")
    print(f"MRR Score: {metrics.get('mrr', 'N/A'):.4f}")
    print(f"Average Response Time: {metrics.get('avg_response_time', 'N/A'):.4f} seconds")
    print(f"Max Response Time: {metrics.get('max_response_time', 'N/A'):.4f} seconds")
    print(f"Min Response Time: {metrics.get('min_response_time', 'N/A'):.4f} seconds")

    return metrics



news_path = 'news_V.tsv'
behaviors_path = 'behaviors.tsv'
test_behaviors_path = 'behaviors_V.tsv'

metrics = evaluate_recommender(
    news_path=news_path,
    behaviors_path=behaviors_path,
    test_behaviors_path=test_behaviors_path,
    sample_size=15000
)

Initializing recommender system...
Generating embeddings for news articles...
Building FAISS index...
FAISS index built successfully.
Data loaded successfully. Using 9881 news articles and 15000 behavior records.
Loading test behaviors...
Calculating evaluation metrics...


Processing test behaviors: 15000it [00:16, 890.20it/s] 



Evaluation Results:
AUC Score: 0.9378
nDCG Score: 0.4573
MRR Score: 0.4320
Average Response Time: 0.0010 seconds
Max Response Time: 0.0075 seconds
Min Response Time: 0.0000 seconds


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics import roc_auc_score, ndcg_score
from tqdm import tqdm
import time
from typing import Dict, List, Tuple, Optional

class NewsRecommender:
    def __init__(self, sample_size=15000):
        self.searcher = SemanticSearch()
        self.fitted = False
        self.news_df = None
        self.behaviors_df = None
        self.user_history = {}
        self.news_embeddings = {}
        self.sample_size = sample_size
        self.faiss_index = None

    def load_data(self, news_path: str, behaviors_path: str) -> None:
        self.news_df = pd.read_csv(news_path, sep='\t',
                                   names=['news_id', 'category', 'subcategory',
                                          'title', 'abstract', 'url', 'title_entities',
                                          'abstract_entities'])

        if len(self.news_df) > self.sample_size:
            self.news_df = self.news_df.sample(n=self.sample_size, random_state=42)

        self.behaviors_df = pd.read_csv(behaviors_path, sep='\t',
                                        names=['impression_id', 'user_id', 'time',
                                               'history', 'impressions'])

        if len(self.behaviors_df) > self.sample_size:
            self.behaviors_df = self.behaviors_df.sample(n=self.sample_size, random_state=42)

        relevant_news_ids = set()

        history_news = self.behaviors_df['history'].dropna().str.split().explode()
        relevant_news_ids.update(history_news)

        impression_news = self.behaviors_df['impressions'].dropna().str.split().explode()
        impression_news = impression_news.str.split('-').str[0]
        relevant_news_ids.update(impression_news)

        self.news_df = self.news_df[self.news_df['news_id'].isin(relevant_news_ids)]
        self.process_user_history()
        self.generate_news_embeddings()

        print(f"Data loaded successfully. Using {len(self.news_df)} news articles and {len(self.behaviors_df)} behavior records.")

    def process_user_history(self) -> None:
        for _, row in self.behaviors_df.iterrows():
            user_id = row['user_id']
            if pd.notna(row['history']):
                history = [h for h in row['history'].split() if h in self.news_df['news_id'].values]
            else:
                history = []

            if pd.notna(row['impressions']):
                impressions = [imp.split('-') for imp in row['impressions'].split()]
                clicked = [imp[0] for imp in impressions if imp[1] == '1' and imp[0] in self.news_df['news_id'].values]
                history.extend(clicked)

            if history:
                self.user_history[user_id] = list(set(history))

    def generate_news_embeddings(self) -> None:
        print("Generating embeddings for news articles...")

        self.news_df['content'] = self.news_df['title'] + ' ' + self.news_df['abstract'].fillna('')
        contents = self.news_df['content'].tolist()

        batch_size = 128
        embeddings = []

        for i in range(0, len(contents), batch_size):
            batch = contents[i:i + batch_size]
            batch_embeddings = self.searcher.model.encode(batch, convert_to_numpy=True)
            embeddings.extend(batch_embeddings)

        embeddings = np.array(embeddings)

        news_ids = self.news_df['news_id'].values
        for idx, news_id in enumerate(news_ids):
            self.news_embeddings[news_id] = embeddings[idx]

        print("Building FAISS index...")
        self.faiss_index = faiss.IndexFlatIP(self.searcher.dim)
        faiss.normalize_L2(embeddings)
        self.faiss_index.add(embeddings)
        self.news_id_map = {idx: news_id for idx, news_id in enumerate(news_ids)}
        print("FAISS index built successfully.")

    def get_user_preferences(self, user_id: str) -> Optional[np.ndarray]:
        if user_id not in self.user_history or not self.user_history[user_id]:
            return None

        history_embeddings = [self.news_embeddings[news_id]
                              for news_id in self.user_history[user_id]
                              if news_id in self.news_embeddings]

        if not history_embeddings:
            return None

        return np.mean(history_embeddings, axis=0)

    def recommend_articles(self, user_id: str, top_n: int = 5) -> Tuple[List[Dict], float]:
        start_time = time.time()
        user_embedding = self.get_user_preferences(user_id)

        if user_embedding is None:
            recommendations = self.get_popular_diverse_articles(top_n)
            response_time = time.time() - start_time
            return recommendations, response_time

        faiss.normalize_L2(user_embedding.reshape(1, -1))
        distances, indices = self.faiss_index.search(user_embedding.reshape(1, -1), top_n)
        recommended_ids = [self.news_id_map[idx] for idx in indices[0]]

        recommendations = []
        for idx, news_id in enumerate(recommended_ids):
            article = self.news_df[self.news_df['news_id'] == news_id].iloc[0]
            recommendations.append({
                'news_id': news_id,
                'title': article['title'],
                'category': article['category'],
                'abstract': article['abstract'],
                'similarity_score': float(1 - distances[0][idx])
            })

        response_time = time.time() - start_time
        return recommendations, response_time

    def get_popular_diverse_articles(self, top_n: int = 5) -> List[Dict]:
        category_counts = self.news_df['category'].value_counts()
        recommendations = []
        for category in category_counts.index[:top_n]:
            article = self.news_df[self.news_df['category'] == category].sample(1).iloc[0]
            recommendations.append({
                'news_id': article['news_id'],
                'title': article['title'],
                'category': article['category'],
                'abstract': article['abstract'],
                'similarity_score': None
            })
        return recommendations

    def calculate_metrics(self, test_behaviors: pd.DataFrame) -> Dict[str, float]:
        print("Calculating evaluation metrics...")

        y_true = []
        y_scores = []
        ndcg_scores = []
        reciprocal_ranks = []
        response_times = []

        for _, row in tqdm(test_behaviors.iterrows(), desc="Processing test behaviors"):
            user_id = row['user_id']

            # Get user embedding and timing
            start_time = time.time()
            user_embedding = self.get_user_preferences(user_id)

            if user_embedding is None:
                continue

            # Process impressions
            if pd.notna(row['impressions']):
                impressions = [imp.split('-') for imp in row['impressions'].split()]

                # Prepare ground truth and predictions for this impression
                true_labels = []
                pred_scores = []

                for news_id, label in impressions:
                    if news_id not in self.news_embeddings:
                        continue

                    article_embedding = self.news_embeddings[news_id]
                    similarity = 1 - np.dot(user_embedding, article_embedding) / (
                        np.linalg.norm(user_embedding) * np.linalg.norm(article_embedding))

                    true_labels.append(int(label))
                    pred_scores.append(-similarity)  # Negative because higher similarity should predict clicks

                if len(true_labels) > 1:  # Only calculate NDCG if we have more than one document
                    try:
                        ndcg = ndcg_score([true_labels], [pred_scores])
                        ndcg_scores.append(ndcg)
                    except ValueError:
                        pass  # Skip if NDCG calculation fails

            # AUC
                if true_labels and pred_scores:
                    y_true.extend(true_labels)
                    y_scores.extend(pred_scores)

            # MRR
                if true_labels and pred_scores:
                    sorted_pairs = sorted(zip(pred_scores, true_labels), reverse=True)
                    for rank, (_, label) in enumerate(sorted_pairs, 1):
                        if label == 1:
                            reciprocal_ranks.append(1.0 / rank)
                            break
                    else:
                            reciprocal_ranks.append(0)

                response_times.append(time.time() - start_time)

        metrics = {}

        # Calculate AUC 
        if y_true and y_scores:
            metrics['auc'] = roc_auc_score(y_true, y_scores)

        # Calculate mean NDCG
        if ndcg_scores:
            metrics['ndcg'] = np.mean(ndcg_scores)

        # Calculate MRR 
        if reciprocal_ranks:
            metrics['mrr'] = np.mean(reciprocal_ranks)

        # Calculate response time metrics
        if response_times:
            metrics['avg_response_time'] = np.mean(response_times)
            metrics['max_response_time'] = np.max(response_times)
            metrics['min_response_time'] = np.min(response_times)

        return metrics
class SemanticSearch:
    def __init__(self):
        self.model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
        self.dim = self.model.get_sentence_embedding_dimension()

def recommend_news(user_id: str, news_path: str, behaviors_path: str,
                  sample_size: int = 15000) -> List[Dict]:
    print(f"Generating recommendations for user {user_id}...")

    recommender = NewsRecommender(sample_size=sample_size)
    recommender.load_data(news_path, behaviors_path)

    recommendations, _ = recommender.recommend_articles(user_id, top_n=5)
    return recommendations

def evaluate_recommender(news_path: str, behaviors_path: str, test_behaviors_path: str,
                        sample_size: int = 15000) -> Dict[str, float]:

    print("Initializing recommender system...")
    recommender = NewsRecommender(sample_size=sample_size)
    recommender.load_data(news_path, behaviors_path)

    print("Loading test behaviors...")
    test_behaviors = pd.read_csv(test_behaviors_path, sep='\t',
                               names=['impression_id', 'user_id', 'time',
                                     'history', 'impressions'])

    if len(test_behaviors) > sample_size:
        test_behaviors = test_behaviors.sample(n=sample_size, random_state=42)

    metrics = recommender.calculate_metrics(test_behaviors)

    print("\nEvaluation Results:")
    print(f"AUC Score: {metrics.get('auc', 'N/A'):.4f}")
    print(f"nDCG Score: {metrics.get('ndcg', 'N/A'):.4f}")
    print(f"MRR Score: {metrics.get('mrr', 'N/A'):.4f}")
    print(f"Average Response Time: {metrics.get('avg_response_time', 'N/A'):.4f} seconds")
    print(f"Max Response Time: {metrics.get('max_response_time', 'N/A'):.4f} seconds")
    print(f"Min Response Time: {metrics.get('min_response_time', 'N/A'):.4f} seconds")

    return metrics



news_path = 'news_V.tsv'
behaviors_path = 'behaviors.tsv'
test_behaviors_path = 'behaviors_V.tsv'

metrics = evaluate_recommender(
    news_path=news_path,
    behaviors_path=behaviors_path,
    test_behaviors_path=test_behaviors_path,
    sample_size=15000
)

Initializing recommender system...
Generating embeddings for news articles...
Building FAISS index...
FAISS index built successfully.
Data loaded successfully. Using 9881 news articles and 15000 behavior records.
Loading test behaviors...
Calculating evaluation metrics...


Processing test behaviors: 15000it [00:16, 910.25it/s] 



Evaluation Results:
AUC Score: 0.9434
nDCG Score: 0.4641
MRR Score: 0.4398
Average Response Time: 0.0010 seconds
Max Response Time: 0.0078 seconds
Min Response Time: 0.0000 seconds


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics import roc_auc_score, ndcg_score
from tqdm import tqdm
import time
from typing import Dict, List, Tuple, Optional

class NewsRecommender:
    def __init__(self, sample_size=15000):
        self.searcher = SemanticSearch()
        self.fitted = False
        self.news_df = None
        self.behaviors_df = None
        self.user_history = {}
        self.news_embeddings = {}
        self.sample_size = sample_size
        self.faiss_index = None

    def load_data(self, news_path: str, behaviors_path: str) -> None:
        self.news_df = pd.read_csv(news_path, sep='\t',
                                   names=['news_id', 'category', 'subcategory',
                                          'title', 'abstract', 'url', 'title_entities',
                                          'abstract_entities'])

        if len(self.news_df) > self.sample_size:
            self.news_df = self.news_df.sample(n=self.sample_size, random_state=42)

        self.behaviors_df = pd.read_csv(behaviors_path, sep='\t',
                                        names=['impression_id', 'user_id', 'time',
                                               'history', 'impressions'])

        if len(self.behaviors_df) > self.sample_size:
            self.behaviors_df = self.behaviors_df.sample(n=self.sample_size, random_state=42)

        relevant_news_ids = set()

        history_news = self.behaviors_df['history'].dropna().str.split().explode()
        relevant_news_ids.update(history_news)

        impression_news = self.behaviors_df['impressions'].dropna().str.split().explode()
        impression_news = impression_news.str.split('-').str[0]
        relevant_news_ids.update(impression_news)

        self.news_df = self.news_df[self.news_df['news_id'].isin(relevant_news_ids)]
        self.process_user_history()
        self.generate_news_embeddings()

        print(f"Data loaded successfully. Using {len(self.news_df)} news articles and {len(self.behaviors_df)} behavior records.")

    def process_user_history(self) -> None:
        for _, row in self.behaviors_df.iterrows():
            user_id = row['user_id']
            if pd.notna(row['history']):
                history = [h for h in row['history'].split() if h in self.news_df['news_id'].values]
            else:
                history = []

            if pd.notna(row['impressions']):
                impressions = [imp.split('-') for imp in row['impressions'].split()]
                clicked = [imp[0] for imp in impressions if imp[1] == '1' and imp[0] in self.news_df['news_id'].values]
                history.extend(clicked)

            if history:
                self.user_history[user_id] = list(set(history))

    def generate_news_embeddings(self) -> None:
        print("Generating embeddings for news articles...")

        self.news_df['content'] = self.news_df['title'] + ' ' + self.news_df['abstract'].fillna('')
        contents = self.news_df['content'].tolist()

        batch_size = 128
        embeddings = []

        for i in range(0, len(contents), batch_size):
            batch = contents[i:i + batch_size]
            batch_embeddings = self.searcher.model.encode(batch, convert_to_numpy=True)
            embeddings.extend(batch_embeddings)

        embeddings = np.array(embeddings)

        news_ids = self.news_df['news_id'].values
        for idx, news_id in enumerate(news_ids):
            self.news_embeddings[news_id] = embeddings[idx]

        print("Building FAISS index...")
        self.faiss_index = faiss.IndexFlatIP(self.searcher.dim)
        faiss.normalize_L2(embeddings)
        self.faiss_index.add(embeddings)
        self.news_id_map = {idx: news_id for idx, news_id in enumerate(news_ids)}
        print("FAISS index built successfully.")

    def get_user_preferences(self, user_id: str) -> Optional[np.ndarray]:
        if user_id not in self.user_history or not self.user_history[user_id]:
            return None

        history_embeddings = [self.news_embeddings[news_id]
                              for news_id in self.user_history[user_id]
                              if news_id in self.news_embeddings]

        if not history_embeddings:
            return None

        return np.mean(history_embeddings, axis=0)

    def recommend_articles(self, user_id: str, top_n: int = 5) -> Tuple[List[Dict], float]:
        start_time = time.time()
        user_embedding = self.get_user_preferences(user_id)

        if user_embedding is None:
            recommendations = self.get_popular_diverse_articles(top_n)
            response_time = time.time() - start_time
            return recommendations, response_time

        faiss.normalize_L2(user_embedding.reshape(1, -1))
        distances, indices = self.faiss_index.search(user_embedding.reshape(1, -1), top_n)
        recommended_ids = [self.news_id_map[idx] for idx in indices[0]]

        recommendations = []
        for idx, news_id in enumerate(recommended_ids):
            article = self.news_df[self.news_df['news_id'] == news_id].iloc[0]
            recommendations.append({
                'news_id': news_id,
                'title': article['title'],
                'category': article['category'],
                'abstract': article['abstract'],
                'similarity_score': float(1 - distances[0][idx])
            })

        response_time = time.time() - start_time
        return recommendations, response_time

    def get_popular_diverse_articles(self, top_n: int = 5) -> List[Dict]:
        category_counts = self.news_df['category'].value_counts()
        recommendations = []
        for category in category_counts.index[:top_n]:
            article = self.news_df[self.news_df['category'] == category].sample(1).iloc[0]
            recommendations.append({
                'news_id': article['news_id'],
                'title': article['title'],
                'category': article['category'],
                'abstract': article['abstract'],
                'similarity_score': None
            })
        return recommendations

    def calculate_metrics(self, test_behaviors: pd.DataFrame) -> Dict[str, float]:
        print("Calculating evaluation metrics...")

        y_true = []
        y_scores = []
        ndcg_scores = []
        reciprocal_ranks = []
        response_times = []

        for _, row in tqdm(test_behaviors.iterrows(), desc="Processing test behaviors"):
            user_id = row['user_id']

            # Get user embedding and timing
            start_time = time.time()
            user_embedding = self.get_user_preferences(user_id)

            if user_embedding is None:
                continue

            # Process impressions
            if pd.notna(row['impressions']):
                impressions = [imp.split('-') for imp in row['impressions'].split()]

                # Prepare ground truth and predictions for this impression
                true_labels = []
                pred_scores = []

                for news_id, label in impressions:
                    if news_id not in self.news_embeddings:
                        continue

                    article_embedding = self.news_embeddings[news_id]
                    similarity = 1 - np.dot(user_embedding, article_embedding) / (
                        np.linalg.norm(user_embedding) * np.linalg.norm(article_embedding))

                    true_labels.append(int(label))
                    pred_scores.append(-similarity)  # Negative because higher similarity should predict clicks

                if len(true_labels) > 1:  # Only calculate NDCG if we have more than one document
                    try:
                        ndcg = ndcg_score([true_labels], [pred_scores])
                        ndcg_scores.append(ndcg)
                    except ValueError:
                        pass  # Skip if NDCG calculation fails

            # AUC
                if true_labels and pred_scores:
                    y_true.extend(true_labels)
                    y_scores.extend(pred_scores)

            # MRR
                if true_labels and pred_scores:
                    sorted_pairs = sorted(zip(pred_scores, true_labels), reverse=True)
                    for rank, (_, label) in enumerate(sorted_pairs, 1):
                        if label == 1:
                            reciprocal_ranks.append(1.0 / rank)
                            break
                    else:
                            reciprocal_ranks.append(0)

                response_times.append(time.time() - start_time)

        metrics = {}

        # Calculate AUC
        if y_true and y_scores:
            metrics['auc'] = roc_auc_score(y_true, y_scores)

        # Calculate mean NDCG
        if ndcg_scores:
            metrics['ndcg'] = np.mean(ndcg_scores)

        # Calculate MRR
        if reciprocal_ranks:
            metrics['mrr'] = np.mean(reciprocal_ranks)

        # Calculate response time metrics
        if response_times:
            metrics['avg_response_time'] = np.mean(response_times)
            metrics['max_response_time'] = np.max(response_times)
            metrics['min_response_time'] = np.min(response_times)

        return metrics
class SemanticSearch:
    def __init__(self):
        self.model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
        self.dim = self.model.get_sentence_embedding_dimension()

def recommend_news(user_id: str, news_path: str, behaviors_path: str,
                  sample_size: int = 15000) -> List[Dict]:
    print(f"Generating recommendations for user {user_id}...")

    recommender = NewsRecommender(sample_size=sample_size)
    recommender.load_data(news_path, behaviors_path)

    recommendations, _ = recommender.recommend_articles(user_id, top_n=5)
    return recommendations

def evaluate_recommender(news_path: str, behaviors_path: str, test_behaviors_path: str,
                        sample_size: int = 15000) -> Dict[str, float]:
    print("Initializing recommender system...")
    recommender = NewsRecommender(sample_size=sample_size)
    recommender.load_data(news_path, behaviors_path)

    print("Loading test behaviors...")
    test_behaviors = pd.read_csv(test_behaviors_path, sep='\t',
                               names=['impression_id', 'user_id', 'time',
                                     'history', 'impressions'])

    if len(test_behaviors) > sample_size:
        test_behaviors = test_behaviors.sample(n=sample_size, random_state=42)

    metrics = recommender.calculate_metrics(test_behaviors)

    print("\nEvaluation Results:")
    print(f"AUC Score: {metrics.get('auc', 'N/A'):.4f}")
    print(f"nDCG Score: {metrics.get('ndcg', 'N/A'):.4f}")
    print(f"MRR Score: {metrics.get('mrr', 'N/A'):.4f}")
    print(f"Average Response Time: {metrics.get('avg_response_time', 'N/A'):.4f} seconds")
    print(f"Max Response Time: {metrics.get('max_response_time', 'N/A'):.4f} seconds")
    print(f"Min Response Time: {metrics.get('min_response_time', 'N/A'):.4f} seconds")

    return metrics



news_path = 'news_V.tsv'
behaviors_path = 'behaviors.tsv'
test_behaviors_path = 'behaviors_V.tsv'

metrics = evaluate_recommender(
    news_path=news_path,
    behaviors_path=behaviors_path,
    test_behaviors_path=test_behaviors_path,
    sample_size=15000
)

Initializing recommender system...
Generating embeddings for news articles...
Building FAISS index...
FAISS index built successfully.
Data loaded successfully. Using 9881 news articles and 15000 behavior records.
Loading test behaviors...
Calculating evaluation metrics...


Processing test behaviors: 15000it [00:16, 893.23it/s]



Evaluation Results:
AUC Score: 0.9434
nDCG Score: 0.4641
MRR Score: 0.4398
Average Response Time: 0.0010 seconds
Max Response Time: 0.0106 seconds
Min Response Time: 0.0000 seconds


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics import roc_auc_score, ndcg_score
from tqdm import tqdm
import time
from typing import Dict, List, Tuple, Optional

class NewsRecommender:
    def __init__(self, sample_size=15000):
        self.searcher = SemanticSearch()
        self.fitted = False
        self.news_df = None
        self.behaviors_df = None
        self.user_history = {}
        self.news_embeddings = {}
        self.sample_size = sample_size
        self.faiss_index = None

    def load_data(self, news_path: str, behaviors_path: str) -> None:
        self.news_df = pd.read_csv(news_path, sep='\t',
                                   names=['news_id', 'category', 'subcategory',
                                          'title', 'abstract', 'url', 'title_entities',
                                          'abstract_entities'])

        if len(self.news_df) > self.sample_size:
            self.news_df = self.news_df.sample(n=self.sample_size, random_state=42)

        self.behaviors_df = pd.read_csv(behaviors_path, sep='\t',
                                        names=['impression_id', 'user_id', 'time',
                                               'history', 'impressions'])

        if len(self.behaviors_df) > self.sample_size:
            self.behaviors_df = self.behaviors_df.sample(n=self.sample_size, random_state=42)

        relevant_news_ids = set()

        history_news = self.behaviors_df['history'].dropna().str.split().explode()
        relevant_news_ids.update(history_news)

        impression_news = self.behaviors_df['impressions'].dropna().str.split().explode()
        impression_news = impression_news.str.split('-').str[0]
        relevant_news_ids.update(impression_news)

        self.news_df = self.news_df[self.news_df['news_id'].isin(relevant_news_ids)]
        self.process_user_history()
        self.generate_news_embeddings()

        print(f"Data loaded successfully. Using {len(self.news_df)} news articles and {len(self.behaviors_df)} behavior records.")

    def process_user_history(self) -> None:
        for _, row in self.behaviors_df.iterrows():
            user_id = row['user_id']
            if pd.notna(row['history']):
                history = [h for h in row['history'].split() if h in self.news_df['news_id'].values]
            else:
                history = []

            if pd.notna(row['impressions']):
                impressions = [imp.split('-') for imp in row['impressions'].split()]
                clicked = [imp[0] for imp in impressions if imp[1] == '1' and imp[0] in self.news_df['news_id'].values]
                history.extend(clicked)

            if history:
                self.user_history[user_id] = list(set(history))

    def generate_news_embeddings(self) -> None:
        print("Generating embeddings for news articles...")

        self.news_df['content'] = self.news_df['title'] + ' ' + self.news_df['abstract'].fillna('')
        contents = self.news_df['content'].tolist()

        batch_size = 128
        embeddings = []

        for i in range(0, len(contents), batch_size):
            batch = contents[i:i + batch_size]
            batch_embeddings = self.searcher.model.encode(batch, convert_to_numpy=True)
            embeddings.extend(batch_embeddings)

        embeddings = np.array(embeddings)

        news_ids = self.news_df['news_id'].values
        for idx, news_id in enumerate(news_ids):
            self.news_embeddings[news_id] = embeddings[idx]

        print("Building FAISS index...")
        self.faiss_index = faiss.IndexFlatIP(self.searcher.dim)
        faiss.normalize_L2(embeddings)
        self.faiss_index.add(embeddings)
        self.news_id_map = {idx: news_id for idx, news_id in enumerate(news_ids)}
        print("FAISS index built successfully.")

    def get_user_preferences(self, user_id: str) -> Optional[np.ndarray]:
        if user_id not in self.user_history or not self.user_history[user_id]:
            return None

        history_embeddings = [self.news_embeddings[news_id]
                              for news_id in self.user_history[user_id]
                              if news_id in self.news_embeddings]

        if not history_embeddings:
            return None

        return np.mean(history_embeddings, axis=0)

    def recommend_articles(self, user_id: str, top_n: int = 5) -> Tuple[List[Dict], float]:
        start_time = time.time()
        user_embedding = self.get_user_preferences(user_id)

        if user_embedding is None:
            recommendations = self.get_popular_diverse_articles(top_n)
            response_time = time.time() - start_time
            return recommendations, response_time

        faiss.normalize_L2(user_embedding.reshape(1, -1))
        distances, indices = self.faiss_index.search(user_embedding.reshape(1, -1), top_n)
        recommended_ids = [self.news_id_map[idx] for idx in indices[0]]

        recommendations = []
        for idx, news_id in enumerate(recommended_ids):
            article = self.news_df[self.news_df['news_id'] == news_id].iloc[0]
            recommendations.append({
                'news_id': news_id,
                'title': article['title'],
                'category': article['category'],
                'abstract': article['abstract'],
                'similarity_score': float(1 - distances[0][idx])
            })

        response_time = time.time() - start_time
        return recommendations, response_time

    def get_popular_diverse_articles(self, top_n: int = 5) -> List[Dict]:
        category_counts = self.news_df['category'].value_counts()
        recommendations = []
        for category in category_counts.index[:top_n]:
            article = self.news_df[self.news_df['category'] == category].sample(1).iloc[0]
            recommendations.append({
                'news_id': article['news_id'],
                'title': article['title'],
                'category': article['category'],
                'abstract': article['abstract'],
                'similarity_score': None
            })
        return recommendations

    def calculate_metrics(self, test_behaviors: pd.DataFrame) -> Dict[str, float]:
        print("Calculating evaluation metrics...")

        y_true = []
        y_scores = []
        ndcg_scores = []
        reciprocal_ranks = []
        response_times = []

        for _, row in tqdm(test_behaviors.iterrows(), desc="Processing test behaviors"):
            user_id = row['user_id']

            # Get user embedding and timing
            start_time = time.time()
            user_embedding = self.get_user_preferences(user_id)

            if user_embedding is None:
                continue

            # Process impressions
            if pd.notna(row['impressions']):
                impressions = [imp.split('-') for imp in row['impressions'].split()]

                # Prepare ground truth and predictions for this impression
                true_labels = []
                pred_scores = []

                for news_id, label in impressions:
                    if news_id not in self.news_embeddings:
                        continue

                    article_embedding = self.news_embeddings[news_id]
                    similarity = 1 - np.dot(user_embedding, article_embedding) / (
                        np.linalg.norm(user_embedding) * np.linalg.norm(article_embedding))

                    true_labels.append(int(label))
                    pred_scores.append(-similarity)  # Negative because higher similarity should predict clicks

                if len(true_labels) > 1:  # Only calculate NDCG if we have more than one document
                    try:
                        ndcg = ndcg_score([true_labels], [pred_scores])
                        ndcg_scores.append(ndcg)
                    except ValueError:
                        pass  # Skip if NDCG calculation fails

            # AUC
                if true_labels and pred_scores:
                    y_true.extend(true_labels)
                    y_scores.extend(pred_scores)

            # MRR
                if true_labels and pred_scores:
                    sorted_pairs = sorted(zip(pred_scores, true_labels), reverse=True)
                    for rank, (_, label) in enumerate(sorted_pairs, 1):
                        if label == 1:
                            reciprocal_ranks.append(1.0 / rank)
                            break
                    else:
                            reciprocal_ranks.append(0)

                response_times.append(time.time() - start_time)

        metrics = {}

        # Calculate AUC
        if y_true and y_scores:
            metrics['auc'] = roc_auc_score(y_true, y_scores)

        # Calculate mean NDCG 
        if ndcg_scores:
            metrics['ndcg'] = np.mean(ndcg_scores)

        # Calculate MRR
        if reciprocal_ranks:
            metrics['mrr'] = np.mean(reciprocal_ranks)

        # Calculate response time metrics
        if response_times:
            metrics['avg_response_time'] = np.mean(response_times)
            metrics['max_response_time'] = np.max(response_times)
            metrics['min_response_time'] = np.min(response_times)

        return metrics
class SemanticSearch:
    def __init__(self):
        self.model = SentenceTransformer("sentence-transformers/paraphrase-MPNet-base-v2")
        self.dim = self.model.get_sentence_embedding_dimension()

def recommend_news(user_id: str, news_path: str, behaviors_path: str,
                  sample_size: int = 15000) -> List[Dict]:
    print(f"Generating recommendations for user {user_id}...")

    recommender = NewsRecommender(sample_size=sample_size)
    recommender.load_data(news_path, behaviors_path)

    recommendations, _ = recommender.recommend_articles(user_id, top_n=5)
    return recommendations

def evaluate_recommender(news_path: str, behaviors_path: str, test_behaviors_path: str,
                        sample_size: int = 15000) -> Dict[str, float]:
    print("Initializing recommender system...")
    recommender = NewsRecommender(sample_size=sample_size)
    recommender.load_data(news_path, behaviors_path)

    print("Loading test behaviors...")
    test_behaviors = pd.read_csv(test_behaviors_path, sep='\t',
                               names=['impression_id', 'user_id', 'time',
                                     'history', 'impressions'])

    if len(test_behaviors) > sample_size:
        test_behaviors = test_behaviors.sample(n=sample_size, random_state=42)

    metrics = recommender.calculate_metrics(test_behaviors)

    print("\nEvaluation Results:")
    print(f"AUC Score: {metrics.get('auc', 'N/A'):.4f}")
    print(f"nDCG Score: {metrics.get('ndcg', 'N/A'):.4f}")
    print(f"MRR Score: {metrics.get('mrr', 'N/A'):.4f}")
    print(f"Average Response Time: {metrics.get('avg_response_time', 'N/A'):.4f} seconds")
    print(f"Max Response Time: {metrics.get('max_response_time', 'N/A'):.4f} seconds")
    print(f"Min Response Time: {metrics.get('min_response_time', 'N/A'):.4f} seconds")

    return metrics



news_path = 'news_V.tsv'
behaviors_path = 'behaviors.tsv'
test_behaviors_path = 'behaviors_V.tsv'

metrics = evaluate_recommender(
    news_path=news_path,
    behaviors_path=behaviors_path,
    test_behaviors_path=test_behaviors_path,
    sample_size=15000
)

Initializing recommender system...
Generating embeddings for news articles...
Building FAISS index...
FAISS index built successfully.
Data loaded successfully. Using 9881 news articles and 15000 behavior records.
Loading test behaviors...
Calculating evaluation metrics...


Processing test behaviors: 15000it [00:18, 804.96it/s]



Evaluation Results:
AUC Score: 0.9392
nDCG Score: 0.4624
MRR Score: 0.4378
Average Response Time: 0.0011 seconds
Max Response Time: 0.0184 seconds
Min Response Time: 0.0000 seconds
