# Most-popular method

In [17]:
# import libraries and datasets

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime, timedelta

# Column layouts shared by the dev and train splits (the files are read with header=None)
BEHAVIORS_COLUMNS = ["Impression ID", "User ID", "Time", "History", "Impressions"]
NEWS_COLUMNS = [
    "News ID", "Category", "Subcategory", "Title", "Abstract", "URL",
    "Title Entities", "Abstract Entities", "Title Topics", "Abstract Topics",
]

# dev split
behaviors_dev_df = pd.read_csv(
    "data/MINDsmall_dev/behaviors.tsv", sep="\t", header=None, names=BEHAVIORS_COLUMNS
)
news_dev_df = pd.read_csv(
    "data/MINDsmall_dev/news.tsv", sep="\t", header=None, names=NEWS_COLUMNS
)

# train split
behaviors_train_df = pd.read_csv(
    "data/MINDsmall_train/behaviors.tsv", sep="\t", header=None, names=BEHAVIORS_COLUMNS
)
news_train_df = pd.read_csv(
    "data/MINDsmall_train/news.tsv", sep="\t", header=None, names=NEWS_COLUMNS
)


In [18]:
# Fill missing abstracts with a placeholder.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: that form is a chained
# assignment, which does not update the parent DataFrame under pandas
# Copy-on-Write and raises a FutureWarning on pandas 2.x.
news_dev_df['Abstract'] = news_dev_df['Abstract'].fillna('No abstract available')
news_train_df['Abstract'] = news_train_df['Abstract'].fillna('No abstract available')


# Drop rows that have no impressions.
# NOTE: this loses some user information. The affected rows could instead be
# repaired manually, filling in the missing values based on the typo that
# combined the impression and history columns.
behaviors_dev_df = behaviors_dev_df.dropna(subset=['Impressions'])
behaviors_train_df = behaviors_train_df.dropna(subset=['Impressions'])


In [19]:
def find_most_popular_news(behaviors_df, n):
    """Rank news articles by recency-weighted clicks over the latest 24 hours.

    The 24-hour window is anchored at the most recent timestamp found in
    ``behaviors_df`` (not at the wall clock). Every clicked impression adds a
    weight that falls linearly from 1 (at the most recent timestamp) to 0
    (24 hours earlier) to its article's score.

    Parameters
    ----------
    behaviors_df : pandas.DataFrame
        Needs a 'Time' column that ``pd.to_datetime`` can parse and an
        'Impressions' column holding whitespace-separated ``<news id>-<flag>``
        tokens, where a flag of ``1`` marks a click. The frame is not modified.
    n : int
        Maximum number of articles to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'News ID' and 'Weighted Clicks', sorted by descending
        'Weighted Clicks' and truncated to the first ``n`` rows. Empty (but
        with both columns) when no click falls inside the window.
    """
    # Parse timestamps into a separate Series so the caller's 'Time' column
    # keeps its original values and dtype.
    interaction_times = pd.to_datetime(behaviors_df['Time'])

    # Find the most recent interaction time
    most_recent_interaction_time = interaction_times.max()

    # Consider only interactions within the last 24 hours
    in_window = interaction_times >= most_recent_interaction_time - timedelta(days=1)

    # Accumulate recency-weighted click counts per article
    weighted_clicks = {}
    recent_rows = zip(interaction_times[in_window], behaviors_df['Impressions'][in_window])
    for interaction_time, impressions in recent_rows:
        # Rows with missing impressions (NaN/None) carry no clicks to count
        if not isinstance(impressions, str):
            continue

        # Fraction of the 24-hour window elapsed since this interaction,
        # inverted so that recent interactions have the higher weight
        elapsed = (most_recent_interaction_time - interaction_time).total_seconds() / (24 * 3600)
        weight = 1 - elapsed

        # split() without an argument tolerates repeated/trailing whitespace
        for impression in impressions.split():
            # Split on the last '-' only; tokens without a '-' are ignored
            article_id, separator, clicked = impression.rpartition("-")
            if separator and clicked == "1":
                weighted_clicks[article_id] = weighted_clicks.get(article_id, 0) + weight

    # Convert dictionary to DataFrame for sorting
    weighted_clicks_df = pd.DataFrame(list(weighted_clicks.items()), columns=['News ID', 'Weighted Clicks'])

    # Sort by weighted clicks to get the most popular articles
    weighted_clicks_df = weighted_clicks_df.sort_values(by='Weighted Clicks', ascending=False)

    # Return the top n most popular news articles based on recency-weighted clicks
    return weighted_clicks_df.head(n)

In [20]:
# Find and print the most popular news articles
# (the call is kept active so the output saved below this cell is reproduced
# on a fresh Restart & Run All)
most_popular_news = find_most_popular_news(behaviors_df=behaviors_train_df, n=10)
print(most_popular_news)

    News ID  Weighted Clicks
32   N23446       707.192535
33   N38779       574.013322
46   N61233       559.266609
129  N45523       556.067720
30   N19661       493.098623
8    N34185       446.595544
59   N56211       390.775856
90   N41934       388.134063
24    N6837       335.275995
6     N6477       325.801343
