In [None]:
import pandas as pd
from scipy.sparse import lil_matrix

In [None]:
# Goal of this notebook: attach to every article the first time it shows up in
# the behaviors log (stored in a ReleaseDate column by get_release_date below).
# This cell loads the MIND-small train and dev splits and stacks them.
_BEHAVIOR_COLUMNS = ["UserID", "DateTime", "History", "ClickData"]
_NEWS_COLUMNS = ["NewsID", "Category", "SubCategory", "Title", "Abstract", "URL", "TitleEntities", "AbstractEntities"]


def _read_mind_tsv(path, column_names):
    """Read a header-less, tab-separated file and label its columns."""
    return pd.read_csv(path, sep="\t", header=None, names=column_names)


# Behaviors: one row per logged impression.
AllTrainingData = _read_mind_tsv("../data/MINDsmall_train/behaviors.tsv", _BEHAVIOR_COLUMNS)
AllValidationData = _read_mind_tsv("../data/MINDsmall_dev/behaviors.tsv", _BEHAVIOR_COLUMNS)
AllData = pd.concat([AllTrainingData, AllValidationData], ignore_index=True)

# News catalogues for both splits.
# NOTE(review): the two catalogues are stacked without de-duplication; if they
# share NewsIDs, AllArticles will contain repeated rows - confirm this is intended.
ArticlesTrain = _read_mind_tsv("../data/MINDsmall_train/news.tsv", _NEWS_COLUMNS)
ArticlesValidation = _read_mind_tsv("../data/MINDsmall_dev/news.tsv", _NEWS_COLUMNS)
AllArticles = pd.concat([ArticlesTrain, ArticlesValidation], ignore_index=True)

In [None]:
def get_sparse_matrix(behaviors):
    """Build a sparse (impression x article) table of click histories.

    Each distinct (UserID, DateTime) pair in ``behaviors`` becomes one row and
    every article ID found in any ``History`` entry becomes one boolean column
    that is True when the article appears in that impression's history.

    The input frame is not modified, so the function can safely be called
    again on the same data (for example when a notebook cell is re-run).

    Parameters
    ----------
    behaviors : pandas.DataFrame
        Must provide the columns ``UserID``, ``DateTime`` and ``History``.
        ``History`` entries are whitespace-separated article IDs; entries that
        are already lists/tuples are used as they are, and anything else
        (e.g. NaN) is treated as an empty history.

    Returns
    -------
    pandas.DataFrame
        Columns ``UserID`` and ``DateTime`` (both as strings) followed by one
        sparse boolean column per article ID in sorted order. Rows are ordered
        by the combined "UserID, DateTime" string and carry a default
        RangeIndex.
    """
    def _as_article_list(entry):
        # Normalise one History cell to a list of article IDs.
        if isinstance(entry, str):
            return entry.split()
        if isinstance(entry, (list, tuple)):
            return list(entry)
        return []

    # Work on plain Python lists so the caller's DataFrame stays untouched.
    histories = [_as_article_list(entry) for entry in behaviors['History']]
    user_ids = behaviors['UserID'].astype(str).tolist()
    date_times = behaviors['DateTime'].astype(str).tolist()

    # Column layout: one column per distinct article, in sorted order.
    articles = sorted({article for history in histories for article in history})
    article_to_index = {article: index for index, article in enumerate(articles)}

    # Row layout: one row per distinct (UserID, DateTime) pair. The pair is
    # kept as a tuple (rather than joined and split again later) so both parts
    # come back exactly as they went in; sorting uses the joined string.
    row_keys = sorted(set(zip(user_ids, date_times)), key=', '.join)
    row_to_index = {key: index for index, key in enumerate(row_keys)}

    # LIL format supports cheap incremental cell assignment.
    sparse_matrix = lil_matrix((len(row_keys), len(articles)), dtype=bool)
    for user_id, date_time, history in zip(user_ids, date_times, histories):
        row_index = row_to_index[(user_id, date_time)]
        for article in history:
            sparse_matrix[row_index, article_to_index[article]] = True

    # Assemble the result: key columns first, then the sparse article flags.
    clicks = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, columns=articles)
    keys = pd.DataFrame(row_keys, columns=['UserID', 'DateTime'])
    return pd.concat([keys, clicks], axis=1)

In [None]:
def get_release_date(data_sparse, news):
    """Estimate each article's release date from the impression log.

    The release date of an article is taken to be the earliest impression
    timestamp among the rows of ``data_sparse`` in which the article's flag
    column is True.

    Parameters
    ----------
    data_sparse : pandas.DataFrame
        Table whose first two columns are ``UserID`` and ``DateTime`` and whose
        remaining columns are boolean flags, one per article ID (the layout
        returned by ``get_sparse_matrix``). ``DateTime`` may hold strings or
        datetimes.
    news : pandas.DataFrame
        Article table with a ``NewsID`` column.

    Returns
    -------
    pandas.DataFrame
        ``news`` left-joined with a datetime ``ReleaseDate`` column. Articles
        with no usable timestamp (absent from ``data_sparse``, never flagged,
        or only flagged on rows with unparseable times) receive the earliest
        release date that was found.
    """
    # Compare real datetimes: the raw strings do not sort chronologically
    # (e.g. "11/10/..." < "11/9/..." and "10:00 PM" < "9:00 AM" as text).
    raw_times = data_sparse['DateTime']
    if not pd.api.types.is_datetime64_any_dtype(raw_times):
        raw_times = raw_times.astype(str).str.strip()
    # Positional index so the boolean masks below line up row by row.
    seen_at = pd.to_datetime(raw_times, errors='coerce').reset_index(drop=True)

    release_dates = {}
    # Every column after UserID / DateTime is one article.
    for col in data_sparse.columns[2:]:
        flagged = data_sparse[col].to_numpy(dtype=bool)
        first_seen = seen_at[flagged].min()  # NaT when the article is never flagged
        if pd.notna(first_seen):
            release_dates[col] = first_seen

    release_dates_df = pd.DataFrame(list(release_dates.items()), columns=['NewsID', 'ReleaseDate'])

    # Left join keeps every article, including those with no known date.
    news_dates = news.merge(release_dates_df, on='NewsID', how='left')
    news_dates['ReleaseDate'] = pd.to_datetime(news_dates['ReleaseDate'], errors='coerce')

    # Fall back to the earliest known date. Assign the result back instead of
    # filling in place on a column selection, which is not guaranteed to
    # update the frame.
    news_dates['ReleaseDate'] = news_dates['ReleaseDate'].fillna(news_dates['ReleaseDate'].min())

    return news_dates

In [None]:
# Build the (UserID, DateTime) x article history table from the combined
# train + dev behaviors; the cells below reuse it for every article split.
SparseData = get_sparse_matrix(behaviors=AllData)

In [None]:
# Release dates for the combined (train + dev) article table.
# NOTE(review): to_csv expects ../data/NewsWithTime/small/ to exist already - confirm.
AllArticlesWithTime = get_release_date(data_sparse=SparseData, news=AllArticles)
AllArticlesWithTime.to_csv(
    "../data/NewsWithTime/small/AllNewsWithTime.csv",
    sep=",",
    index=False,
)
AllArticlesWithTime.head(n=3)

In [None]:
# Release dates for the training-split article table.
# NOTE(review): to_csv expects ../data/NewsWithTime/small/ to exist already - confirm.
TrainArticlesWithTime = get_release_date(data_sparse=SparseData, news=ArticlesTrain)
TrainArticlesWithTime.to_csv(
    "../data/NewsWithTime/small/TrainNewsWithTime.csv",
    sep=",",
    index=False,
)
TrainArticlesWithTime.head(n=3)

In [None]:
# Release dates for the dev (validation) split article table.
# NOTE(review): to_csv expects ../data/NewsWithTime/small/ to exist already - confirm.
ValArticlesWithTime = get_release_date(data_sparse=SparseData, news=ArticlesValidation)
ValArticlesWithTime.to_csv(
    "../data/NewsWithTime/small/DevNewsWithTime.csv",
    sep=",",
    index=False,
)
ValArticlesWithTime.head(n=3)