In [201]:

import shutil
shutil.rmtree('__pycache__', ignore_errors=True)

from tqdm import tqdm
import numpy as np
import pandas as pd
from scipy.sparse import lil_matrix
import importlib
# Our functions
from EvalFunctions import AUCEval, MMREval, nDCGEval
from RecAlgs import MostPopBaseline, CollaborativeFiltering, Hybrid, News_Recommender_CBF

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Parameters that configure how the algorithm is run

In [202]:
# Tunable parameters for the experiment. Only TimePenaltyPerHour, TimePenaltyStart and
# TypeOfRecAlg are read by the cells visible in this notebook; the remaining values are
# presumably consumed elsewhere or reserved for later use -- TODO confirm.

# General
TimeCutOffDays = 3              # Maximum age (in days) of an article that may still be recommended; older articles are not considered

# Data selection
TrainingDataStartDate = 0       # Day from which training data is collected
TrainingDataWindowSize = 2      # Number of days of training data to use, 0 for all
TestDataWindowSize = 1          # Number of days of test data to use, 0 for all

# Algorithm specifics
# NOTE(review): the legend below does not match the dispatch in the main loop, which maps
# 0 -> most-popular baseline, 1 -> content-based filtering, 3 -> Hybrid (2 is commented out).
TypeOfRecAlg = 0                # Which RecAlg we want to use 0-Pop, 1-Rand, 2-CBF, 3-CF, 4-Hybrid

# Popular Baseline
TimePenaltyPerHour = 0.1        # Penalty (as a percentage) applied per hour of article age
TimePenaltyStart = 24           # Number of hours in the past after which the penalty starts

# Random Baseline
MinScore = 0                    # Minimum score that can be given
MaxScore = 1                    # Maximum score that can be given

# Content based filtering
# (no parameters yet)

# Collaborative filtering
# (no parameters yet)

# Hybrid
UsePopBaseline = False          # Whether to use Popularity baseline
UseRandBaseLine = False         # Whether to use Random baseline
UseCBF = True                   # Whether to use Content based filtering
UseCF = True                    # Whether to use Collaborative filtering
TakeMax = False                 # Whether to take the max between CBF and CF before applying weights
# NOTE(review): three weights are given but four Use* flags appear above -- confirm which
# components the weights refer to.
Weights = [0.2, 0.4, 0.4]       # The weights for the different parts (in order of appearance above)


Data selection

In [203]:
def getAvailableArticles(GivenTime, AllArticles):
    """Return the articles that have already been released at ``GivenTime``.

    Parameters
    ----------
    GivenTime : str or datetime-like
        Reference moment; anything ``pd.to_datetime`` accepts.
    AllArticles : pandas.DataFrame
        Must contain a 'ReleaseDate' column. The column is converted to
        datetime in place, i.e. the caller's frame is modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``AllArticles`` whose 'ReleaseDate' is at or before
        ``GivenTime``.
    """
    # Ensure the time column is in datetime format
    AllArticles['ReleaseDate'] = pd.to_datetime(AllArticles['ReleaseDate'])
    GivenTime = pd.to_datetime(GivenTime)

    # Keep rows released at or before the given time. The previous `>=`
    # selected the articles that were NOT yet published.
    return AllArticles[AllArticles['ReleaseDate'] <= GivenTime]

def getPastBehaviors(GivenTime, AllBehaviors):
    """Split the behavior log into past and future relative to ``GivenTime``.

    Parameters
    ----------
    GivenTime : str or datetime-like
        Reference moment; anything ``pd.to_datetime`` accepts.
    AllBehaviors : pandas.DataFrame
        Must contain a 'DateTime' column. The column is converted to
        datetime in place, i.e. the caller's frame is modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(past, future)``: rows with 'DateTime' <= ``GivenTime`` and rows
        with 'DateTime' >= ``GivenTime``. Rows exactly at ``GivenTime`` are
        part of both frames.
    """
    # Ensure the time column is in datetime format
    AllBehaviors['DateTime'] = pd.to_datetime(AllBehaviors['DateTime'])
    GivenTime = pd.to_datetime(GivenTime)

    PastMask = AllBehaviors['DateTime'] <= GivenTime
    FutureMask = AllBehaviors['DateTime'] >= GivenTime

    # Past first, future second: this is the order the caller unpacks
    # (`PastBehaviors, FutureBehaviors = ...`). The two were previously swapped.
    return AllBehaviors[PastMask], AllBehaviors[FutureMask]

def getGroundTruth(FutureBehaviors, RequestedUserID):
    """Collect the articles a user clicked in the given behavior rows.

    Parameters
    ----------
    FutureBehaviors : pandas.DataFrame
        Must contain the columns 'UserID' and 'ClickData'. 'ClickData' holds
        space-separated tokens; a token ending in "-1" marks a clicked article.
    RequestedUserID : str or any value comparable via ``str``
        The user whose clicks are wanted; compared as a string.

    Returns
    -------
    list of str
        Clicked article IDs (token without the "-1" suffix), in row order and
        then token order. Duplicates are kept. Empty when the user has no
        rows or no clicks.
    """
    # Select the user's rows with one vectorised comparison instead of
    # walking every row of the frame in Python.
    IsRequestedUser = (FutureBehaviors['UserID'].astype(str) == str(RequestedUserID)).to_numpy()

    ClickedArticles = []
    for ClickData in FutureBehaviors.loc[IsRequestedUser, 'ClickData']:
        if not isinstance(ClickData, str):
            continue  # Skip if ClickData is not a string (e.g. NaN)

        for Click in ClickData.split(" "):
            if Click.endswith("-1"):
                # Drop only the trailing "-1" marker; str.replace would also
                # remove any "-1" occurring inside the article ID.
                ClickedArticles.append(Click[:-2])

    return ClickedArticles


    

In [204]:
# Load the MIND-small behavior logs and news metadata (train + dev splits), plus the
# pre-computed "NewsWithTime" CSV files.

# Column names assigned to the header-less TSV files.
BEHAVIOR_COLUMNS = ["UserID", "DateTime", "History", "ClickData"]
NEWS_COLUMNS = ["NewsID", "Category", "SubCategory", "Title", "Abstract", "URL", "TitleEntities", "AbstractEntities"]

# Behavior logs: one frame per split and one combined frame.
AllTrainingData = pd.read_csv(
    "../data/MINDsmall_train/behaviors.tsv", sep="\t", header=None, names=BEHAVIOR_COLUMNS
)
AllValidationData = pd.read_csv(
    "../data/MINDsmall_dev/behaviors.tsv", sep="\t", header=None, names=BEHAVIOR_COLUMNS
)
AllData = pd.concat([AllTrainingData, AllValidationData], ignore_index=True)

# News metadata: one frame per split and one combined frame.
ArticlesTrain = pd.read_csv(
    "../data/MINDsmall_train/news.tsv", sep="\t", header=None, names=NEWS_COLUMNS
)
ArticlesValidation = pd.read_csv(
    "../data/MINDsmall_dev/news.tsv", sep="\t", header=None, names=NEWS_COLUMNS
)
AllArticles = pd.concat([ArticlesTrain, ArticlesValidation], ignore_index=True)

# News tables from the NewsWithTime folder (these CSVs carry their own header row).
ArticlesTrainWithTime = pd.read_csv("../data/NewsWithTime/small/TrainNewsWithTime.csv")
ArticlesValidationWithTime = pd.read_csv("../data/NewsWithTime/small/DevNewsWithTime.csv")
AllArticlesWithTime = pd.read_csv("../data/NewsWithTime/small/AllNewsWithTime.csv")


In [182]:
#Maybe add something to reduce the amount of data??

In [183]:
# colab_filter = CollaborativeFiltering.CollaborativeFiltering(AllTrainingData, epochs=2)
# 
# colab_filter.initialize()

In [205]:
# Build the content-based-filtering (CBF) recommender from the MIND-small training files.
path_items = "../data/MINDsmall_train/news.tsv"
path_user_behavior = "../data/MINDsmall_train/behaviors.tsv"

# NewsRecommenderCBF lives in RecAlgs/News_Recommender_CBF (not shown in this notebook);
# presumably it loads both files and builds its feature matrices here -- TODO confirm.
recommender = News_Recommender_CBF.NewsRecommenderCBF(path_items, path_user_behavior)
# Last expression of the cell, so its return value is rendered as the cell output.
recommender.get_user_frame()

00 ----------> ITEM data loaded successfully: 51282 records!
01 ----------> USER data loaded successfully: 156965 records!
02 ----------> Corpus created: 51282 documents!
03 ----------> TF-IDF matrix created: 51282 documents, 167113 terms!
04 ----------> Category matrix created: 51282 documents, 281 categories!
05 ----------> Combined matrix created, shape: (51282, 167394)


Unnamed: 0,uID,t,ClickHist,ImpLog
1,U13740,2019-11-11 09:05:58,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
2,U91836,2019-11-12 18:11:30,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
3,U73700,2019-11-14 07:01:48,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
4,U34670,2019-11-11 05:28:05,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
5,U8125,2019-11-12 16:11:21,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...
...,...,...,...,...
156961,U21593,2019-11-14 22:24:05,N7432 N58559 N1954 N43353 N14343 N13008 N28833...,N2235-0 N22975-0 N64037-0 N47652-0 N11378-0 N4...
156962,U10123,2019-11-13 06:57:04,N9803 N104 N24462 N57318 N55743 N40526 N31726 ...,N3841-0 N61571-0 N58813-0 N28213-0 N4428-0 N25...
156963,U75630,2019-11-14 10:58:13,N29898 N59704 N4408 N9803 N53644 N26103 N812 N...,N55913-0 N62318-0 N53515-0 N10960-0 N9135-0 N5...
156964,U44625,2019-11-13 14:57:02,N4118 N47297 N3164 N43295 N6056 N38747 N42973 ...,N6219-0 N3663-0 N31147-0 N58363-0 N4107-0 N457...


In [206]:
# Count how often every article was clicked across all behavior rows in AllData.
# NOTE(review): AllData also contains the validation split that the main loop evaluates
# on, so these counts include the clicks being predicted -- confirm this is intended.
ClickCounts = {}
for ClickData in AllData["ClickData"]:
    if not isinstance(ClickData, str):
        continue  # Skip rows without click data (e.g. NaN), as getGroundTruth does
    for click in ClickData.split(" "):  # split string of clicks
        if click.endswith("-1"):  # Only clicked articles
            ArticleID = click[:-2]  # drop only the trailing "-1" marker
            ClickCounts[ArticleID] = ClickCounts.get(ArticleID, 0) + 1

# Despite its name, PopularityDict is a list of (ArticleID, click count) tuples,
# ordered from most to least clicked.
PopularityDict = sorted(ClickCounts.items(), key=lambda x: x[1], reverse=True)


In [209]:
# Main loop
# For every impression in the validation set: collect the candidate articles and the
# user's ground-truth clicks, ask the selected recommender for its top articles, and
# accumulate the AUC / MMRE / nDCG scores.
#
# The algorithm is selected by TypeOfRecAlg from the parameter cell (it is no longer
# overridden here). This dispatch maps:
#   0 -> most-popular baseline, 1 -> content-based filtering, 3 -> Hybrid
# (2, collaborative filtering, is commented out). Any other value skips the instance.
#
# Assume we use the past behaviors we have to predict the click behavior on the test set
# (-1's aka clicked articles). We hope our recommendations include these articles.
TotalAUCEvalScore = 0
TotalMMREEvalScore = 0
TotalNDCGEvalScore = 0
i = 0                   # number of instances that were actually scored
amountOfColdStarts = 0  # number of instances where the recommender returned nothing
for _, instance in tqdm(AllValidationData.iterrows(), total=len(AllValidationData), desc="Processing Instances"):
    # Get necessary parameters
    UserID = instance['UserID']
    Time = pd.to_datetime(instance['DateTime'])
    # getAvailableArticles already converts 'ReleaseDate' to datetime, so no further
    # conversion of the returned slice is needed.
    AvailableNews = getAvailableArticles(Time, ArticlesValidationWithTime)
    PastBehaviors, FutureBehaviors = getPastBehaviors(Time, AllValidationData)

    GT = getGroundTruth(FutureBehaviors, UserID)
    # skip user if there is no future data for this user
    if len(GT) == 0:
        continue

    # Run the selected RecAlg
    if TypeOfRecAlg == 0:
        # PossibleArticles, CurrentTime, GlobalPopularity, TimePenaltyPerHour, TimePenaltyStart
        TopTenArticleRecommendations = MostPopBaseline.RecommendMostPopular(
            AvailableNews, Time, PopularityDict, TimePenaltyPerHour, TimePenaltyStart)
    elif TypeOfRecAlg == 1:
        TopTenArticleRecommendations = recommender.recommend(UserID, 10)

    # elif TypeOfRecAlg == 2:
    #     TopTenArticleRecommendations = colab_filter.getRecommended(UserID, k=10)

    elif TypeOfRecAlg == 3:
        # FIXME(review): Hybrid is imported from RecAlgs alongside names that are used as
        # modules (e.g. MostPopBaseline.RecommendMostPopular). If Hybrid is a module too,
        # this call raises TypeError -- confirm the intended entry point and arguments.
        TopTenArticleRecommendations = Hybrid()
    else:
        continue

    # For cold start: fall back to the most-popular baseline. The arguments follow the
    # same order as the call above (they were previously passed as
    # AvailableNews, PastBehaviors, Time, ...).
    if len(TopTenArticleRecommendations) == 0:
        amountOfColdStarts += 1
        TopTenArticleRecommendations = MostPopBaseline.RecommendMostPopular(
            AvailableNews, Time, PopularityDict, TimePenaltyPerHour, TimePenaltyStart)

    # Calculate evaluation scores
    AUCScore = AUCEval.AUCEval(TopTenArticleRecommendations, GT)
    MMREScore = MMREval.MMREval(TopTenArticleRecommendations, GT)
    NDCGScore = nDCGEval.nDCG(TopTenArticleRecommendations, GT)

    # Accumulate the total scores
    TotalAUCEvalScore += AUCScore
    TotalMMREEvalScore += MMREScore
    TotalNDCGEvalScore += NDCGScore

    i += 1


# Average over the scored instances. When nothing was scored (every instance skipped)
# report NaN instead of failing with ZeroDivisionError.
if i > 0:
    AvgAUCScore = TotalAUCEvalScore / i
    AvgMMREScore = TotalMMREEvalScore / i
    AvgNDCGScore = TotalNDCGEvalScore / i
else:
    AvgAUCScore = AvgMMREScore = AvgNDCGScore = float("nan")


Processing Instances:   0%|          | 28/73152 [00:06<4:23:49,  4.62it/s]


KeyboardInterrupt: 

In [92]:
# Average Evaluation

# Look at the results, and evaluate them with the different evaluation functions