In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import lil_matrix

# Our functions
from EvalFunctions import AUCEval, MMREval, nDCGEval
from RecAlgs import CollaborativeFiltering, MostPopBaseline

The parameters to select how to tun the algorithm

In [2]:
# General
TimeCutOffDays = 3              # How old the articles can be that we would consider recommending (older than X days are not considered)

# Data selection                
TrainingDataStartDate = 0       # From what day we want to collect data
TrainingDataWindowSize = 2      # How many days of training data we want, 0 for all
TestDataWindowSize = 1          # How many days of test data we want, 0 for all

# Algorithm specifics 
TypeOfRecAlg = 3                # Which RecAlg we want to use 0-Pop, 1-Rand, 2-CBF, 3-CF, 4-Hybrid

# Popular Baseline
TimePenaltyPerHour = 0.1        # The percentage on penalty per hour the news gets
TimePenaltyStart = 24           # After howmany hours in the past the penalty starts

# Random Baseline
MinScore = 0                    # Minimum score that can be given
MaxScore = 1                    # Maximum score that can be given

# Content based filtering

# Collaborative filtering

# Hybrid
UsePopBaseline = False          # Whether to use Popularity baseline
UseRandBaseLine = False         # Whether to use Random baseline
UseCBF = True                   # Whether to use Content based filtering
UseCF = True                    # Whether to use Collaborative filtering
TakeMax = False                 # Whether to take the max between CBF and CF before applying weights
Weights = [0.2, 0.4, 0.4]       # The weights for the different parts (in order of appearance above)


Data selection

In [3]:
def getAvailableArticles(GivenTime, AllArticles):
    # Ensure the time column is in datetime format
    AllArticles['ReleaseDate'] = pd.to_datetime(AllArticles['ReleaseDate'])
    GivenTime = pd.to_datetime(GivenTime)

    # Filter rows where time is less than or equal to the given time
    return AllArticles[AllArticles['ReleaseDate'] >= GivenTime]

def getPastBehaviors(GivenTime, AllBehaviors):
    # Ensure the time column is in datetime format
    AllBehaviors['DateTime'] = pd.to_datetime(AllBehaviors['DateTime'])
    GivenTime = pd.to_datetime(GivenTime)

    # Filter rows where time is less than or equal to the given time
    return AllBehaviors[AllBehaviors['DateTime'] >= GivenTime], AllBehaviors[AllBehaviors['DateTime'] < GivenTime]

def getGroundTruth(FutureBehaviors, RequestedUserID):
    UserFuture = {ClickData for UserID, DateTime, History, ClickData in FutureBehaviors if UserID == RequestedUserID}
    ClickedArticles = []
    for ClickData in UserFuture:
        for Article in ClickData:
            if Article.endswith("-1"):
                Article = Article.removesuffix("-1")
                if Article not in ClickedArticles:
                    ClickedArticles.append(Article)
                
    return ClickedArticles
    

In [4]:
# Add the first time the article has been seen in the behaviors as the Est_PublishedTime in the articles.
AllTrainingData = pd.read_csv("../data/MINDsmall_train/behaviors.tsv", sep="\t", header=None, names=["UserID", "DateTime", "History", "ClickData"])
AllValidationData = pd.read_csv("../data/MINDsmall_dev/behaviors.tsv", sep="\t", header=None, names=["UserID", "DateTime", "History", "ClickData"])
AllData = pd.concat([AllTrainingData, AllValidationData], ignore_index=True)

ArticlesTrain = pd.read_csv("../data/MINDsmall_train/news.tsv", sep="\t", header=None, names=["NewsID", "Category", "SubCategory", "Title", "Abstract", "URL", "TitleEntities", "AbstractEntities"])
ArticlesValidation = pd.read_csv("../data/MINDsmall_dev/news.tsv", sep="\t", header=None, names=["NewsID", "Category", "SubCategory", "Title", "Abstract", "URL", "TitleEntities", "AbstractEntities"])
AllArticles = pd.concat([ArticlesTrain, ArticlesValidation], ignore_index=True)

ArticlesTrainWithTime = pd.read_csv("../data/NewsWithTime/small/TrainNewsWithTime.csv")
ArticlesValidationWithTime = pd.read_csv("../data/NewsWithTime/small/DevNewsWithTime.csv")
AllArticlesWithTime = pd.read_csv("../data/NewsWithTime/small/AllNewsWithTime.csv")


In [5]:
#Maybe add something to reduce the amount of data??

In [6]:
colab_filter = CollaborativeFiltering.CollaborativeFiltering(AllTrainingData, epochs=4)

colab_filter.initialize()

Initializing collaborative filtering...
apply start
explode start
sparse matrix start
Starting ALS using cpu


  indices=torch.tensor([rows, cols], device=device),


interaction_sparse.shape:  torch.Size([50000, 33196])
user_embeddings.shape, item_embeddings.shape:  torch.Size([50000, 3]) torch.Size([33196, 3])
training start
Epoch 1/4, Loss: 57.603092193603516
Epoch 2/4, Loss: 55.81801223754883
Epoch 3/4, Loss: 55.38041305541992
Epoch 4/4, Loss: 55.25794219970703


In [7]:
colab_filter

<RecAlgs.CollaborativeFiltering.CollaborativeFiltering at 0x28570c6fbc0>

In [1]:
#Main loop
#Assume we use the past behaviors we have to predict the click behavior on the test set (-1's aka clicked articles)
#We hope our recommendations include these articles
TotalAUCEvalScore = 0
TotalMMREEvalScore = 0
TotalNDCGEvalScore = 0
i=0
for instance in AllValidationData:
    # Get necessary parameters
    UserID = instance['UserID']
    Time = instance['DateTime']
    AvailableNews = getAvailableArticles(Time, ArticlesValidationWithTime)
    PastBehaviors, FutureBehaviors = getPastBehaviors(Time, AllValidationData)

    # Run the selected RecAlg
    if TypeOfRecAlg == 0:
        TopTenArticleRecommendations = MostPopBaseline.ReccomendMostPopular(AvailableNews, PastBehaviors, 
                                                                            Time, TimePenaltyPerHour, TimePenaltyStart)
    elif TypeOfRecAlg == 1:
        TopTenArticleRecommendations = RecAlgs.RandomBaseline()
    elif TypeOfRecAlg == 2:
        TopTenArticleRecommendations = RecAlgs.ContentBasedFiltering()    
    elif TypeOfRecAlg == 3:
        TopTenArticleRecommendations = colab_filter.getRecommended(AvailableNews, UserID, k=10)
        # TopTenArticleRecommendations = RecAlgs.CollaborativeFiltering()

    elif TypeOfRecAlg == 4:
        TopTenArticleRecommendations = RecAlgs.Hybrid()
    else:
        continue
    
    TotalAUCEvalScore += AUCEval.AUCEval(TopTenArticleRecommendations, getGroundTruth(FutureBehaviors, UserID))
    TotalMMREEvalScore += MMREval.MMREval(TopTenArticleRecommendations, getGroundTruth(FutureBehaviors, UserID))
    TotalNDCGEvalScore += nDCGEval.nDCG(TopTenArticleRecommendations, getGroundTruth(FutureBehaviors, UserID))
    i+=1

    break

AvgAUCScore = TotalAUCEvalScore/i
AvgMMREScore = TotalMMREEvalScore/i
AvgNDCGScore = TotalNDCGEvalScore/i

NameError: name 'AllValidationData' is not defined

In [None]:
# Average Evaluation

# Look at the results, and evaluate them with the different evaluation functions