In [1]:
# import libraries and datasets

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# load the dataset

behavior = pd.read_csv("data/MINDsmall_dev/behaviors.tsv", sep="\t", header=None, names=["Impression ID", "User ID", "Time", "History", "Impressions"])
news = pd.read_csv("data/MINDsmall_dev/news.tsv", sep="\t", header=None, names=["News ID", "Category", "Subcategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities", "Title Topics", "Abstract Topics"])

In [6]:
#Find category and subcategory of an article from an articleID
def article_tags(articleID):
    tags = []
    article = news.loc[news["News ID"] == articleID]
    tags.append(article["Category"].values[0])
    tags.append(article["Subcategory"].values[0])
    return tags[0], tags[1]

def add_feature_to_xi(x_i, feature, weight):
    if feature in x_i:
        x_i[feature] += weight
    else:
        x_i[feature] = weight

#Create user vector from a userID
def create_x_i(userID):
    x_i = {}

    # Weights
    w_cat = 0.3  
    w_subcat = 0.8
    w_history = 0.5
    w_impression = 1.0

    article_count = 0

    # Add categories and subcategories
    for index, row in behavior.iterrows():
        if row["User ID"] == userID:
            # Adding features from articles in history
            history_count = 0
            # Assuming history starting with oldest click - must be checked, if not, remove reverse operator
            history = row["History"].split(" ")
            history.reverse()
            for articleID in history:
                history_count += 1
                article_count += 1

                category, subcategory = article_tags(articleID)
                add_feature_to_xi(x_i, category, w_cat*w_history/history_count)
                add_feature_to_xi(x_i, subcategory, w_subcat*w_history/history_count)

                # TODO Add entities from articles in history here. We are already iterating through all articles in a users history, need method for extracting entities
                # entities = find_entities(articleID)   -   to be implemented
                # for entity in entities:
                #   add_feature_to_xi(x_i, entity, w_entity*w_history)
    
            # Adding features from articles in impressions
            for impression in row["Impressions"].split(" "):
                if impression.split("-")[1] == "1":
                    article_count += 1

                    category, subcategory = article_tags(impression.split("-")[0])
                    add_feature_to_xi(x_i, category, w_cat*w_impression)
                    add_feature_to_xi(x_i, subcategory, w_subcat*w_impression)

                    # TODO Add entities from articles in impressions here. We are already iterating through all articles in a users impressions, need method for extracting entities
                    # entities = find_entities(articleID)   -   to be implemented
                    # for entity in entities:
                    #   add_feature_to_xi(x_i, entity, w_entity*w_history)

    
    # Normalizing
    for trait, score in x_i.items():
        x_i[trait] = score/article_count

    return x_i

exID = behavior["User ID"].values[1]

x_i = create_x_i(exID)
print(x_i)

{'lifestyle': 0.010714285714285714, 'lifestyleroyals': 0.028571428571428574, 'health': 0.005357142857142857, 'fitness': 0.014285714285714287, 'finance': 0.008027597402597402, 'finance-companies': 0.021406926406926405, 'weather': 0.0026785714285714286, 'weathertopstories': 0.0071428571428571435, 'news': 0.004871794871794871, 'newscrime': 0.0047619047619047615, 'music': 0.001530612244897959, 'music-celebrity': 0.004081632653061225, 'newsus': 0.005372405372405373, 'newsscienceandtechnology': 0.002857142857142857, 'travel': 0.0008928571428571428, 'travelnews': 0.0023809523809523807, 'sports': 0.02142857142857143, 'football_nfl': 0.05714285714285715}


In [3]:
def create_all_x_j():
    all_x_j = []
    for index, row in news.iterrows():
        x_j = {}
        x_j["News ID"] = row["News ID"]
        x_j[row["Category"]] = 1.0
        x_j[row["Subcategory"]] = 1.0
        all_x_j.append(x_j)

        # TODO add entities to item vector x_j

    return all_x_j

X_j = create_all_x_j()
print(X_j[1])

{'News ID': 'N18955', 'health': 1.0, 'medical': 1.0}


In [8]:
def score(userID):
    x_i = create_x_i(userID)
    X_j = create_all_x_j()

    scores = {}

    for x_j in X_j:
        for c, s in x_j.items():
            article_score = 0
            if c in x_i.keys():
                article_score += s * x_i[c]
        scores[x_j["News ID"]] = article_score
    
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:10]

print(score(exID))

[('N2073', 0.05714285714285715), ('N16587', 0.05714285714285715), ('N29120', 0.05714285714285715), ('N64723', 0.05714285714285715), ('N27190', 0.05714285714285715), ('N9035', 0.05714285714285715), ('N41277', 0.05714285714285715), ('N42921', 0.05714285714285715), ('N19888', 0.05714285714285715), ('N27334', 0.05714285714285715)]
