In [1]:
# import libraries and datasets

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# load the dataset

behavior = pd.read_csv("data/MINDsmall_dev/behaviors.tsv", sep="\t", header=None, names=["Impression ID", "User ID", "Time", "History", "Impressions"])
news = pd.read_csv("data/MINDsmall_dev/news.tsv", sep="\t", header=None, names=["News ID", "Category", "Subcategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities", "Title Topics", "Abstract Topics"])

In [2]:
#Find category and subcategory of an article from an articleID
def article_tags(articleID):
    tags = []
    article = news.loc[news["News ID"] == articleID]
    tags.append(article["Category"].values[0])
    tags.append(article["Subcategory"].values[0])
    return tags[0], tags[1]

def add_feature_to_xi(x_i, feature, weight):
    if feature in x_i:
        x_i[feature] += weight
    else:
        x_i[feature] = weight

#Create user vector from a userID
def create_x_i(userID):
    x_i = {}

    # Weights
    w_cat = 0.3  
    w_subcat = 0.8
    w_history = 0.5
    w_impression = 1.0

    article_count = 0

    # Add categories and subcategories
    for index, row in behavior.iterrows():
        if row["User ID"] == userID:
            # Adding features from articles in history
            for articleID in row["History"].split(" "):
                article_count += 1

                category, subcategory = article_tags(articleID)
                add_feature_to_xi(x_i, category, w_cat*w_history)
                add_feature_to_xi(x_i, subcategory, w_subcat*w_history)

                # TODO Add entities from articles in history here. We are already iterating through all articles in a users history, need method for extracting entities
                # entities = find_entities(articleID)   -   to be implemented
                # for entity in entities:
                #   add_feature_to_xi(x_i, entity, w_entity*w_history)
    
            # Adding features from articles in impressions
            for impression in row["Impressions"].split(" "):
                if impression.split("-")[1] == "1":
                    article_count += 1

                    category, subcategory = article_tags(impression.split("-")[0])
                    add_feature_to_xi(x_i, category, w_cat*w_impression)
                    add_feature_to_xi(x_i, subcategory, w_subcat*w_impression)

                    # TODO Add entities from articles in impressions here. We are already iterating through all articles in a users impressions, need method for extracting entities
                    # entities = find_entities(articleID)   -   to be implemented
                    # for entity in entities:
                    #   add_feature_to_xi(x_i, entity, w_entity*w_history)

    
    # Normalizing
    for trait, score in x_i.items():
        x_i[trait] = score/article_count

    return x_i

exID = behavior["User ID"].values[1]

x_i = create_x_i(exID)
print(x_i)

{'news': 0.04285714285714286, 'newsus': 0.07142857142857142, 'travel': 0.010714285714285714, 'travelnews': 0.03571428571428571, 'finance': 0.04285714285714286, 'finance-companies': 0.14285714285714285, 'newsscienceandtechnology': 0.03571428571428571, 'music': 0.010714285714285714, 'music-celebrity': 0.03571428571428571, 'newscrime': 0.03571428571428571, 'weather': 0.010714285714285714, 'weathertopstories': 0.03571428571428571, 'health': 0.010714285714285714, 'fitness': 0.03571428571428571, 'lifestyle': 0.010714285714285714, 'lifestyleroyals': 0.03571428571428571, 'sports': 0.02142857142857143, 'football_nfl': 0.07142857142857142}


In [3]:
def create_all_x_j():
    all_x_j = []
    for index, row in news.iterrows():
        x_j = {}
        x_j["News ID"] = row["News ID"]
        x_j[row["Category"]] = 1.0
        x_j[row["Subcategory"]] = 1.0
        all_x_j.append(x_j)

        # TODO add entities to item vector x_j

    return all_x_j

X_j = create_all_x_j()
print(X_j[1])

{'News ID': 'N18955', 'health': 1.0, 'medical': 1.0}


In [4]:
def score(userID):
    x_i = create_x_i(userID)
    X_j = create_all_x_j()

    scores = {}

    for x_j in X_j:
        for c, s in x_j.items():
            article_score = 0
            if c in x_i.keys():
                article_score += s * x_i[c]
        scores[x_j["News ID"]] = article_score
    
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

print(score(exID))

[('N36064', 0.14285714285714285), ('N15254', 0.14285714285714285), ('N17430', 0.14285714285714285), ('N41257', 0.14285714285714285), ('N42994', 0.14285714285714285), ('N38014', 0.14285714285714285), ('N704', 0.14285714285714285), ('N8767', 0.14285714285714285), ('N31585', 0.14285714285714285), ('N45847', 0.14285714285714285), ('N8046', 0.14285714285714285), ('N9772', 0.14285714285714285), ('N48229', 0.14285714285714285), ('N7210', 0.14285714285714285), ('N30432', 0.14285714285714285), ('N21832', 0.14285714285714285), ('N6627', 0.14285714285714285), ('N5810', 0.14285714285714285), ('N40934', 0.14285714285714285), ('N55267', 0.14285714285714285), ('N64149', 0.14285714285714285), ('N32696', 0.14285714285714285), ('N31557', 0.14285714285714285), ('N4529', 0.14285714285714285), ('N57800', 0.14285714285714285), ('N33289', 0.14285714285714285), ('N62641', 0.14285714285714285), ('N22867', 0.14285714285714285), ('N30473', 0.14285714285714285), ('N44060', 0.14285714285714285), ('N49798', 0.14285