In [None]:
# Import libraries (the datasets are loaded in the "Loading Data" section below)

import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score
from datetime import datetime as datetime

#### Defining Helper Functions

In [None]:
def get_impressions(userID, behavior_view):
    """Collect one user's impression responses from a behaviour view.

    Every "Impressions" cell belonging to the user is read as a
    whitespace-separated list of ``<news id>-<label>`` tokens. A label of
    "1" is stored as response 1, any other label as response 0.

    Parameters
    ----------
    userID
        Value compared against the "User ID" column.
    behavior_view : pandas.DataFrame
        Frame with at least the columns "User ID" and "Impressions".

    Returns
    -------
    pandas.DataFrame
        Columns "News ID" and "Response", one row per parsed token, in the
        row order of ``behavior_view``. Empty (but with both columns) when
        the user has no usable impressions.
    """
    # Select the user's rows with a vectorised mask so that only those rows
    # are walked, instead of iterating over the whole view for every user.
    user_impressions = behavior_view.loc[behavior_view["User ID"] == userID, "Impressions"]

    records = []
    for impressions in user_impressions:
        # Missing cells come back as NaN (a float); they carry no tokens.
        if not isinstance(impressions, str):
            continue
        for token in impressions.split():
            parts = token.split("-")
            # A token without a "-<label>" part has no response to record.
            if len(parts) < 2:
                continue
            records.append((parts[0], 1 if parts[1] == "1" else 0))
    return pd.DataFrame(records, columns=["News ID", "Response"])


def create_evaluation_data(scored_data, userID, behavior_view=None):
    """Attach the user's observed responses to a frame of scored articles.

    Parameters
    ----------
    scored_data : pandas.DataFrame
        Predictions; must contain a "News ID" column to join on.
    userID
        User whose impressions are looked up.
    behavior_view : pandas.DataFrame, optional
        Behaviour rows to read the impressions from. ``get_impressions``
        requires this argument; when it is omitted the notebook-level
        ``behavior`` frame is used so that two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Inner join of ``scored_data`` with the user's "Response" values,
        i.e. only articles that were both scored and shown to the user.
    """
    if behavior_view is None:
        # Fall back to the full behaviour log loaded by the notebook.
        behavior_view = behavior
    responses = get_impressions(userID, behavior_view).set_index("News ID")
    return scored_data.join(responses, how="inner", on="News ID")


def get_users(view1, view2):
    """Return the users that occur in both behaviour views.

    Parameters
    ----------
    view1, view2 : pandas.DataFrame
        Frames with a "User ID" column.

    Returns
    -------
    pandas.DataFrame
        A single column "User ID" holding each common user once, ordered by
        first appearance in ``view1`` and indexed 0..n-1. A DataFrame (not an
        array) is returned because the evaluation loops iterate it with
        ``iterrows()`` and read ``row["User ID"]``.
    """
    # A Series has neither .join nor .set_index, so the intersection is built
    # with a membership mask instead.
    in_both = view1["User ID"].isin(view2["User ID"])
    return (
        view1.loc[in_both, ["User ID"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

#userList = __get_users()

In [None]:
def get_view(behavior, t0, t1):
    """Return the rows of ``behavior`` whose "Time" lies in the half-open window [t0, t1)."""
    times = behavior["Time"]
    # Lower bound is inclusive, upper bound exclusive.
    in_window = times.ge(t0) & times.lt(t1)
    return behavior[in_window]

#### Loading Data

In [None]:
# Both files are tab-separated and read with header=None, so the column
# names are supplied explicitly.
behavior = pd.read_csv(
    "data/MINDsmall_train/behaviors.tsv",
    sep="\t",
    header=None,
    names=["Impression ID", "User ID", "Time", "History", "Impressions"],
)
news = pd.read_csv(
    "data/MINDsmall_train/news.tsv",
    sep="\t",
    header=None,
    names=[
        "News ID",
        "Category",
        "Subcategory",
        "Title",
        "Abstract",
        "URL",
        "Title Entities",
        "Abstract Entities",
        "Title Topics",
        "Abstract Topics",
    ],
)

In [None]:
def str_to_timestamp(str):
    """Convert a ``month/day/year hour:minute:second AM|PM`` string to a POSIX timestamp.

    Parameters
    ----------
    str : str
        Time text such as "11/11/2019 1:00:00 PM". (The parameter name
        shadows the builtin; it is kept so existing callers are unaffected.)

    Returns
    -------
    float
        Seconds since the epoch, with the naive datetime interpreted in the
        local timezone (behaviour of ``datetime.timestamp``).
    """
    # %I (12-hour clock) is required for %p to influence the parsed hour;
    # with %H the AM/PM marker is consumed but ignored, so every afternoon
    # time would be read as the matching morning time.
    return datetime.strptime(str, "%m/%d/%Y %I:%M:%S %p").timestamp()

# Replace the textual "Time" column with numeric timestamps.
timestamps = behavior["Time"].apply(str_to_timestamp)
behavior["Time"] = timestamps
# sort_values returns a new frame rather than sorting in place, so the result
# must be assigned back. The index is rebuilt so that label 0 is the earliest
# row and label len-1 the latest, which the evaluation cells rely on.
behavior = behavior.sort_values(by="Time").reset_index(drop=True)
behavior.head()

#### Evaluate Feature-Based Model

In [None]:
%run feature_based.ipynb

In [None]:
# Rolling evaluation of the feature-based model.
# The time span of the log is cut into windows of length dt (a tenth of the
# span). Within a window the first split_ratio part is the training view and
# the remainder the test view; the next window starts at the previous split.
X_j, news_id_to_index = create_all_item_vectors(news, behavior)

evaluations_ndcg = []
evaluations_pak = []
# min()/max() give the true bounds whether or not the frame is sorted and
# whatever its index labels are.
t0 = float(behavior["Time"].min())
tn = float(behavior["Time"].max())
split_ratio = 2/3
dt = (tn-t0)/10
k = 5

# dt > 0 guards against an endless loop when all rows share one timestamp.
while dt > 0 and t0 <= tn-dt:
    tsplit = t0 + dt*split_ratio
    t1 = t0 + dt

    train_view = get_view(behavior, t0, tsplit)
    test_view = get_view(behavior, tsplit, t1)

    # Only users seen in both views can be trained on and then evaluated.
    users = get_users(test_view, train_view)

    for i, row in users.iterrows():
        user = row["User ID"]
        # NOTE(review): x_i is not used below; the call is kept in case the
        # helper has side effects that find_top_k_articles depends on - confirm.
        x_i = create_user_vector(user, X_j, train_view)
        prediction = pd.DataFrame(find_top_k_articles(user, X_j, 10, news_id_to_index), columns=["News ID", "Score"])
        response = get_impressions(user, test_view)
        # Keep only recommended articles that the user was actually shown.
        pred_resp = prediction.join(response.set_index("News ID"), on="News ID", how="inner")
        try:
            # Rank by score (stable sort) so the first k rows are the top-k.
            ranked = pred_resp.sort_values("Score", ascending=False, kind="mergesort")
            ndcg = ndcg_score(np.array([ranked["Response"].to_numpy()]), np.array([ranked["Score"].to_numpy()]))
            k_slice = ranked["Response"].iloc[:k]
            precision_at_k = k_slice.sum()/min(len(k_slice), k)
            # Append both metrics together so the two lists stay aligned.
            evaluations_ndcg.append(ndcg)
            evaluations_pak.append(precision_at_k)
            print("Predictions for user " + user + " evaluated!")
        except Exception as err:
            # Best effort: a user whose metrics cannot be computed is skipped,
            # but the reason is reported and KeyboardInterrupt is not swallowed.
            print("Eval failed for user " + str(user) + ": " + str(err))

    print("moving timeframe")
    t0 = tsplit

In [None]:
# Report the mean of the per-user metrics. An empty list means no user could
# be evaluated; that is reported instead of raising ZeroDivisionError.
print("nDCG:")
if evaluations_ndcg:
    print(sum(evaluations_ndcg)/len(evaluations_ndcg))
else:
    print("no successful evaluations")
print("Precision at K:")
if evaluations_pak:
    print(sum(evaluations_pak)/len(evaluations_pak))
else:
    print("no successful evaluations")


#### Evaluate Collaborative Filtering Model

In [None]:
%run item_collab_filtering.ipynb

In [None]:
# Rolling evaluation of the collaborative-filtering model.
# The time span of the log is cut into windows of length dt (a tenth of the
# span). Within a window the first split_ratio part is the training view and
# the remainder the test view; the next window starts at the previous split.
evaluations_ndcg = []
evaluations_pak = []
# min()/max() give the true bounds whether or not the frame is sorted and
# whatever its index labels are.
t0 = float(behavior["Time"].min())
tn = float(behavior["Time"].max())
split_ratio = 2/3
dt = (tn-t0)/10
k = 5

# dt > 0 guards against an endless loop when all rows share one timestamp.
while dt > 0 and t0 <= tn-dt:
    tsplit = t0 + dt*split_ratio
    t1 = t0 + dt

    train_view = get_view(behavior, t0, tsplit)
    test_view = get_view(behavior, tsplit, t1)

    # The model is refitted on each training view.
    model = train_collaborative_filtering_model(train_view)
    # Only users seen in both views can be trained on and then evaluated.
    users = get_users(test_view, train_view)

    for i, row in users.iterrows():
        user = row["User ID"]
        prediction = get_top_n_recommendations(user, model, N=10).toPandas()
        response = get_impressions(user, test_view)
        # NOTE(review): assumes the recommendation frame has "news_id" and
        # "Score" columns - confirm against item_collab_filtering.ipynb.
        pred_resp = prediction.join(response.set_index("News ID"), on="news_id", how="inner")
        try:
            # Rank by score (stable sort) so the first k rows are the top-k.
            ranked = pred_resp.sort_values("Score", ascending=False, kind="mergesort")
            ndcg = ndcg_score(np.array([ranked["Response"].to_numpy()]), np.array([ranked["Score"].to_numpy()]))
            k_slice = ranked["Response"].iloc[:k]
            precision_at_k = k_slice.sum()/min(len(k_slice), k)
            # Append both metrics together so the two lists stay aligned.
            evaluations_ndcg.append(ndcg)
            evaluations_pak.append(precision_at_k)
            print("Predictions for user " + user + " evaluated!")
        except Exception as err:
            # Best effort: a user whose metrics cannot be computed is skipped,
            # but the reason is reported and KeyboardInterrupt is not swallowed.
            print("Eval failed for user " + str(user) + ": " + str(err))

    print("moving timeframe")
    t0 = tsplit

In [None]:
# Report the mean of the per-user metrics. An empty list means no user could
# be evaluated; that is reported instead of raising ZeroDivisionError.
print("nDCG:")
if evaluations_ndcg:
    print(sum(evaluations_ndcg)/len(evaluations_ndcg))
else:
    print("no successful evaluations")
print("Precision at K:")
if evaluations_pak:
    print(sum(evaluations_pak)/len(evaluations_pak))
else:
    print("no successful evaluations")