In [1]:
# import libraries and datasets

import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score
from datetime import datetime as datetime

#### Defining Helper Functions

In [20]:
# Get all impressions for a user and add them to a df
def get_impressions(userID, behavior_view):
    test = []
    for index, row in behavior_view.iterrows():
        if row["User ID"] == userID:
            for impression in row["Impressions"].split(" "):
                imp = impression.split("-")
                if imp[1] == "1":
                    test.append((imp[0], 1))
                else:
                    test.append((imp[0], 0))
    return pd.DataFrame(test, columns=["News ID", "Response"])


# Join response to our predictions in order to sort them before evaluation
def create_evaluation_data(scored_data, userID):
    return scored_data.join(get_impressions(userID).set_index("News ID"), how="inner", on="News ID")


# Get df of all user IDs
def get_users(view1, view2):
    return pd.merge(left=view1["User ID"], right=view2["User ID"], how="inner", on="User ID")["User ID"].unique()

#userList = __get_users()

In [12]:
def get_view(behavior, t0, t1):
    df = behavior[(behavior["Timestamp"] >= t0) & (behavior["Timestamp"] < t1)]
    return df

#### Loading Data

In [4]:
behavior = pd.read_csv("data/MINDsmall_train/behaviors.tsv", sep="\t", header=None, names=["Impression ID", "User ID", "Time", "History", "Impressions"])
news = pd.read_csv("data/MINDsmall_train/news.tsv", sep="\t", header=None, names=["News ID", "Category", "Subcategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities", "Title Topics", "Abstract Topics"])

In [5]:
def str_to_timestamp(str):
    return datetime.strptime(str, "%m/%d/%Y %H:%M:%S %p").timestamp()

timestamps = behavior["Time"].apply(str_to_timestamp)
behavior["Timestamp"] = timestamps
behavior.sort_values(by="Timestamp")
# Add to new column instead of overwriting

Unnamed: 0,Impression ID,User ID,Time,History,Impressions,Timestamp
103760,103761,U68089,11/9/2019 1:00:03 PM,N138 N29177 N28850 N22745 N55326 N53100 N33969...,N52000-0 N41881-0 N60374-0 N5442-0 N51398-0 N5...,1.573258e+09
42450,42451,U56509,11/9/2019 1:00:17 PM,N51214 N60979 N9293 N4786 N13380 N14149 N2155,N63685-0 N34799-0 N26130-0 N51378-0 N7891-0 N6...,1.573258e+09
116953,116954,U34617,11/9/2019 1:00:22 PM,N11863 N44310 N31064,N52000-0 N41881-0 N27845-1 N47020-0 N51398-0,1.573258e+09
26087,26088,U11984,11/9/2019 1:00:33 PM,N39074 N3501 N31457 N61864 N3493 N25971 N29718...,N47020-1 N27845-0 N41881-0 N51398-0 N52000-0,1.573258e+09
14464,14465,U74966,11/9/2019 1:00:39 PM,N39074 N19760 N20530 N58668 N44495 N20039 N339...,N59852-0 N47020-0 N39115-0 N58051-0 N37088-0 N...,1.573258e+09
...,...,...,...,...,...,...
95717,95718,U6300,11/14/2019 12:59:46 PM,N28296 N34087 N37942 N27311 N35022 N42620,N50872-0 N25165-0 N29212-0 N32567-0 N16439-0 N...,1.573733e+09
140318,140319,U15094,11/14/2019 12:59:47 PM,N33038 N19494 N54377 N21242 N29499 N55743 N330...,N56142-0 N23446-0 N19661-1 N18529-0 N41387-0 N...,1.573733e+09
107052,107053,U80707,11/14/2019 12:59:50 PM,N33358 N8887 N55922 N23554 N30578 N48904 N4595...,N23446-0 N50872-0 N1952-0 N45523-0 N38779-0 N3...,1.573733e+09
124875,124876,U43003,11/14/2019 12:59:55 PM,N13427 N16158 N16233 N42526 N7422 N9226 N55743...,N63060-0 N38779-0 N10960-0 N1539-0 N9284-0 N34...,1.573733e+09


#### Evaluate Feature-Based Model

In [46]:
%run feature_based.ipynb

In [47]:
#X_j, news_id_to_index = create_all_item_vectors(news, behavior)
X_j= create_all_x_j()

evaluations_ndcg = []
evaluations_pak = []
t0 = float(behavior["Timestamp"][0])
tn = float(behavior["Timestamp"][len(behavior["Timestamp"])-1])
split_ratio = 2/3
dt = (tn-t0)/10
k = 5

while t0 <= tn-dt:
    tsplit = t0 + dt*split_ratio
    t1 = t0 + dt

    train_view = get_view(behavior, t0, tsplit)
    test_view = get_view(behavior, tsplit, t1)

    users = get_users(test_view, train_view)

    for user in users:
        #x_i = create_user_vector(user, X_j, train_view)
        #x_i = create_x_i(user, train_view)
        prediction = pd.DataFrame(find_top_k_articles(user, X_j, 10, news_id_to_index), columns=["News ID", "Score"])
        response = get_impressions(user, test_view)
        #prediction = pd.DataFrame(score(x_i, X_j, response["News ID"]), columns=["News ID", "Score"])
        pred_resp = prediction.join(response.set_index("News ID"), on="News ID", how="inner")
        try:
            evaluation = ndcg_score(np.array([pred_resp["Response"].to_numpy()]), np.array([pred_resp["Score"].to_numpy()]))
            evaluations_ndcg.append(evaluation)
            k_slice = pred_resp["Response"].iloc[:k]
            evaluation = k_slice.sum()/min(len(k_slice), k)
            evaluations_pak.append(evaluation)
            print("Predictions for user " + user + " evaluated!")
        except:
            print("Eval failed")
            pass

    print("moving timeframe")
    t0 = tsplit

moving timeframe
moving timeframe
moving timeframe
Predictions for user U85982 evaluated!
Predictions for user U31142 evaluated!
Predictions for user U9235 evaluated!
Predictions for user U56223 evaluated!
Predictions for user U23595 evaluated!
Predictions for user U48683 evaluated!
Predictions for user U66447 evaluated!
Predictions for user U18291 evaluated!
Predictions for user U24591 evaluated!
Predictions for user U52342 evaluated!
Predictions for user U83533 evaluated!
Predictions for user U32479 evaluated!
Predictions for user U11828 evaluated!
Predictions for user U78924 evaluated!
Predictions for user U40260 evaluated!
Predictions for user U13061 evaluated!
Predictions for user U90567 evaluated!
Predictions for user U58141 evaluated!
Predictions for user U58191 evaluated!


KeyboardInterrupt: 

In [None]:
print("nDCG:")
print(sum(evaluations_ndcg)/len(evaluations_ndcg))
print("Precision at K:")
print(sum(evaluations_pak)/len(evaluations_pak))


#### Evaluate Collaborative Filtering Model

In [None]:
%run item_collab_filtering.ipynb

In [None]:
evaluations_ndcg = []
evaluations_pak = []
t0 = float(behavior["Timestamp"][0])
tn = float(behavior["Timestamp"][len(behavior["Timestamp"])-1])
split_ratio = 2/3
dt = (tn-t0)/10
k = 5

while t0 <= tn-dt:
    tsplit = t0 + dt*split_ratio
    t1 = t0 + dt

    train_view = get_view(behavior, t0, tsplit)
    test_view = get_view(behavior, tsplit, t1)

    model = train_collaborative_filtering_model(train_view)
    users = get_users(test_view, train_view)

    for user in users:
        prediction = get_top_n_recommendations(user, model, N=10).toPandas()
        response = get_impressions(user, test_view)
        pred_resp = prediction.join(response.set_index("News ID"), on="news_id", how="inner")
        try:
            evaluation = ndcg_score(np.array([pred_resp["Response"].to_numpy()]), np.array([pred_resp["Score"].to_numpy()]))
            evaluations_ndcg.append(evaluation)
            k_slice = pred_resp["Response"].iloc[:k]
            evaluation = k_slice.sum()/min(len(k_slice), k)
            evaluations_pak.append(evaluation)
            print("Predictions for user " + user + " evaluated!")
        except:
            print("Eval failed")
            pass

    print("moving timeframe")
    t0 = tsplit

In [None]:
print("nDCG:")
print(sum(evaluations_ndcg)/len(evaluations_ndcg))
print("Precision at K:")
print(sum(evaluations_pak)/len(evaluations_pak))