In [1]:
# import libraries and datasets

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import log2 as lg
from sklearn.metrics import ndcg_score
from sklearn.metrics import top_k_accuracy_score
from datetime import datetime as datetime

In [2]:
# Get all impressions for a user and add them to a df
def get_impressions(userID, behavior_view):
    test = []
    for index, row in behavior_view.iterrows():
        if row["User ID"] == userID:
            for impression in row["Impressions"].split(" "):
                imp = impression.split("-")
                if imp[1] == "1":
                    test.append((imp[0], 1))
                else:
                    test.append((imp[0], 0))
    return pd.DataFrame(test, columns=["News ID", "Response"])


# Join response to our predictions in order to sort them before evaluation
def create_evaluation_data(scored_data, userID):
    return scored_data.join(get_impressions(userID).set_index("News ID"), how="inner", on="News ID")


# Get dict of all users
def get_users(behavior_view):
    users = {}
    for index, row in behavior_view.iterrows():
        users[row["User ID"]] = True
    return users.keys()

#userList = __get_users()

In [None]:
def DCGi(scored):
    dcgi = scored.iloc[0]["Response"]

    for index, row in scored.iloc[1:].iterrows():
        dcgi += row["Response"]/lg(index+1)
    
    return dcgi



def nDCG(scored_data, userID):
    scored = create_evaluation_data(scored_data, userID)
    dcg = DCGi(scored)
    n = 1
    for index, row in scored.iloc[1:].iterrows():
        if row["Response"] == 1:
            n += 1/lg(index+1)
    return dcg/n


In [13]:
%run feature_based.ipynb

In [14]:
# New evaluator using library
behavior = pd.read_csv("data/MINDsmall_train/behaviors.tsv", sep="\t", header=None, names=["Impression ID", "User ID", "Time", "History", "Impressions"])
news = pd.read_csv("data/MINDsmall_train/news.tsv", sep="\t", header=None, names=["News ID", "Category", "Subcategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities", "Title Topics", "Abstract Topics"])

In [15]:
def str_to_timestamp(str):
    return datetime.strptime(str, "%m/%d/%Y %H:%M:%S %p").timestamp()

timestamps = behavior["Time"].apply(str_to_timestamp)
behavior["Time"] = timestamps
behavior.sort_values(by="Time")

Unnamed: 0,Impression ID,User ID,Time,History,Impressions
103760,103761,U68089,1.573258e+09,N138 N29177 N28850 N22745 N55326 N53100 N33969...,N52000-0 N41881-0 N60374-0 N5442-0 N51398-0 N5...
42450,42451,U56509,1.573258e+09,N51214 N60979 N9293 N4786 N13380 N14149 N2155,N63685-0 N34799-0 N26130-0 N51378-0 N7891-0 N6...
116953,116954,U34617,1.573258e+09,N11863 N44310 N31064,N52000-0 N41881-0 N27845-1 N47020-0 N51398-0
26087,26088,U11984,1.573258e+09,N39074 N3501 N31457 N61864 N3493 N25971 N29718...,N47020-1 N27845-0 N41881-0 N51398-0 N52000-0
14464,14465,U74966,1.573258e+09,N39074 N19760 N20530 N58668 N44495 N20039 N339...,N59852-0 N47020-0 N39115-0 N58051-0 N37088-0 N...
...,...,...,...,...,...
95717,95718,U6300,1.573733e+09,N28296 N34087 N37942 N27311 N35022 N42620,N50872-0 N25165-0 N29212-0 N32567-0 N16439-0 N...
140318,140319,U15094,1.573733e+09,N33038 N19494 N54377 N21242 N29499 N55743 N330...,N56142-0 N23446-0 N19661-1 N18529-0 N41387-0 N...
107052,107053,U80707,1.573733e+09,N33358 N8887 N55922 N23554 N30578 N48904 N4595...,N23446-0 N50872-0 N1952-0 N45523-0 N38779-0 N3...
124875,124876,U43003,1.573733e+09,N13427 N16158 N16233 N42526 N7422 N9226 N55743...,N63060-0 N38779-0 N10960-0 N1539-0 N9284-0 N34...


In [16]:
dt = 1000

def get_view(t0, t1):
    df = behavior[(behavior["Time"] >= t0) & (behavior["Time"] < t1)]
    return df

X_j = create_all_x_j()

In [46]:
evaluations_ndcg = []
t0 = float(behavior["Time"][0])
tn = float(behavior["Time"][len(behavior["Time"])-1])
split_ratio = 2/3
dt = (tn-t0)/100
k = 5

while t0 < tn:
    tsplit = t0 + dt*split_ratio
    t1 = t0 + dt

    train_view = get_view(t0, tsplit)
    test_view = get_view(tsplit, t1)

    users = get_users(train_view)

    for user in users:
        if user in test_view["User ID"].values:
            x_i = create_x_i(user, train_view)
            prediction = pd.DataFrame(score(x_i), columns=["News ID", "Score"])
            response = get_impressions(user, test_view)
            pred_resp = prediction.join(response.set_index("News ID"), on="News ID", how="inner")

            try:
                evaluation = ndcg_score(np.array([pred_resp["Response"].to_numpy()]), np.array([pred_resp["Score"].to_numpy()]))
                evaluations_ndcg.append(evaluation)
                evaluation = top_k_accuracy_score(np.array([pred_resp["Response"].to_numpy()]), np.array([pred_resp["Score"].to_numpy()]), k)
                print("Predictions for user " + user + " evaluated!")
            except:
                print("Eval failed")
                pass

    print("moving timeframe")
    t0 = t1

Predictions for user U73551 evaluated!
Predictions for user U30988 evaluated!
Predictions for user U8673 evaluated!
Predictions for user U91640 evaluated!
Predictions for user U54806 evaluated!
Predictions for user U52496 evaluated!
Predictions for user U37854 evaluated!
Predictions for user U68925 evaluated!
Predictions for user U41595 evaluated!
Predictions for user U86539 evaluated!
Predictions for user U27440 evaluated!
Predictions for user U79508 evaluated!
Predictions for user U54826 evaluated!
Predictions for user U70879 evaluated!


In [45]:
print(sum(evaluations_ndcg)/len(evaluations_ndcg))
#print(get_view(float(behavior["Time"][0]), float(behavior["Time"][0])+(float(behavior["Time"][len(behavior["Time"])-1])-float(behavior["Time"][0]))/10))

0.4920403583681785


In [12]:
print(behavior["History"])
for index, row in behavior.iterrows():
    if type(row["History"]) != type("str"):
        print(index, row["History"])

0         N55189 N42782 N34694 N45794 N18445 N63302 N104...
1         N31739 N6072 N63045 N23979 N35656 N43353 N8129...
2         N10732 N25792 N7563 N21087 N41087 N5445 N60384...
3         N45729 N2203 N871 N53880 N41375 N43142 N33013 ...
4                               N10078 N56514 N14904 N33740
                                ...                        
156960    N7432 N58559 N1954 N43353 N14343 N13008 N28833...
156961    N9803 N104 N24462 N57318 N55743 N40526 N31726 ...
156962    N29898 N59704 N4408 N9803 N53644 N26103 N812 N...
156963    N4118 N47297 N3164 N43295 N6056 N38747 N42973 ...
156964                                        N22997 N48742
Name: History, Length: 156965, dtype: object
28 nan
96 nan
155 nan
194 nan
257 nan
358 nan
433 nan
651 nan
738 nan
864 nan
866 nan
897 nan
946 nan
961 nan
1082 nan
1101 nan
1121 nan
1216 nan
1332 nan
1362 nan
1533 nan
1686 nan
1855 nan
1861 nan
1872 nan
1924 nan
1935 nan
1975 nan
1996 nan
2053 nan
2165 nan
2170 nan
2284 nan
2287 nan
2315 

KeyboardInterrupt: 