## Evaluation

##### How to run:

Run all cells up to, and including, the loading data section.

Run all cells in the section belonging to the recommender you want to evaluate.


## Required packages:

##### All:
 - pandas
 - numpy
 - scikit-learn

##### Collaborative Filtering:
 - pyspark

In [None]:
# import libraries

import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score
from datetime import datetime as datetime

#### Defining Helper Functions

In [None]:
def get_impressions(userID, behavior_view):
    """Collect every impression shown to one user as (news ID, click label) rows.

    Each entry of the "Impressions" column is read as a whitespace-separated
    list of ``<news id>-<label>`` tokens. A label of "1" is stored as
    response 1, any other label as response 0.

    Parameters
    ----------
    userID
        Value compared against the "User ID" column.
    behavior_view : pd.DataFrame
        Frame with at least the columns "User ID" and "Impressions".

    Returns
    -------
    pd.DataFrame
        Columns "News ID" and "Response", one row per impression token, in the
        order the tokens occur in ``behavior_view``. The frame is empty (but
        still has both columns) when the user has no usable impressions.
    """
    # Pick the user's rows with a boolean mask instead of walking the whole
    # frame with iterrows() on every call.
    user_impressions = behavior_view.loc[behavior_view["User ID"] == userID, "Impressions"]

    records = []
    # dropna(): a missing impression list carries nothing to evaluate.
    for impression_log in user_impressions.dropna():
        # split() without an argument ignores repeated/trailing whitespace,
        # which split(" ") would turn into empty tokens.
        for token in impression_log.split():
            news_id, separator, label = token.partition("-")
            if not separator:
                # Token has no "-<label>" part, so there is no response to record.
                continue
            records.append((news_id, 1 if label == "1" else 0))
    return pd.DataFrame(records, columns=["News ID", "Response"])


def get_users(view1, view2):
    """Return the distinct user IDs that occur in both behavior views.

    Parameters
    ----------
    view1, view2 : pd.DataFrame
        Frames with a "User ID" column.

    Returns
    -------
    numpy.ndarray
        Unique user IDs present in both views, in order of first appearance
        in ``view1``.
    """
    # An inner merge on a non-unique key materialises rows(view1) x rows(view2)
    # combinations per user just to throw them away with unique(); a membership
    # filter yields the same set of IDs without that blow-up.
    left_users = view1["User ID"]
    return left_users[left_users.isin(view2["User ID"])].unique()

def get_view(behavior, t0, t1):
    """Return the rows of ``behavior`` whose "Timestamp" lies in the half-open window [t0, t1).

    The original index labels of the selected rows are kept.
    """
    stamps = behavior["Timestamp"]
    inside_window = (stamps >= t0) & (stamps < t1)
    return behavior[inside_window]

#### Loading Data

In [None]:
# The .tsv files are read with header=None, so the column names are supplied here.
BEHAVIOR_COLUMNS = ["Impression ID", "User ID", "Time", "History", "Impressions"]
NEWS_COLUMNS = [
    "News ID",
    "Category",
    "Subcategory",
    "Title",
    "Abstract",
    "URL",
    "Title Entities",
    "Abstract Entities",
    "Title Topics",
    "Abstract Topics",
]

# User behavior log: one row per impression session.
behavior = pd.read_csv(
    "data/MINDsmall_train/behaviors.tsv",
    sep="\t",
    header=None,
    names=BEHAVIOR_COLUMNS,
)

# News article metadata.
news = pd.read_csv(
    "data/MINDsmall_train/news.tsv",
    sep="\t",
    header=None,
    names=NEWS_COLUMNS,
)

In [None]:
def str_to_timestamp(str):
    """Convert a "MM/DD/YYYY h:mm:ss AM/PM" string into POSIX seconds.

    Parameters
    ----------
    str : str
        Time string such as "11/13/2019 8:36:57 PM". (The parameter name
        shadows the builtin inside this function; it is kept so existing
        callers are unaffected.)

    Returns
    -------
    float
        Seconds since the epoch. The parsed datetime is naive, so
        ``timestamp()`` interprets it in the machine's local time zone.
    """
    # %I (12-hour clock) is required for %p to take effect: with %H, strptime
    # parses the AM/PM marker but ignores it, so "8:36:57 PM" became 08:36:57.
    return datetime.strptime(str, "%m/%d/%Y %I:%M:%S %p").timestamp()

# Numeric timestamp (seconds) for every behavior row, used for the time-gated views.
timestamps = behavior["Time"].apply(str_to_timestamp)
behavior["Timestamp"] = timestamps

# Fill missing abstracts with a placeholder.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is a chained in-place update that pandas warns about and that does
# not reach the parent frame under copy-on-write.
news["Abstract"] = news["Abstract"].fillna("No abstract available")

# If there are rows with no impressions, drop them.
# This loses some user information; the missing values could instead be filled
# in manually, based on the typo that combines the impression and history columns.
behavior = behavior.dropna(subset=["Impressions"])

# sort_values returns a new frame, so the result must be assigned for the sort
# to have any effect. The index is then rebuilt so that label 0 is the earliest
# row and label len(behavior)-1 the latest, which the evaluation cells rely on.
behavior = behavior.sort_values(by="Timestamp").reset_index(drop=True)

#### Evaluate Feature-Based Model

In [None]:
# Importing functions from feature_based.ipynb
# Comment out the function calls in feature_based.ipynb (except the pd.set_option calls) before running this cell
%run feature_based.ipynb

In [None]:
# Item vectors and the news-ID lookup produced by feature_based.ipynb.
X_j, news_id_to_index = create_all_item_vectors(news)

# Per-user scores collected by the sliding-window loop.
evaluations_ndcg = []
evaluations_pak = []

# Earliest and latest timestamp in the data.
# min()/max() are used instead of the label lookups [0] and [len-1]: those only
# give the time range if the frame is sorted with a gap-free 0..n-1 index, and
# raise KeyError when a dropped row removed one of the labels.
t0 = float(behavior["Timestamp"].min())
tn = float(behavior["Timestamp"].max())

split_ratio = 2/3   # share of each window used as the train view; the rest is the test view
dt = (tn-t0)/5      # window length: one fifth of the covered time span
k = 5               # cut-off for precision at k

In [None]:
# Loop that runs the sliding window.
# Every window of length dt is split into a train view [t0, tsplit) and a test
# view [tsplit, t1); the window then advances so the next one starts at tsplit.

while t0 <= tn-dt:
    # Splitting into train and test views
    tsplit = t0 + dt*split_ratio
    t1 = t0 + dt

    train_view = get_view(behavior, t0, tsplit)
    test_view = get_view(behavior, tsplit, t1)

    # Finding users that have impressions in both the test and train view
    users = get_users(test_view, train_view)

    for user in users:
        # 600000 is the number of articles requested - presumably large enough
        # to score every article; TODO confirm against feature_based.ipynb.
        prediction = pd.DataFrame(find_top_k_articles(user, X_j, 600000, news_id_to_index, train_view), columns=["News ID", "Score"])
        response = get_impressions(user, test_view)

        # Keep only articles that were both scored and shown to the user in the test view.
        pred_resp = prediction.join(response.set_index("News ID"), on="News ID", how="inner")
        # Precision at k reads the first k rows, so rank by score explicitly
        # rather than relying on the order the recommender returned.
        pred_resp = pred_resp.sort_values("Score", ascending=False, kind="stable")

        try:
            evaluation = ndcg_score(np.array([pred_resp["Response"].to_numpy()]), np.array([pred_resp["Score"].to_numpy()]))
        except ValueError:
            # ndcg_score rejects inputs it cannot rank (e.g. no matching articles);
            # such users are skipped. Catching only ValueError keeps
            # KeyboardInterrupt and genuine bugs visible.
            continue

        evaluations_ndcg.append(evaluation)
        # Precision at k: share of clicked articles among the k best-scored ones.
        k_slice = pred_resp["Response"].iloc[:k]
        evaluations_pak.append(k_slice.sum()/len(k_slice))

    print("moving timeframe")
    t0 = tsplit

In [None]:
# Mean scores over all evaluated users.
# nan is printed instead of raising ZeroDivisionError when no user could be evaluated.
print("nDCG:")
print(sum(evaluations_ndcg)/len(evaluations_ndcg) if evaluations_ndcg else float("nan"))
print("Precision at K:")
print(sum(evaluations_pak)/len(evaluations_pak) if evaluations_pak else float("nan"))


#### Evaluate Collaborative Filtering Model

In [None]:
# Importing functions from item_collab_filtering.ipynb
# Comment out the function calls in item_collab_filtering.ipynb (except for the call that initiates the Spark engine) before running this cell
%run item_collab_filtering.ipynb

In [None]:
# Per-user scores collected by the sliding-window loop.
evaluations_ndcg = []
evaluations_pak = []
# Earliest and latest timestamp in the data.
# min()/max() are used instead of the label lookups [0] and [len-1]: those only
# give the time range if the frame is sorted with a gap-free 0..n-1 index, and
# raise KeyError when a dropped row removed one of the labels.
t0 = float(behavior["Timestamp"].min())
tn = float(behavior["Timestamp"].max())
split_ratio = 2/3   # share of each window used as the train view; the rest is the test view
dt = (tn-t0)/10     # window length: one tenth of the covered time span
k = 5               # cut-off for precision at k

In [None]:
# Loop that runs the sliding window.
# Every window of length dt is split into a train view [t0, tsplit) and a test
# view [tsplit, t1); the window then advances so the next one starts at tsplit.

while t0 <= tn-dt:
    # Splitting into train and test views
    tsplit = t0 + dt*split_ratio
    t1 = t0 + dt

    train_view = get_view(behavior, t0, tsplit)
    test_view = get_view(behavior, tsplit, t1)

    # Finding users that have impressions in both the test and train view
    users = get_users(test_view, train_view)

    # Train the model on the train view only
    model = train_collaborative_filtering_model(train_view)

    # Can swap which for loop you iterate through, if iterating over all users takes too long
    #for user in users[:500]:
    for user in users:
        # N=600000 is the number of recommendations requested - presumably large
        # enough to score every article; TODO confirm against item_collab_filtering.ipynb.
        prediction = get_top_n_recommendations(user, model, N=600000).toPandas()
        response = get_impressions(user, test_view)
        # Keep only articles that were both scored and shown to the user in the test view.
        pred_resp = prediction.join(response.set_index("News ID"), on="news_id", how="inner")
        # Precision at k reads the first k rows, so rank by score explicitly
        # rather than relying on the order the recommender returned.
        pred_resp = pred_resp.sort_values("score", ascending=False, kind="stable")

        try:
            evaluation = ndcg_score(np.array([pred_resp["Response"].to_numpy()]), np.array([pred_resp["score"].to_numpy()]))
        except ValueError:
            # ndcg_score rejects inputs it cannot rank (e.g. no matching articles);
            # such users are skipped. Catching only ValueError keeps
            # KeyboardInterrupt and genuine bugs visible.
            continue

        evaluations_ndcg.append(evaluation)
        # Precision at k: share of clicked articles among the k best-scored ones.
        k_slice = pred_resp["Response"].iloc[:k]
        evaluations_pak.append(k_slice.sum()/len(k_slice))

    print("moving timeframe")
    t0 = tsplit

In [None]:
# Mean scores over all evaluated users.
# nan is printed instead of raising ZeroDivisionError when no user could be evaluated.
print("nDCG:")
print(sum(evaluations_ndcg)/len(evaluations_ndcg) if evaluations_ndcg else float("nan"))
print("Precision at K:")
print(sum(evaluations_pak)/len(evaluations_pak) if evaluations_pak else float("nan"))

#### Checking against randomized scores

In [9]:
# Per-user scores collected by the sliding-window loop.
evaluations_ndcg = []
evaluations_pak = []
# Earliest and latest timestamp in the data.
# min()/max() are used instead of the label lookups [0] and [len-1]: those only
# give the time range if the frame is sorted with a gap-free 0..n-1 index, and
# raise KeyError when a dropped row removed one of the labels.
t0 = float(behavior["Timestamp"].min())
tn = float(behavior["Timestamp"].max())
split_ratio = 2/3   # share of each window used as the train view; the rest is the test view
dt = (tn-t0)/10     # window length: one tenth of the covered time span
k = 5               # cut-off for precision at k
# Fixed seed so the random baseline scores are reproducible.
np.random.seed(42)

In [10]:
# Loop that runs the sliding window.
# Every window of length dt is split into a train view [t0, tsplit) and a test
# view [tsplit, t1); the window then advances so the next one starts at tsplit.

while t0 <= tn-dt:
    # Splitting into train and test views
    tsplit = t0 + dt*split_ratio
    t1 = t0 + dt

    train_view = get_view(behavior, t0, tsplit)
    test_view = get_view(behavior, tsplit, t1)

    # Finding users that have impressions in both the test and train view
    users = get_users(test_view, train_view)

    # Assign random scores to each user's test impressions and evaluate them
    for user in users:
        pred_resp = get_impressions(user, test_view)
        pred_resp["Score"] = np.random.random(pred_resp["Response"].size)
        # Rank by the random score before taking the top k. Without this,
        # precision at k measured the first k impressions in stored order and
        # ignored the random scores entirely.
        pred_resp = pred_resp.sort_values("Score", ascending=False, kind="stable")

        try:
            evaluation = ndcg_score(np.array([pred_resp["Response"].to_numpy()]), np.array([pred_resp["Score"].to_numpy()]))
        except ValueError:
            # ndcg_score rejects inputs it cannot rank (e.g. no impressions);
            # such users are skipped. Catching only ValueError keeps
            # KeyboardInterrupt and genuine bugs visible.
            continue

        evaluations_ndcg.append(evaluation)
        # Precision at k: share of clicked articles among the k best-scored ones.
        k_slice = pred_resp["Response"].iloc[:k]
        evaluations_pak.append(k_slice.sum()/len(k_slice))

    print("moving timeframe")
    t0 = tsplit

moving timeframe
moving timeframe
moving timeframe


KeyboardInterrupt: 

In [None]:
# Mean scores over all evaluated users.
# nan is printed instead of raising ZeroDivisionError when no user could be
# evaluated (for example after the loop above was interrupted early).
print("nDCG:")
print(sum(evaluations_ndcg)/len(evaluations_ndcg) if evaluations_ndcg else float("nan"))
print("Precision at K:")
print(sum(evaluations_pak)/len(evaluations_pak) if evaluations_pak else float("nan"))