In [3]:
import pandas as pd
import numpy as np

from scipy.sparse import save_npz, load_npz

news_dev = pd.read_csv("data/MINDsmall_dev/news.tsv", sep="\t", header=None, names=["News ID", "Category", "Subcategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities", "Title Topics", "Abstract Topics"])
behavior_dev = pd.read_csv("data/MINDsmall_dev/behaviors.tsv", sep="\t", header=None, names=["Impression ID", "User ID", "Time", "History", "Impressions"])
behavior_train = pd.read_csv("data/MINDsmall_train/behaviors.tsv", sep="\t", header=None, names=["Impression ID", "User ID", "Time", "History", "Impressions"])
news_train = pd.read_csv("data/MINDsmall_train/news.tsv", sep="\t", header=None, names=["News ID", "Category", "Subcategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities", "Title Topics", "Abstract Topics"])

#get all news so that item vectors have the same dimension
news_all = pd.concat([news_dev, news_train], ignore_index=True).drop_duplicates().reset_index(drop=True)

In [1]:
%run feature_based.ipynb

In [4]:
#Fill missing abstracts with placeholder
news_dev['Abstract'].fillna('No abstract available', inplace=True)
news_train['Abstract'].fillna('No abstract available', inplace=True)
news_all['Abstract'].fillna('No abstract available', inplace=True)


# if there are rows with no impressions, drop them
behavior_dev = behavior_dev.dropna(subset=['Impressions']) # this looses some user information, could instead manually overwrite and fill in the missing values based on the typo combining the impression and history columns
behavior_dev = behavior_train.dropna(subset=['Impressions']) # this looses some user information, could instead manually overwrite and fill in the missing values based on the typo combining the impression and history columns

In [5]:
item_vectors, news_id_to_index, vectorizer = create_all_item_vectors(news_all)

In [6]:
# run if you dont have the data. 

X_train, y_train = prepare_training_data(behavior_train, item_vectors, news_id_to_index)

#save to avoid rerunning the above code as it takes a long time
save_npz("data/test/X_train.npz", X_train)
np.save("data/test/y_train.npy", y_train)

X_dev, y_dev = prepare_training_data(behavior_dev, item_vectors, news_id_to_index, history=True)

save_npz("data/test/X_dev.npz", X_dev)
np.save("data/test/y_dev.npy", y_dev)

In [17]:
#load the data
X_train = load_npz("data/test/X_train.npz")
y_train = np.load("data/test/y_train.npy")
X_dev = load_npz("data/test/X_dev.npz")
y_dev = np.load("data/test/y_dev.npy")

In [18]:

model = train_final_model(X_train, y_train)

y_pred = model.predict(X_dev)
classification = classification_report(y_dev, y_pred)
print(classification)

              precision    recall  f1-score   support

           0       0.98      0.94      0.96     10320
           1       0.79      0.91      0.84      2512

    accuracy                           0.93     12832
   macro avg       0.88      0.92      0.90     12832
weighted avg       0.94      0.93      0.94     12832

