In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

from sklearn.model_selection import train_test_split

### Load MIND train and MIND Dev set

In [2]:
news_train = pd.read_csv('Dataset/MINDsmall_train/news.tsv',sep='\t', header=None)
news_train = news_train.rename(columns={0:'News ID', 1:'Category', 2:'SubCategory', 3:'Title', 4:'Abstract', 5:'URL', 6:'Title Entities', 7:'Abstract Entities'})
print("Shape of news data: ",news_train.shape)

Shape of news data:  (51282, 8)


In [3]:
behaviours_train = pd.read_csv('Dataset/MINDsmall_train/behaviors.tsv',sep='\t', header=None)
print(behaviours_train.shape)

(156965, 5)


In [4]:
behaviours_train = behaviours_train.rename(columns={0:'Impression ID', 1:'User ID', 2:'Time', 3:'History', 4:'Impressions'})

In [5]:
news_valid = pd.read_csv('Dataset/MINDsmall_dev/news.tsv',sep='\t', header=None)
print("Shape of news data: ",news_valid.shape)

news_valid = news_valid.rename(columns={0:'News ID', 1:'Category', 2:'SubCategory', 3:'Title', 4:'Abstract', 5:'URL', 6:'Title Entities', 7:'Abstract Entities'})

Shape of news data:  (42416, 8)


In [6]:
behaviours_valid = pd.read_csv('Dataset/MINDsmall_dev/behaviors.tsv',sep='\t', header=None)
print(behaviours_valid.shape)
behaviours_valid = behaviours_valid.rename(columns={0:'Impression ID', 1:'User ID', 2:'Time', 3:'History', 4:'Impressions'})

(73152, 5)


##### Concatenate MIND train and MIND Dev

In [7]:
news_concat = pd.concat([news_train, news_valid], ignore_index=True)

In [8]:
behaviours_concat = pd.concat([behaviours_train, behaviours_valid], ignore_index=True)

##### Prepare data: encode concatenated behaviour data and news data

In [10]:
behaviours_encoded_concat = pd.DataFrame(behaviours_concat, columns=['Impression ID', 'User ID', 'Time', 'History', 'Impressions'])

In [11]:
# Index all users' IDs in behaviours.csv
unique_users = behaviours_concat["User ID"].unique()
user_id_dic = {i : item for i, item in enumerate(unique_users)}
user_keys = list(user_id_dic.keys())
user_values = list(user_id_dic.values())
 
list_user_ids = []
for x in behaviours_concat["User ID"]:
    for i in range(len(user_values)):
        if x == user_values[i]:
            list_user_ids.append(user_keys[i])
            break
print("length of list_user_ids: ", len(list_user_ids))

behaviours_encoded_concat["User ID"] = list_user_ids

# userid_to_index = { userid: index for index, userid in enumerate(unique_users)}

length of list_user_ids:  230117


In [12]:
# Index News articles in news.csv
id_to_index = {news_id: index for index, news_id in enumerate(news_concat['News ID'])}

# Index history
history = behaviours_concat['History'].str.split()
behaviours_encoded_concat['History'] = history.apply(lambda x: [id_to_index[id] for id in x] if isinstance(x, list) else x)

In [13]:
# ref: https://www.kaggle.com/code/danielpleus/mind-recommender-from-scratch
def process_impression(impression):
    list_of_strgs = impression.split(" ")
    item_tuple = [l.split("-") for l in list_of_strgs]
    noclicks = []
    clicks = []
    
    for item in item_tuple:
        if item[1] =='0':
            noclicks.append(item[0])
        if item[1] =='1':
            clicks.append(item[0])
    return noclicks, clicks

behaviours_encoded_concat['Noclicks'], behaviours_encoded_concat['Clicks'] = zip(*behaviours_concat['Impressions'].map(process_impression))

behaviours_encoded_concat['Noclicks'] = behaviours_encoded_concat['Noclicks'].map(lambda list_of_strings: [id_to_index[id] for id in list_of_strings])
behaviours_encoded_concat['Clicks'] = behaviours_encoded_concat['Clicks'].map(lambda list_of_strings: [id_to_index[id] for id in list_of_strings])

In [20]:
# convert given time format to hours
behaviours_encoded_concat["Time"] = pd.to_datetime(behaviours_concat['Time']).values.astype(np.int64)/(1e6)/1000/3600
behaviours_encoded_concat["Time"] = behaviours_encoded_concat["Time"].round()

# behaviours_encoded_concat.explode("Clicks").reset_index(drop=True)

In [22]:
behaviours_encoded_concat.to_pickle('behaviours_encoded_concat.df')

### Sp

In [26]:
# Let us use the last 0.1 of the data as our validation data using quantile:
test_time_th = behaviours_encoded_concat['Time'].quantile(0.9)
behaviour_encoded_train = behaviours_encoded_concat[behaviours_encoded_concat['Time']< test_time_th]
behaviour_encoded_valid =  behaviours_encoded_concat[behaviours_encoded_concat['Time']>= test_time_th]

In [33]:
behaviour_encoded_train.to_pickle('behaviour_encoded_train.df')

In [34]:
behaviour_encoded_valid.to_pickle('behaviour_encoded_valid.df')

In [30]:
len(behaviour_encoded_train['User ID'].unique())

81336

In [38]:
behaviour_encoded_train.shape

(204082, 7)

In [31]:
# News data
# Index News articles in news.csv
newsids = [index for index, news_id in enumerate(news_concat['News ID'])]

In [39]:
newsid_encoded_concat = pd.DataFrame(newsids)

In [46]:
newsid_encoded_concat.to_pickle('newsid_encoded_concat.df')

In [47]:
newsid_encoded_train, newsid_encoded_valid = train_test_split(news_concat, test_size=0.1)

In [48]:
newsid_encoded_train.shape

(84328, 8)

In [49]:
newsid_encoded_train.to_pickle('newsid_encoded_train.df')
newsid_encoded_valid.to_pickle('newsid_encoded_valid.df')