In [16]:
import pandas as pd
import numpy as np
import pickle 
from tqdm import tqdm 
from collections import Counter,OrderedDict
from sklearn.metrics.pairwise import cosine_similarity
import nltk 
from nltk.corpus import stopwords
import string  

nltk.download("stopwords")

def flatten_comprehension(matrix):
    #https://realpython.com/python-flatten-list/
    return [item for row in matrix for item in row]

# Load data
behaviors = pd.read_csv('./small_training_data/behaviors.tsv', delimiter='\t', header=None)
news = pd.read_csv('./small_training_data/news.tsv', delimiter='\t', header=None)

# Naming columns
behaviors.columns = ["impression_id", "user_id", "time", "history", "impressions"]
news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]


# Extracting clicked news from behaviors, this is a column of lists of the clicked news (tagget with 1) for each impression
behaviors['clicked_news'] = behaviors['impressions'].apply(lambda x: [imp.split('-')[0] for imp in x.split() if imp.split('-')[1] == '1'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hal9000/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'has', 'being', 'him', 'with', 'been', "you'd", 'myself', 'couldn', 'does', 'are', 'against', 'how', 'theirs', 'do', 'both', 'under', "won't", 'until', 'mustn', 'each', 'ain', 'yours', "wasn't", 'haven', 'or', "shouldn't", 'will', 'during', 'between', 'whom', 'where', 'below', 'ours', 'for', 'over', 'its', 'very', 'yourselves', 'should', 'having', 'at', 'not', 'out', 'here', "mightn't", 'ourselves', 'such', 'few', "aren't", "hadn't", 'doing', 'their', 'i', 'some', 'shan', 'as', 'the', 'she', "shan't", 'did', 'while', 'which', 'didn', "didn't", 'than', 'hadn', 's', 'of', "should've", 'doesn', 'only', "it's", "hasn't", 'you', 'yourself', 'any', 'off', 'when', 'am', 'but', 'don', 'be', "you've", 'is', 'itself', 'most', 'herself', 'into', 'weren', 'so', 'was', "couldn't", "you'll", 'to', 'down', 'just', 'll', 'in', 't', 'can', 'because', 'by', 'more', 'they', 'own', 'he', 'up', 'after', 'her', 'himself', 'all', 'hasn', 'wouldn', 'my', "that'll", 'further', 'those', 're', 'on', "isn't", 'y

In [9]:
#define unique users and their read articles
unique_users_series = behaviors["user_id"].drop_duplicates()
unique_users_dict = {}  #user_id coupled to news_articles

#aggregate all read articles and couple it to the user in the unique_users_frame
i = 0
for user_id in tqdm(unique_users_series,ascii=True):
    user_specific_behavior = behaviors[behaviors["user_id"] == user_id]
    try:
        history_list = flatten_comprehension([elem.split(" ") for elem in user_specific_behavior["history"]])
    except:
        pass 
    clicked_list = flatten_comprehension([elem for elem in user_specific_behavior["clicked_news"]])
    all_read_articles = history_list + clicked_list
    unique_users_dict[user_id] = all_read_articles


with open('unique_users.pkl', 'wb') as file: #serialize so we don't have to do this inefficient code again
    pickle.dump(unique_users_dict,file)



100%|###################################################################################################################################| 50000/50000 [04:11<00:00, 198.97it/s]


In [None]:
#define the embedding. (what words we look for)

def remove_punctuation(sentence:str) -> str:
    sentence = sentence.replace("'","")
    for elem in string.punctuation:
        sentence = sentence.replace(elem," ")
    return sentence

def remove_stopwords(sentence:str)-> str:
    stop_words = list(set(stopwords.words("english")))
    sentence= [ elem for elem in sentence.split(" ") if elem not in stop_words]
        
    return " ".join(sentence)

def preprocess(sentence:str) -> Counter:
    counter = Counter(remove_punctuation(remove_stopwords(sentence.lower())).split(" "))
    counter[''] =0 
    return counter


total_title = ""
for title in tqdm(news["title"],ascii=True):
    total_title += " "+title 
total_BOW = preprocess(total_title)


sorted_BOW = sorted(OrderedDict(total_BOW).items(),key=lambda x: x[1],reverse=True)
print(sorted_BOW)
    

100%|################################################################################################################| 51282/51282 [00:00<00:00, 2116133.74it/s]


In [None]:
embedding_words = [elem[0] for elem in sorted_BOW[:500]]

with open('unique_users.pkl', 'rb') as file:
    users = pickle.load(file)

user_BOW = {} #{"user_id":embedding,etc.}

#compute user embeddings
for user in tqdm(users,ascii=True):
    news_ids = users[user]     
    user_BOW[user] = Counter(news["title"][news["news_id"].isin(news_ids)].map(str.lower).apply(lambda x: x +" ").agg("sum").split(" "))
    
with open('userBOW.pkl', 'wb') as file: #serialize so we don't have to do this inefficient code again
    pickle.dump(user_BOW,file)

In [None]:
with open('userBOW.pkl', 'rb') as file:
    user_BOW = pickle.load(file)

ordered_userBOW = OrderedDict(user_BOW)



In [None]:
"""
NOTE 

instead of cosine similarity we use Jaccard similarity J (https://stats.stackexchange.com/questions/289400/quantify-the-similarity-of-bags-of-words)
because the BOW embeddings are really sparse except if you use to common words ( "a", "is", "of",etc) 
This would be counter intuitive to use. 

"""