In [1]:
import pandas as pd
import numpy as np
import pickle 
from tqdm import tqdm 
from collections import Counter,OrderedDict
from sklearn.metrics.pairwise import cosine_similarity
import nltk 
from nltk.corpus import stopwords
import string  
from collections import OrderedDict
nltk.download("stopwords")

def flatten_comprehension(matrix):
    #https://realpython.com/python-flatten-list/
    return [item for row in matrix for item in row]

# Load data
behaviors = pd.read_csv('./small_training_data/behaviors.tsv', delimiter='\t', header=None)
news = pd.read_csv('./small_training_data/news.tsv', delimiter='\t', header=None)

# Naming columns
behaviors.columns = ["impression_id", "user_id", "time", "history", "impressions"]
news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]


# Extracting clicked news from behaviors, this is a column of lists of the clicked news (tagget with 1) for each impression
behaviors['clicked_news'] = behaviors['impressions'].apply(lambda x: [imp.split('-')[0] for imp in x.split() if imp.split('-')[1] == '1'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fabia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#define unique users and their read articles
unique_users_series = behaviors["user_id"].drop_duplicates()
unique_users_dict = {}  #user_id coupled to news_articles

#aggregate all read articles and couple it to the user in the unique_users_frame
i = 0
for user_id in tqdm(unique_users_series,ascii=True):
    user_specific_behavior = behaviors[behaviors["user_id"] == user_id]
    try:
        history_list = flatten_comprehension([elem.split(" ") for elem in user_specific_behavior["history"]])
    except:
        pass 
    clicked_list = flatten_comprehension([elem for elem in user_specific_behavior["clicked_news"]])
    all_read_articles = history_list + clicked_list
    unique_users_dict[user_id] = all_read_articles


with open('unique_users.pkl', 'wb') as file: #serialize so we don't have to do this inefficient code again
    pickle.dump(unique_users_dict,file)



100%|##########| 50000/50000 [10:16<00:00, 81.17it/s] 


In [None]:
#define the embedding. (what words we look for)

def remove_punctuation(sentence:str) -> str:
    sentence = sentence.replace("'","")
    for elem in string.punctuation:
        sentence = sentence.replace(elem," ")
    return sentence

def remove_stopwords(sentence:str)-> str:
    stop_words = list(set(stopwords.words("english")))
    sentence= [ elem for elem in sentence.split(" ") if elem not in stop_words]
        
    return " ".join(sentence)

def preprocess(sentence:str) -> Counter:
    counter = Counter(remove_punctuation(remove_stopwords(sentence.lower())).split(" "))
    counter[''] =0 
    return counter



    

In [None]:
# embedding_words = [elem[0] for elem in sorted_BOW[:1000]]

with open('unique_users.pkl', 'rb') as file:
    users = pickle.load(file)

#make the dict ordered and sort it on the number of clicks grab top 300 users with most clicks
selected_users = sorted(OrderedDict(users),key=lambda x: len(users[x]),reverse=True) 
users = {user:users[user] for user in selected_users[:300]}

user_BOW = {} #{"user_id":embedding,etc.}

#compute user embeddings
for user in tqdm(users,ascii=True):

    news_ids = users[user]    
    counter =preprocess(news["title"][news["news_id"].isin(news_ids)].map(str.lower).apply(lambda x: x +" ").agg("sum"))
    
    user_BOW[user] = counter

with open('userBOW.pkl', 'wb') as file: #serialize so we don't have to do this inefficient code again
    pickle.dump(user_BOW,file)
    

100%|##########| 300/300 [00:02<00:00, 109.01it/s]


In [None]:
with open('userBOW.pkl', 'rb') as file:
    user_BOW = pickle.load(file)

user_BOW = OrderedDict(user_BOW)
k = 8  # size of the groups

def Jaccard_similarity(bow_user1, bow_user2):
    intersection = sum((bow_user1 & bow_user2).values())
    union = sum((bow_user1 + bow_user2).values())
    return intersection / union

# Get the first 300
k_most_similar = {}  # {user1: [k * user2], etc.}
for user1, bow_user1 in user_BOW.items():
    similarity = OrderedDict({})

    for user2, bow_user2 in user_BOW.items():
        if user1 != user2:
            similarity[user2] = Jaccard_similarity(bow_user1, bow_user2)
    k_most_similar[user1] = [user for user in sorted(similarity, key=lambda x: similarity[x])[:k]]

grouped_dict = {}
for i, (user, similar_users) in enumerate(k_most_similar.items(), start=1):
    group_key = f"Group {i}"
    grouped_dict[group_key] = similar_users


# Print the k_most_similar dictionary
for user, similar_users in k_most_similar.items():
    print(f"{user}: {similar_users}")


U63482: ['U62470', 'U67478', 'U1331', 'U57214', 'U64554', 'U3394', 'U3713', 'U19299']
U20833: ['U69084', 'U31631', 'U25381', 'U1331', 'U64554', 'U19299', 'U20186', 'U46689']
U79449: ['U69084', 'U57214', 'U31631', 'U62470', 'U1331', 'U67478', 'U46689', 'U4929']
U79210: ['U69084', 'U31631', 'U67478', 'U64554', 'U1331', 'U57214', 'U20186', 'U46689']
U32146: ['U31631', 'U20186', 'U3713', 'U84756', 'U79549', 'U54826', 'U79816', 'U75154']
U84756: ['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643', 'U80596']
U72489: ['U69084', 'U67478', 'U1331', 'U4929', 'U57214', 'U62470', 'U21980', 'U88934']
U67455: ['U69084', 'U4929', 'U62470', 'U1331', 'U57214', 'U67478', 'U20995', 'U3394']
U44210: ['U62470', 'U19299', 'U60879', 'U16144', 'U9438', 'U70879', 'U17326', 'U31631']
U55621: ['U31631', 'U46689', 'U20186', 'U1331', 'U93377', 'U75154', 'U64554', 'U70879']
U93306: ['U31631', 'U70879', 'U46689', 'U1331', 'U20186', 'U69084', 'U8568', 'U25381']
U72280: ['U69084', 'U1331', 'U4929', 'U

In [None]:
grouped_dict["Group 300"]

['U84756', 'U91426', 'U43884', 'U7736', 'U72489', 'U20186', 'U49572', 'U87148']

In [None]:
"""
NOTE 

instead of cosine similarity we use Jaccard similarity J (https://stats.stackexchange.com/questions/289400/quantify-the-similarity-of-bags-of-words)
because the BOW embeddings are really sparse except if you use to common words ( "a", "is", "of",etc) 
This would be counter intuitive to use. 

"""