In [1]:
!pip install scipy
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


importing dataset

In [3]:
import os
folder_path='/content/drive/MyDrive/CI&T'
files_in_folder = os.listdir(folder_path)
print(files_in_folder)

['shared_articles.csv', 'users_interactions.csv']


In [4]:
shared_articles=pd.read_csv('/content/drive/MyDrive/CI&T/shared_articles.csv')
user_interaction=pd.read_csv('/content/drive/MyDrive/CI&T/users_interactions.csv')


In [5]:
shared_articles=shared_articles[shared_articles['eventType']=='CONTENT SHARED']
shared_articles.head()


Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en
5,1459194522,CONTENT SHARED,-2826566343807132236,4340306774493623681,8940341205206233829,,,,HTML,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,en


In [6]:
user_interaction.head()

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,


data munging

In [7]:
event_type_strength={
    'VIEW':1.0,
    'LIKE':2.0,
    'BOOKMARK':3.0,
    'FOLLOW':4.0,
    'COMMENT CREATED':5.0
}
user_interaction['eventStrength']=user_interaction['eventType'].apply(lambda x:event_type_strength[x])

In [8]:

users_interactions_count_df=user_interaction.groupby(['personId','contentId']).size().groupby('personId').size()
print('# users:',len(users_interactions_count_df))
users_with_enough_interactions_df=users_interactions_count_df[users_interactions_count_df>=5].reset_index()[['personId']]
print('# users with at least 5interactions: %d'%len(users_with_enough_interactions_df))

# users: 1895
# users with at least 5interactions: 1140


In [9]:
print('#of interaction %d'%len(user_interaction))
interactions_from_selected_users_df=user_interaction.merge(users_with_enough_interactions_df,how='right',left_on='personId',right_on='personId')
print('#of interactions from users with at least 5 interactions %d'%len(interactions_from_selected_users_df))

#of interaction 72312
#of interactions from users with at least 5 interactions 69868


In [10]:
def smooth_user_preference(x):
  return math.log(1+x,2)
interactions_full_df=interactions_from_selected_users_df\
                      .groupby(['personId','contentId'])["eventStrength"].sum()\
                      .apply(smooth_user_preference).reset_index()
print('# of unique user/item interactions:%d '% len(interactions_full_df))
interactions_full_df.head(10)

# of unique user/item interactions:39106 


Unnamed: 0,personId,contentId,eventStrength
0,-9223121837663643404,-8949113594875411859,1.0
1,-9223121837663643404,-8377626164558006982,1.0
2,-9223121837663643404,-8208801367848627943,1.0
3,-9223121837663643404,-8187220755213888616,1.0
4,-9223121837663643404,-7423191370472335463,3.169925
5,-9223121837663643404,-7331393944609614247,1.0
6,-9223121837663643404,-6872546942144599345,1.0
7,-9223121837663643404,-6728844082024523434,1.0
8,-9223121837663643404,-6590819806697898649,1.0
9,-9223121837663643404,-6558712014192834002,1.584963


In [11]:
interactions_train_df, interactions_test_df = train_test_split(
    interactions_full_df,
    stratify=interactions_full_df['personId'],
    test_size=0.20,
    random_state=42
)

print('# interactions on train set:%d'%len(interactions_train_df))
print('# interactions on test set:%d'%len(interactions_test_df))

# interactions on train set:31284
# interactions on test set:7822


In [12]:
#indexing by personId to speed up the searcges during evaluation
interactions_full_indexed_df=interactions_full_df.set_index('personId')
interactions_train_indexed_df=interactions_train_df.set_index('personId')
interactions_test_indexed_df=interactions_test_df.set_index('personId')

Obtaining vector embeddingd for each word in our corpus

In [13]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:


from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignoring stopwords from English and Portuguese
stopwords_list = stopwords.words('english') + stopwords.words('portuguese')

# Train a TF-IDF model whose vector size is composed of the main unigrams and bigrams, ignoring stopwords
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),  # Unigrams and Bigrams
    min_df=1,  # Minimum document frequency (set to 1, meaning term must appear in at least one document)
    stop_words=stopwords_list  # Stopwords list
)

# Assuming 'shared_articles' has columns 'contentId', 'title', and 'text'
item_ids = shared_articles['contentId'].tolist()

# Create the TF-IDF matrix using 'title' and 'text'
tfidf_matrix = vectorizer.fit_transform(shared_articles['title'] + ' ' + shared_articles['text'])

# Get the feature names (terms) from the TF-IDF model
tfidf_features_names = vectorizer.get_feature_names_out()

# Print the TF-IDF matrix (this will be a sparse matrix)
print(tfidf_matrix)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2337998 stored elements and shape (3047, 1168041)>
  Coords	Values
  (0, 379072)	0.4808658966401115
  (0, 1123612)	0.03454806183866551
  (0, 269553)	0.0630031512731233
  (0, 357049)	0.02659771146412773
  (0, 1068062)	0.028234267994671866
  (0, 897583)	0.017655947302593348
  (0, 134043)	0.25362518705278064
  (0, 1150152)	0.013627809815296696
  (0, 993816)	0.022876715632271647
  (0, 339926)	0.01008666709435639
  (0, 428539)	0.013247279646368241
  (0, 448562)	0.009262819024473951
  (0, 836274)	0.022030553065580865
  (0, 1117649)	0.019514894768534023
  (0, 968785)	0.04093249350893952
  (0, 860573)	0.031300570205181566
  (0, 872179)	0.03657783931487708
  (0, 1016274)	0.09061069509857472
  (0, 255882)	0.021648209798891135
  (0, 404042)	0.024660361994598524
  (0, 1029491)	0.011431233963428156
  (0, 596891)	0.014539077694479534
  (0, 815295)	0.030405779362382902
  (0, 1025334)	0.02682332774031942
  (0, 638461)	0.040465746305358104
 

In [15]:
stopwords_list


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [16]:
import numpy as np
import scipy.sparse
from sklearn.preprocessing import normalize

def get_item_profile(item_id):
    # Convert item_id to a list if it's not already
    if not isinstance(item_id, list):
        item_id = [item_id]

    # Get indices for all item_ids in the list
    idx = [item_ids.index(x) for x in item_id if x in item_ids]

    # If any item_id is not found, idx will be empty, so return an empty sparse matrix
    if not idx:
        return scipy.sparse.csr_matrix((1, tfidf_matrix.shape[1]))

    # Stack the item profiles for all item_ids
    item_profile = scipy.sparse.vstack([tfidf_matrix[i:i+1] for i in idx])

    return item_profile

def get_item_profiles(ids):
    item_profiles_list = [get_item_profile(x) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

def build_users_profile(person_id, interactions_indexed_df):
    interactions_person_df = interactions_indexed_df.loc[person_id]
    user_item_profiles = get_item_profile(interactions_person_df['contentId'].tolist())
    user_item_strengths = np.array(interactions_person_df['eventStrength']).reshape(-1, 1)

    # Compute weighted average of item profiles
    weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)

    # Ensure it's a numpy array and flatten it
    weighted_avg = np.asarray(weighted_avg).flatten()

    # Normalize the vector
    user_profile_nom = normalize(weighted_avg.reshape(1, -1))

    return user_profile_nom.flatten()

def build_users_profiles():
    interaction_indexed_df = interactions_full_df[interactions_full_df['contentId'].isin(shared_articles['contentId'])].set_index('personId')
    user_profiles = {}
    for person_id in interaction_indexed_df.index.unique():
        user_profiles[person_id] = build_users_profile(person_id, interaction_indexed_df)
    return user_profiles

# Build the user profiles
user_profiles = build_users_profiles()
len(user_profiles)


1140

In [17]:
print("Nombre d'utilisateurs dans interactions_test_indexed_df:", len(interactions_test_indexed_df.index.unique()))
print("Exemple d'IDs d'utilisateurs:", list(interactions_test_indexed_df.index.unique())[:5])
print("Nombre de profils utilisateurs créés:", len(user_profiles))
print("Exemple de user_profiles.keys():", list(user_profiles.keys())[:5])


Nombre d'utilisateurs dans interactions_test_indexed_df: 1140
Exemple d'IDs d'utilisateurs: [-830175562779396891, -7267769888748948232, -3535274684588209118, 3094513233385472738, -3390049372067052505]
Nombre de profils utilisateurs créés: 1140
Exemple de user_profiles.keys(): [-9223121837663643404, -9212075797126931087, -9207251133131336884, -9199575329909162940, -9196668942822132778]


In [18]:
user_profiles

{-9223121837663643404: array([0.00524108, 0.00200939, 0.        , ..., 0.        , 0.        ,
        0.        ]),
 -9212075797126931087: array([0., 0., 0., ..., 0., 0., 0.]),
 -9207251133131336884: array([0.00858753, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]),
 -9199575329909162940: array([0., 0., 0., ..., 0., 0., 0.]),
 -9196668942822132778: array([0., 0., 0., ..., 0., 0., 0.]),
 -9188188261933657343: array([0.02175721, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]),
 -9172914609055320039: array([0.01894135, 0.00319087, 0.        , ..., 0.        , 0.        ,
        0.        ]),
 -9156344805277471150: array([0., 0., 0., ..., 0., 0., 0.]),
 -9120685872592674274: array([0., 0., 0., ..., 0., 0., 0.]),
 -9109785559521267180: array([0., 0., 0., ..., 0., 0., 0.]),
 -9063420486253202900: array([0.00666096, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]),
 -9060214117327732109: array([0., 0., 0., ..., 0.

In [19]:
user_profile=user_profiles[-1479311724257856983]
print(user_profile.shape)

(1168041,)


In [20]:
pd.DataFrame(sorted(zip(tfidf_features_names, user_profiles[-1479311724257856983].flatten().tolist()), key=lambda x: -x[1])[:20],
             columns=['token', 'relevance'])

Unnamed: 0,token,relevance
0,learning,0.276838
1,machine learning,0.236177
2,machine,0.22561
3,google,0.186366
4,data,0.157204
5,ai,0.119854
6,graph,0.095519
7,algorithms,0.089
8,like,0.082494
9,new,0.076572


In [21]:
# Assuming `user_profile` is the user profile vector after normalization
top_tokens = sorted(zip(tfidf_features_names, user_profile.tolist()), key=lambda x: x[1], reverse=True)[:20]

# Display the top 20 tokens with the highest relevance scores
pd.DataFrame(top_tokens, columns=['token', 'relevance'])


Unnamed: 0,token,relevance
0,learning,0.276838
1,machine learning,0.236177
2,machine,0.22561
3,google,0.186366
4,data,0.157204
5,ai,0.119854
6,graph,0.095519
7,algorithms,0.089
8,like,0.082494
9,new,0.076572


In [22]:
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)  # Should be (N, 5000) where 5000 is the feature size


TF-IDF Matrix Shape: (3047, 1168041)


Class for content-based filtring

In [23]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

class ContentBasedRecommender:
    MODEL_NAME = 'Content-Based'

    def __init__(self, items_df=None):
        self.items_df = items_df

    def get_model_name(self):
        return self.MODEL_NAME

    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        # Vérifier que user_profiles et tfidf_matrix sont définis
        if person_id not in user_profiles:
            raise ValueError(f"L'utilisateur {person_id} n'a pas de profil.")

        # Reshape user profile into a 2D array with a single row
        user_profile_2d = user_profiles[person_id].reshape(1, -1)  # Reshape to (1, 1168041)

        # Calcul de la similarité cosinus using the reshaped user profile
        cosine_similarities = cosine_similarity(user_profile_2d, tfidf_matrix)

        # Sélection des indices des items les plus similaires
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]

        # Trier les items par score de similarité décroissant
        similar_items = sorted(
            [(item_ids[i], cosine_similarities[0, i]) for i in similar_indices],
            key=lambda x: -x[1]
        )

        return similar_items

    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        similar_items = self._get_similar_items_to_user_profile(user_id)

        # Filtrer les items ignorés
        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))

        # Création d'un DataFrame des recommandations
        recommendations_df = pd.DataFrame(similar_items_filtered, columns=['contentId', 'recStrength']).head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" est requis en mode verbose')
            recommendations_df = recommendations_df.merge(
                self.items_df, how='left', left_on='contentId', right_on='contentId'
            )[['recStrength', 'contentId', 'title', 'url', 'lang']]

        return recommendations_df
content_based_recommendation_model = ContentBasedRecommender(shared_articles)


In [24]:
shared_articles.shape

(3047, 13)

Evaluation

In [26]:
def get_items_interacted(person_id, interactions_df):
  interacted_items=interactions_df.loc[person_id]['contentId']
  return set(interacted_items if type(interacted_items)==pd.Series else [interacted_items])

In [27]:
import random
import pandas as pd

# Top N Accuracy Metrics Consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:
    # Fonction pour obtenir les articles non interagis
    def get_not_interacted_items_samples(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(shared_articles['contentId'])

        non_interacted_items = all_items - interacted_items

        if len(non_interacted_items) < sample_size:
            raise ValueError(f"Pas assez d'articles non interagis pour {person_id}. Disponible : {len(non_interacted_items)}")

        random.seed(seed)
        non_interacted_items_sample = random.sample(list(non_interacted_items), sample_size)
        return set(non_interacted_items_sample)

    # Vérifie si un article est dans le top-N des recommandations
    def _verify_hit_top_n(self, item_id, recommended_items, topn):
        try:
            index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            hit = int(index in range(0, topn))
        except StopIteration:
            hit = 0
            index = -1
        return hit, index

    # Évaluer les performances du modèle pour un utilisateur
    def evaluate_model_for_user(self, model, person_id):
        # Vérification si l'utilisateur existe dans le test set
        if person_id not in interactions_test_indexed_df.index:
            return None

        interacted_values_testset = interactions_test_indexed_df.loc[person_id]

        if isinstance(interacted_values_testset['contentId'], pd.Series):
            person_interacted_items_testset = set(interacted_values_testset['contentId'])
        else:
            person_interacted_items_testset = {int(interacted_values_testset['contentId'])}

        interacted_items_count_testset = len(person_interacted_items_testset)

        # Obtenir les recommandations du modèle
        person_recs_df = model.recommend_items(
            person_id,
            items_to_ignore=get_items_interacted(person_id, interactions_train_indexed_df),
            topn=10000
        )

        hits_at_5_count = 0
        hits_at_10_count = 0

        for item_id in person_interacted_items_testset:
            non_interacted_items_sample = self.get_not_interacted_items_samples(
                person_id,
                sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS,
                seed=item_id % (2 ** 32)
            )

            items_to_filter_recs = non_interacted_items_sample.union({item_id})
            valid_recs_df = person_recs_df[person_recs_df['contentId'].isin(items_to_filter_recs)]
            valid_recs = valid_recs_df['contentId'].values

            hit_at_5, _ = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5

            hit_at_10, _ = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {
            'hits@5_count': hits_at_5_count,
            'hits@10_count': hits_at_10_count,
            'interacted_count': interacted_items_count_testset,
            'recall@5': recall_at_5,
            'recall@10': recall_at_10
        }

        return person_metrics

    # Évaluer les performances du modèle globalement
    def evaluate_model(self, model):
        people_metrics = []

        for idx, person_id in enumerate(interactions_test_indexed_df.index.unique()):
            person_metrics = self.evaluate_model_for_user(model, person_id)
            if person_metrics:
                person_metrics['_person_id'] = person_id
                people_metrics.append(person_metrics)

        print(f"{idx} utilisateurs traités")

        detailed_results_df = pd.DataFrame(people_metrics).sort_values('interacted_count', ascending=False)

        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())

        global_metrics = {
            'modelName': model.get_model_name(),
            'recall@5': global_recall_at_5,
            'recall@10': global_recall_at_10
        }

        return global_metrics, detailed_results_df

# Instanciation de l'évaluateur
model_evaluator = ModelEvaluator()


In [28]:
print('evaluating content-based model...')
cb_global_metrics,cb_detailed_results_df=model_evaluator.evaluate_model(content_based_recommendation_model)
print('\ncontent-based global metrics:\n%s'%cb_global_metrics)

evaluating content-based model...
1139 utilisateurs traités

content-based global metrics:
{'modelName': 'Content-Based', 'recall@5': np.float64(0.6132702633597545), 'recall@10': np.float64(0.6981590386090514)}


In [29]:
cb_detailed_results_df.head(10)

Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
76,22,37,192,0.114583,0.192708,3609194402293569455
17,22,36,134,0.164179,0.268657,-2626634673110551643
16,20,39,130,0.153846,0.3,-1032019229384696495
10,38,53,117,0.324786,0.452991,-1443636648652872475
82,15,29,88,0.170455,0.329545,-2979881261169775358
161,18,28,80,0.225,0.35,-3596626804281480007
65,15,24,73,0.205479,0.328767,1116121227607581999
81,9,26,69,0.130435,0.376812,692689608292948411
106,9,14,69,0.130435,0.202899,-9016528795238256703
52,9,21,68,0.132353,0.308824,3636910968448833585
