In [1]:
import nltk 
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn

from nltk.corpus import stopwords 

from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler

In [2]:
class ContentBasedRecommender:
    """Recommend items whose TF-IDF profile is closest to a user's profile.

    Similarity is computed with ``cosine_similarity`` between the stored
    profile of the requested user and every row of ``tfidf_matrix``.

    Args:
        items_df: Optional item metadata frame; only needed for
            ``verbose=True``, where it must provide the columns
            'contentId', 'title', 'url' and 'lang'.
        item_ids: Sequence mapping a row position of ``tfidf_matrix`` to the
            item identifier reported in the recommendations.
        user_profiles: Mapping from user id to that user's profile vector.
        tfidf_matrix: Item profile matrix, one row per entry of ``item_ids``.
    """

    MODEL_NAME = 'Content-Based'

    # Smallest candidate pool fetched before the ignore filter is applied.
    _MIN_CANDIDATES = 1000

    def __init__(self, items_df=None, item_ids=None, user_profiles=None, tfidf_matrix=None):
        self.item_ids = item_ids
        self.items_df = items_df
        self.user_profiles = user_profiles
        self.tfidf_matrix = tfidf_matrix

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        """Return up to ``topn`` ``(item_id, similarity)`` pairs, best first.

        Raises:
            KeyError: If ``person_id`` has no entry in ``user_profiles``.
        """
        # Cosine similarity between the user profile and all item profiles.
        cosine_similarities = cosine_similarity(self.user_profiles[person_id], self.tfidf_matrix)
        # argsort is ascending, so the best matches are the last ``topn`` positions.
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        # Re-order those candidates from most to least similar.
        similar_items = sorted(
            [(self.item_ids[i], cosine_similarities[0, i]) for i in similar_indices],
            key=lambda x: -x[1],
        )
        return similar_items

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Return the ``topn`` most similar items the user should not ignore.

        Args:
            user_id: Key into ``user_profiles``.
            items_to_ignore: Optional iterable of hashable item ids to leave
                out (typically items already interacted with). ``None`` means
                nothing is ignored.
            topn: Maximum number of rows to return.
            verbose: When True, join item metadata from ``items_df``.

        Returns:
            pandas.DataFrame with columns 'contentId' and 'recStrength', or
            'recStrength', 'contentId', 'title', 'url', 'lang' in verbose mode.

        Raises:
            KeyError: If ``user_id`` has no profile.
            Exception: If ``verbose`` is True but ``items_df`` was not given.
        """
        # A set gives constant-time membership tests and avoids a shared
        # mutable default argument.
        ignored = set(items_to_ignore) if items_to_ignore is not None else set()

        # Fetch enough candidates that ``topn`` rows survive even if every
        # ignored item ranks at the top; a fixed pool of 1000 would silently
        # return too few rows for long ignore lists or large ``topn``.
        candidate_count = max(self._MIN_CANDIDATES, topn + len(ignored))
        similar_items = self._get_similar_items_to_user_profile(user_id, topn=candidate_count)

        similar_items_filtered = [
            (item_id, strength) for item_id, strength in similar_items if item_id not in ignored
        ]

        recommendations_df = pd.DataFrame(similar_items_filtered, columns=['contentId', 'recStrength']) \
                                    .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left',
                                                          left_on = 'contentId',
                                                          right_on = 'contentId')[['recStrength', 'contentId', 'title', 'url', 'lang']]

        return recommendations_df

In [4]:
class CFRecommender:
    """Recommend items from a precomputed table of predicted strengths.

    Args:
        cf_predictions_df: Frame with one column per user id and one row per
            item; the row index holds the item ids.
        items_df: Optional item metadata frame; only needed for
            ``verbose=True``, where it must provide the columns
            'contentId', 'title', 'url' and 'lang'.
    """

    MODEL_NAME = 'Collaborative Filtering'

    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Return the ``topn`` highest predicted items for ``user_id``.

        Args:
            user_id: Column label of ``cf_predictions_df`` to read.
            items_to_ignore: Optional list-like of item ids to leave out.
                ``None`` means nothing is ignored.
            topn: Maximum number of rows to return.
            verbose: When True, join item metadata from ``items_df``.

        Returns:
            pandas.DataFrame with columns 'contentId' and 'recStrength', or
            'recStrength', 'contentId', 'title', 'url', 'lang' in verbose mode.

        Raises:
            KeyError: If ``user_id`` is not a column of ``cf_predictions_df``.
            Exception: If ``verbose`` is True but ``items_df`` was not given.
        """
        # Avoid a shared mutable default argument.
        if items_to_ignore is None:
            items_to_ignore = []

        # Name the index explicitly so the item ids always land in a
        # 'contentId' column, whatever the input index was called.
        # One stable sort is enough; the filter below keeps the order.
        sorted_user_predictions = (
            self.cf_predictions_df[user_id]
            .rename_axis('contentId')
            .sort_values(ascending=False, kind='mergesort')
            .reset_index(name='recStrength')
        )

        # Keep the best predictions among the items that are not ignored.
        not_ignored = ~sorted_user_predictions['contentId'].isin(items_to_ignore)
        recommendations_df = sorted_user_predictions[not_ignored].head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left',
                                                          left_on = 'contentId',
                                                          right_on = 'contentId')[['recStrength', 'contentId', 'title', 'url', 'lang']]

        return recommendations_df

In [None]:
class PopularityRecommender:
    """Recommend the same most-popular items to every user.

    Args:
        popularity_df: Frame with at least the columns 'contentId' and
            'eventStrength'.
        items_df: Optional item metadata frame; only needed for
            ``verbose=True``, where it must provide the columns
            'contentId', 'title', 'url' and 'lang'.
    """

    MODEL_NAME = 'Popularity'

    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Return the ``topn`` strongest items that are not ignored.

        Args:
            user_id: Accepted for interface parity with the other
                recommenders; the ranking does not use it.
            items_to_ignore: Optional list-like of item ids to leave out.
                ``None`` means nothing is ignored.
            topn: Maximum number of rows to return.
            verbose: When True, join item metadata from ``items_df``.

        Returns:
            pandas.DataFrame: the surviving rows of ``popularity_df`` sorted
            by 'eventStrength' descending, or the columns 'eventStrength',
            'contentId', 'title', 'url', 'lang' in verbose mode.

        Raises:
            Exception: If ``verbose`` is True but ``items_df`` was not given.
        """
        # Avoid a shared mutable default argument; ``isin`` needs a list-like.
        if items_to_ignore is None:
            items_to_ignore = []

        # Most popular items first, skipping the ignored ones.
        not_ignored = ~self.popularity_df['contentId'].isin(items_to_ignore)
        recommendations_df = self.popularity_df[not_ignored] \
                               .sort_values('eventStrength', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left',
                                                          left_on = 'contentId',
                                                          right_on = 'contentId')[['eventStrength', 'contentId', 'title', 'url', 'lang']]

        return recommendations_df



def run_starting_code(articles_df, interactions_df):
    """Define the starter helper functions for profile building.

    As written, this function only *defines* the nested helpers below; it
    calls none of them and returns ``None``. The helpers refer to
    ``item_ids``, ``tfidf_matrix`` and ``interactions_full_df``, which are not
    created inside this function and are looked up in the enclosing/module
    scope when a helper is actually called.

    Args:
        articles_df: Articles frame; ``build_users_profiles`` reads its
            'contentId' column.
        interactions_df: Not used by the outer body; ``get_items_interacted``
            takes its own parameter of the same name.
    """

    def smooth_user_preference(x):
        """Return log2(1 + x)."""
        return math.log(1+x, 2)

    def get_items_interacted(person_id, interactions_df):
        """Return the set of 'contentId' values in the rows labelled ``person_id``."""
        # A list label always yields a Series, even when the user has a single
        # row, so no scalar special case is needed.
        interacted_items = interactions_df.loc[[person_id], 'contentId']
        return set(interacted_items)

    def get_item_profile(item_id):
        """Return the one-row slice of ``tfidf_matrix`` for ``item_id``."""
        idx = item_ids.index(item_id)
        item_profile = tfidf_matrix[idx:idx+1]
        return item_profile

    def get_item_profiles(ids):
        """Stack the item profiles of ``ids`` into one sparse matrix."""
        item_profiles_list = [get_item_profile(x) for x in ids]
        item_profiles = scipy.sparse.vstack(item_profiles_list)
        return item_profiles

    def build_users_profile(person_id, interactions_indexed_df):
        """Return the normalised, strength-weighted average profile of one user."""
        # ``.loc[[person_id]]`` keeps a DataFrame even for a user with exactly
        # one interaction; ``.loc[person_id]`` would collapse that row into a
        # Series and make 'contentId' a scalar that cannot be iterated.
        interactions_person_df = interactions_indexed_df.loc[[person_id]]
        user_item_profiles = get_item_profiles(interactions_person_df['contentId'])

        # Column vector so it multiplies row-wise against the item profiles.
        user_item_strengths = np.array(interactions_person_df['eventStrength']).reshape(-1,1)
        # Weighted average of item profiles by the interaction strength.
        user_item_strengths_weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)
        # Convert to a plain array before handing the vector to sklearn.
        user_profile_norm = sklearn.preprocessing.normalize(np.array(user_item_strengths_weighted_avg))
        return user_profile_norm

    def build_users_profiles():
        """Return ``{person_id: profile}`` for every user with a known article."""
        # Only interactions whose article appears in ``articles_df`` can be
        # mapped to an item profile.
        interactions_indexed_df = interactions_full_df[interactions_full_df['contentId'] \
                                                       .isin(articles_df['contentId'])].set_index('personId')
        user_profiles = {}
        for person_id in interactions_indexed_df.index.unique():
            user_profiles[person_id] = build_users_profile(person_id, interactions_indexed_df)
        return user_profiles

In [7]:
# Read the raw interaction log and the shared-articles catalogue.
interactions_df = pd.read_csv("/kaggle/input/lab7dataset/ccai422_lab07_data_users_interactions.csv")
articles_df = pd.read_csv("/kaggle/input/lab7dataset/ccai422_lab07_data_shared_articles.csv")

# Sanity check: show the first ten rows of each frame.
print(interactions_df.head(10))
print(articles_df.head(10))

# Weight assigned to each kind of interaction.
event_type_strength = {
    'VIEW': 1.0,
    'LIKE': 2.0,
    'BOOKMARK': 2.5,
    'FOLLOW': 3.0,
    'COMMENT CREATED': 4.0
}

# Translate every event into its weight; event types missing from the mapping weigh 0.
interactions_df['eventStrength'] = interactions_df['eventType'].apply(
    lambda event_type: event_type_strength.get(event_type, 0)
)

# Number of distinct articles each user interacted with ...
users_interactions_count_df = (
    interactions_df
    .groupby(['personId', 'contentId']).size()
    .groupby('personId').size()
)
# ... keeping only users who touched at least 5 different articles.
users_with_enough_interactions_df = (
    users_interactions_count_df
    .loc[lambda counts: counts >= 5]
    .reset_index(name='interactionCount')
)

# Restrict the interaction log to those users.
interactions_from_selected_users_df = interactions_df.merge(
    users_with_enough_interactions_df[['personId']],
    how='inner',
    on='personId',
)

# One row per (user, article) pair holding the summed interaction weight.
interactions_full_df = (
    interactions_from_selected_users_df
    .groupby(['personId', 'contentId'])['eventStrength']
    .sum()
    .reset_index()
)

# Total weight received by each article, strongest first.
item_popularity_df = (
    interactions_full_df
    .groupby('contentId')['eventStrength']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

# English and Portuguese stop words are both removed during vectorization.
stopwords_list = [*stopwords.words('english'), *stopwords.words('portuguese')]

# Fit a unigram + bigram TF-IDF model on each article's title followed by its text.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=0.003,
    max_df=0.5,
    max_features=5000,
    stop_words=stopwords_list,
)
tfidf_matrix = vectorizer.fit_transform(articles_df['title'] + " " + articles_df['text'])
tfidf_feature_names = vectorizer.get_feature_names_out()

# User profiles creation
def build_users_profiles(interactions=None, articles=None, item_matrix=None):
    """Build one profile vector per user by summing the rows of their items.

    Args:
        interactions: Frame with 'personId' and 'contentId' columns. Defaults
            to the module-level ``interactions_full_df``.
        articles: Frame whose 'contentId' column lists the items in the same
            row order as ``item_matrix``. Defaults to the module-level
            ``articles_df``.
        item_matrix: Item profile matrix (sparse or dense), one row per row
            of ``articles``. Defaults to the module-level ``tfidf_matrix``.

    Returns:
        dict: ``{person_id: profile}`` where each profile is a plain
        ``numpy.ndarray`` of shape ``(1, n_features)``. Users with no
        interaction on a known article get no entry.
    """
    interactions = interactions_full_df if interactions is None else interactions
    articles = articles_df if articles is None else articles
    item_matrix = tfidf_matrix if item_matrix is None else item_matrix

    # Map each contentId to its row *position* in the item matrix, built once
    # instead of scanning the articles frame for every interaction. Positions
    # (not index labels) are what the matrix is addressed by; the first
    # occurrence of a duplicated contentId wins.
    row_of_content = {}
    for position, content_id in enumerate(articles['contentId']):
        row_of_content.setdefault(content_id, position)

    # Interactions on items absent from ``articles`` have no matrix row.
    known_interactions = interactions[interactions['contentId'].isin(articles['contentId'])]

    user_profiles = {}
    # sort=False keeps users in order of first appearance.
    for person_id, person_rows in known_interactions.groupby('personId', sort=False):
        rows = [row_of_content[content_id] for content_id in person_rows['contentId']]
        # asarray + reshape turns a sparse-sum np.matrix (or a dense 1-D sum)
        # into a regular (1, n_features) ndarray.
        user_profiles[person_id] = np.asarray(item_matrix[rows].sum(axis=0)).reshape(1, -1)
    return user_profiles

# Build the profile of every user.
user_profiles = build_users_profiles()

# Use the first user in the dictionary as a worked example and flatten that
# user's profile into a 1-D numeric array.
user_id_example = list(user_profiles)[0]
example_profile = np.array(user_profiles[user_id_example]).flatten()

# Pair each TF-IDF token with its weight in the profile and keep the 20
# heaviest ones (a descending stable sort, so ties keep vocabulary order).
sorted_tokens = sorted(
    zip(tfidf_feature_names, example_profile),
    key=lambda token_weight: token_weight[1],
    reverse=True,
)[:20]

# Show the strongest tokens together with their relevance.
print("Top tokens for user profile:\n", pd.DataFrame(sorted_tokens, columns=['token', 'relevance']))

# Users x items matrix of summed interaction strength (0 where a user never
# interacted with an item).
users_items_pivot_matrix_df = interactions_full_df.pivot(index='personId', columns='contentId', values='eventStrength').fillna(0)
users_items_pivot_sparse_matrix = csr_matrix(users_items_pivot_matrix_df.values)

# Truncated matrix factorization: keep only the strongest latent factors.
# A full SVD would reproduce the input matrix exactly, so unseen items would
# keep a predicted strength of 0 and nothing new could be recommended.
NUMBER_OF_FACTORS_MF = 15
# svds needs k strictly below the smaller matrix dimension.
n_factors = min(NUMBER_OF_FACTORS_MF, min(users_items_pivot_sparse_matrix.shape) - 1)
U, sigma, Vt = svds(users_items_pivot_sparse_matrix, k=n_factors)
sigma = np.diag(sigma)

# Low-rank reconstruction = predicted strength for every (user, item) pair.
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)
# NOTE(review): rows are users and columns are items here, whereas
# CFRecommender.recommend_items reads `cf_predictions_df[user_id]` (users as
# columns). Transpose before handing this frame to CFRecommender - confirm
# against the downstream usage.
cf_preds_df = pd.DataFrame(all_user_predicted_ratings, index=users_items_pivot_matrix_df.index, columns=users_items_pivot_matrix_df.columns)

# Return models
def create_models():
    """Bundle the module-level model artefacts into one dictionary.

    Returns:
        dict: ``item_popularity_df`` under 'popularity_model',
        ``cf_preds_df`` under 'cf_recommender' and ``user_profiles`` under
        'content_based_recommender'. The three names are looked up in module
        scope when the function is called.
    """
    model_registry = {}
    model_registry['popularity_model'] = item_popularity_df
    model_registry['cf_recommender'] = cf_preds_df
    model_registry['content_based_recommender'] = user_profiles
    return model_registry

# Collect the popularity table, CF predictions and user profiles into `models`,
# then report completion.
models = create_models()
print("Models created successfully.")


    timestamp eventType            contentId             personId  \
0  1465413032      VIEW -3499919498720038879 -8845298781299428018   
1  1465412560      VIEW  8890720798209849691 -1032019229384696495   
2  1465416190      VIEW   310515487419366995 -1130272294246983140   
3  1465413895    FOLLOW   310515487419366995   344280948527967603   
4  1465412290      VIEW -7820640624231356730  -445337111692715325   
5  1465413742      VIEW   310515487419366995 -8763398617720485024   
6  1465415950      VIEW -8864073373672512525  3609194402293569455   
7  1465415066      VIEW -1492913151930215984  4254153380739593270   
8  1465413762      VIEW   310515487419366995   344280948527967603   
9  1465413771      VIEW  3064370296170038610  3609194402293569455   

             sessionId                                          userAgent  \
0  1264196770339959068                                                NaN   
1  3621737643587579081  Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...   
2  263186