In [None]:
# Recommender Systems in Python 101
# https://www.kaggle.com/gspmoreira/recommender-systems-in-python-101
# tutorial on Recommender Systems
# specifically how to implement Collaborative Filtering, Content Based Filtering and Hybrid Methods
# in Python for the task of providing personalized recommendations to the user

In [47]:
import pandas as pd
import os
import math

In [7]:
# Shared articles
# Load the articles shared on the platform and keep only the rows whose
# eventType is 'CONTENT SHARED'.
#
# The data directory can be overridden with the ARTICLES_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original location is used.
# os.path.join(..., '') guarantees a trailing separator, because `path` is
# concatenated directly with file names elsewhere in the notebook.
path = os.path.join(
    os.environ.get('ARTICLES_DATA_DIR', '/Users/pliu/Downloads/articles/'),
    '',
)
# NOTE: the `acticles_df` spelling is kept on purpose; other cells may use it.
acticles_df = pd.read_csv(path+'shared_articles.csv')
acticles_df = acticles_df[acticles_df['eventType'] == 'CONTENT SHARED']
acticles_df.shape

(3047, 13)

In [9]:
# User interactions
# Log of the interactions users performed on articles; it can be joined with
# the articles table on contentId.
# eventType values: VIEW, LIKE, COMMENT CREATED, FOLLOW, BOOKMARK
interactions_csv = path + 'users_interactions.csv'
interactions_df = pd.read_csv(interactions_csv)
interactions_df.shape

(72312, 8)

In [14]:
# Number of logged interactions per event type, most frequent first.
interactions_df.groupby('eventType').size().sort_values(ascending=False)

eventType
VIEW               61086
LIKE                5745
BOOKMARK            2463
COMMENT CREATED     1611
FOLLOW              1407
dtype: int64

In [16]:
# Weight each event type by the level of engagement it is taken to signal.
event_type_strength = {
    'VIEW': 1.0,
    'LIKE': 2.0,
    'BOOKMARK': 2.5,
    'COMMENT CREATED': 3.0,
    'FOLLOW': 4.0,
}
# Translate every row's eventType into its numeric weight and store it as a
# new column; event types missing from the mapping end up as NaN.
strength_per_row = interactions_df['eventType'].map(event_type_strength)
interactions_df['eventStrength'] = strength_per_row
# Row count per weight, largest group first.
interactions_df.groupby('eventStrength').size().sort_values(ascending=False)

eventStrength
1.0    61086
2.0     5745
2.5     2463
3.0     1611
4.0     1407
dtype: int64

In [45]:
# Select the users who interacted with at least 5 distinct articles
# (the filter below is `>= 5`).
# Collapsing to one row per (personId, contentId) pair first makes the second
# groupby count distinct articles per user rather than raw events.
user_item_pairs = interactions_df.groupby(['personId', 'contentId']).size()
count_of_content_per_user = user_item_pairs.groupby('personId').size()
print(f'num of users have interactions: {len(count_of_content_per_user)}')
has_enough_items = count_of_content_per_user >= 5
# Keep only the personId column so the result can be used as a join key table.
count_of_content_per_user = (
    count_of_content_per_user[has_enough_items]
    .reset_index()[['personId']]
)
print(f'num of users have enough interactions: {len(count_of_content_per_user)}')

num of users have interactions: 1895
num of users have enough interactions: 1140


In [46]:
# Keep only the interactions that belong to the users listed in
# count_of_content_per_user.
# A right join on personId drops every interaction whose user is not in that
# table.
print(f'num of interactions_df: {len(interactions_df)}')
interactions_df_selected = pd.merge(
    interactions_df,
    count_of_content_per_user,
    how='right',
    on='personId',
)
print(f'num of interactions of selected users that more than 5 interactions: {len(interactions_df_selected)}')


num of interactions_df: 72312
num of interactions of selected users that more than 5 interactions: 69868


In [48]:
""" model the user's interest on a given item
 aggregate all the interactions the user has performed in an item
 by a WEIGHTED sum of interaction type strength
 and apply a log transformation to smooth the distribution
 specifically: 
     1. group interactions by user and content 
     2. sum event type score
     3. log trans the score
"""
# Sum the event weights of each (personId, contentId) pair ...
summed_strength = (
    interactions_df_selected
    .groupby(['personId', 'contentId'])['eventStrength']
    .sum()
)
# ... then smooth each total with log2(1 + total) and flatten the index back
# into regular columns.
interactions_user_item_score = summed_strength.apply(
    lambda total: math.log(1 + total, 2)
).reset_index()
print(f'num of unique user-item pair interactions: {len(interactions_user_item_score)}')


num of unique user-item pair interactions: 39106


In [49]:
# Preview the first three user-item scores.
interactions_user_item_score.head(n=3)

Unnamed: 0,personId,contentId,eventStrength
0,-9223121837663643404,-8949113594875411859,1.0
1,-9223121837663643404,-8377626164558006982,1.0
2,-9223121837663643404,-8208801367848627943,1.0


In [None]:
# Evaluation