In [33]:
import sklearn  
import scipy  
import numpy as np  
import random  
import pandas as pd  
from scipy.sparse import csr_matrix  
from sklearn.model_selection import train_test_split  
from sklearn.metrics.pairwise import cosine_similarity  
from scipy.sparse.linalg import svds  
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.preprocessing import MinMaxScaler  
import matplotlib.pyplot as plt  
import math  

In [34]:
df_shared = pd.read_csv('Data\shared_articles.csv')  
df_shareddf_shared = df_shared[df_shared['eventType'] == 'CONTENT SHARED']  
df_shared.head(5)  

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en


In [35]:
df_users = pd.read_csv('users_interactions.csv')
df_users.head(10)  

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,
5,1465413742,VIEW,310515487419366995,-8763398617720485024,1395789369402380392,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,MG,BR
6,1465415950,VIEW,-8864073373672512525,3609194402293569455,1143207167886864524,,,
7,1465415066,VIEW,-1492913151930215984,4254153380739593270,8743229464706506141,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR
8,1465413762,VIEW,310515487419366995,344280948527967603,-3167637573980064150,,,
9,1465413771,VIEW,3064370296170038610,3609194402293569455,1143207167886864524,,,


In [36]:
EventStrength = {  
   'VIEW': 1.0,  
   'LIKE': 2.0,  
   'BOOKMARK': 2.5,  
   'FOLLOW': 3.0,  
   'COMMENT CREATED': 4.0,    
}  
  
df_users['eventStrength'] = df_users['eventType'].apply(lambda x: EventStrength[x])  

In [37]:
df_user_Interaction = df_users.groupby(['personId', 'contentId']).size().groupby('personId').size()  
df_user_Int_Satisfied = df_user_Interaction[df_user_Interaction >=3].reset_index()[['personId']]  

In [38]:

df_int = df_users.merge(df_user_Int_Satisfied,  
               how = 'right',  
               left_on = 'personId',  
               right_on = 'personId')  


In [39]:
def preference_of_smooth_users(x):  
    return math.log(1+x, 2)  
     
df_Interactions = df_int.groupby(['personId', 'contentId'])['eventStrength'].sum().apply(preference_of_smooth_users).reset_index()  
df_Interactions.head(10)  

Unnamed: 0,personId,contentId,eventStrength
0,-9223121837663643404,-8949113594875411859,1.0
1,-9223121837663643404,-8377626164558006982,1.0
2,-9223121837663643404,-8208801367848627943,1.0
3,-9223121837663643404,-8187220755213888616,1.0
4,-9223121837663643404,-7423191370472335463,3.169925
5,-9223121837663643404,-7331393944609614247,1.0
6,-9223121837663643404,-6872546942144599345,1.0
7,-9223121837663643404,-6728844082024523434,1.0
8,-9223121837663643404,-6590819806697898649,1.0
9,-9223121837663643404,-6558712014192834002,1.584963


In [40]:
df_Interactions_train, df_Interactions_test = train_test_split(df_Interactions,  
                                   stratify=df_Interactions['personId'],  
                                   test_size=0.20,  
                                   random_state=42)  


In [41]:

df_Interactions_indexed = df_Interactions.set_index('personId')  
df_Interactions_train_indexed = df_Interactions_train.set_index('personId')  
df_Interactions_test_indexed = df_Interactions_test.set_index('personId')  
  
def getting_items_interacted(person_id, interaction_dataframe):  
    
    items_interacted = interaction_dataframe.loc[person_id]['contentId']  
    return set(items_interacted if type(items_interacted) == pd.Series else [items_interacted])  

In [42]:
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 150  
  
class ModelEvaluator:  
  
  
    def getting_not_interacted_samples(self, person_id, SS, seed=42):  
        items_interacted = getting_items_interacted(person_id, df_Interactions_indexed)  
        items_all = set(df_shared['contentId'])  
        items_not_interacted = items_all - items_interacted  
  
        random.seed(seed)  
        sample_non_interacted_items = random.sample(items_not_interacted, SS)  
        return set(sample_non_interacted_items)  
  
    def _to_verify_hit_top_n(self, item_id, items_recommended, topn):          
            try:  
                index = next(i for i, c in enumerate(items_recommended) if c == item_id)  
            except:  
                index = -1  
            hit = int(index in range(0, topn))  
            return hit, index  
  
    def model_evaluation_for_users(self, model, person_id):    
        interacted_testset_values = df_Interactions_test_indexed.loc[person_id]  
        if type(interacted_testset_values['contentId']) == pd.Series:  
            person_interacted_testset_items = set(interacted_testset_values['contentId'])  
        else:  
            person_interacted_testset_items = set([int(interacted_testset_values['contentId'])])    
        interated_testset_items_count = len(person_interacted_testset_items)  
        dataframe_person_recs = model.recommending_items(person_id, items_to_ignore=getting_items_interacted(person_id,  df_Interactions_train_indexed ), topn=10000000000)  
  
        hits_at_5_count = 0  
        hits_at_10_count = 0  

        for item_id in person_interacted_testset_items:  

            sample_non_interacted_items = self.getting_not_interacted_samples(person_id,  
                                                                          SS=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS,  
                                                                          seed=item_id%(2**32))  
 
            items_to_filter_recs = sample_non_interacted_items.union(set([item_id]))  
  
           
            dataframe_valid_recs = dataframe_person_recs[dataframe_person_recs['contentId'].isin(items_to_filter_recs)]                      
            valid_recs_ = dataframe_valid_recs['contentId'].values  

            hit_at_5, index_at_5 = self._to_verify_hit_top_n(item_id, valid_recs_, 5)  
            hits_at_5_count += hit_at_5  
            hit_at_10, index_at_10 = self._to_verify_hit_top_n(item_id, valid_recs_, 10)  
            hits_at_10_count += hit_at_10  
  

        recall_at_5 = hits_at_5_count / float(interated_testset_items_count)  
        recall_at_10 = hits_at_10_count / float(interated_testset_items_count)  
  
        person_metrics = {'hits@5_count':hits_at_5_count,  
                          'hits@10_count':hits_at_10_count,  
                          'interacted_count': interated_testset_items_count,  
                          'recall@5': recall_at_5,  
                          'recall@10': recall_at_10}  
        return person_metrics  
  
    def model_evaluation(self, model):  
  
        people_metrics = []  
        for idx, person_id in enumerate(list(df_Interactions_test_indexed.index.unique().values)):  
 
            person_metrics = self.model_evaluation_for_users(model, person_id)    
            person_metrics['_person_id'] = person_id  
            people_metrics.append(person_metrics)  
        print('%d users processed' % idx)  
  
        detailed_results_df = pd.DataFrame(people_metrics).sort_values('interacted_count', ascending=False)  
         
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())  
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())  
         
        global_metrics = {'modelName': model.getting_model_name(),  
                          'recall@5': global_recall_at_5,  
                          'recall@10': global_recall_at_10}      
        return global_metrics, detailed_results_df  
     
model_evaluator = ModelEvaluator()      

In [43]:
vectorizer = TfidfVectorizer(analyzer='word',  
                     ngram_range=(1, 2),  
                     min_df=0.003,  
                     max_df=0.5,  
                     max_features=5000)  
  
item_ids = df_shared['contentId'].tolist()  
tfidf_matrix = vectorizer.fit_transform(df_shared['title'] + "" + df_shared['text'])  
tfidf_feature_names = vectorizer.get_feature_names_out()  
tfidf_matrix  

<3122x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 970688 stored elements in Compressed Sparse Row format>

In [44]:
def getting_item_profiles(item_id):  
    idx = item_ids.index(item_id)  
    profile_item = tfidf_matrix[idx:idx+1]  
    return profile_item  
  
def getting_item_profiless(ids):  
    list_profiles_item = [getting_item_profiles(x) for x in ids]  
    profile_items = scipy.sparse.vstack(list_profiles_item)  
    return profile_items  
  
def building_user_profiles(person_id, dataframe_interaction_indexed):  
    df_users_person = dataframe_interaction_indexed.loc[person_id]  
    profiles_user_items = getting_item_profiless(df_users_person['contentId'])  
     
    user_item_strengths = np.array(df_users_person['eventStrength']).reshape(-1, 1)  
    user_item_strengths_weighted_avg = np.sum(profiles_user_items.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)  
    user_item_strengths_weighted_avg_array = np.asarray(user_item_strengths_weighted_avg)  # Convert to NumPy array
    user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths_weighted_avg_array)  
    return user_profile_norm  

def build_users_profiles():  
    dataframe_interaction_indexed = df_Interactions_train[
        df_Interactions_train['contentId'].isin(df_shared['contentId'])
    ].set_index('personId')  
    
    profiles_user = {}  
    for person_id in dataframe_interaction_indexed.index.unique():  
        profiles_user[person_id] = building_user_profiles(person_id, dataframe_interaction_indexed)  
    return profiles_user  

profiles_users = build_users_profiles()  
len(profiles_users)


1400

In [45]:
profiles_users

{5970634102483953353: array([[0.        , 0.        , 0.        , ..., 0.02252874, 0.00226584,
         0.        ]]),
 -6944500707172804068: array([[0.00683775, 0.01019839, 0.01786052, ..., 0.00486052, 0.00586242,
         0.00294074]]),
 6464364285286199002: array([[0.00268645, 0.01975209, 0.0319514 , ..., 0.00735298, 0.00941249,
         0.00082775]]),
 -2465847828331443343: array([[0.        , 0.        , 0.02635587, ..., 0.        , 0.        ,
         0.        ]]),
 8766802480854827422: array([[0.00316512, 0.01976719, 0.01699645, ..., 0.        , 0.        ,
         0.        ]]),
 -5706287032724665714: array([[0.0005801 , 0.0085487 , 0.01460223, ..., 0.00096702, 0.00067979,
         0.        ]]),
 -3595444231792050977: array([[0.        , 0.00253122, 0.01188286, ..., 0.00038015, 0.00279357,
         0.00407782]]),
 -1352542225971050638: array([[0.01160952, 0.00731646, 0.00651658, ..., 0.00504092, 0.        ,
         0.        ]]),
 7530102388348371118: array([[0.        , 0

In [46]:
my_profile = profiles_users[4297386640890689301]  
print(my_profile.shape)  
pd.DataFrame(sorted(zip(tfidf_feature_names,  
                        profiles_users[4297386640890689301].flatten().tolist()), key=lambda x: -x[1])[:20],  
             columns=['token', 'relevance'])  

(1, 5000)


Unnamed: 0,token,relevance
0,de,0.638046
1,que,0.287426
2,para,0.191575
3,em,0.148207
4,uma,0.146047
5,um,0.144464
6,da,0.136748
7,com,0.122168
8,java,0.121373
9,não,0.109418


In [47]:
class ContentBasedRecommender:  
     
    MODEL_NAME = 'Content-Based'  
     
    def __init__(self, items_df=None):  
        self.item_ids = item_ids  
        self.items_df = items_df  
         
    def getting_model_name(self):  
        return self.MODEL_NAME  
         
    def _getting_similar_items_to_the_users(self, person_id, topn=1000):  
        # The user profile and all object profiles are compared using the cosine similarity formula.  
        cosine_similarities = cosine_similarity(profiles_users[person_id], tfidf_matrix)  
        # Gets the most comparable products.  
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]  
        # Sort comparable objects according to similarity.  
        similar_items = sorted([(item_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])  
        return similar_items  
         
    def recommending_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):  
        similar_items = self._getting_similar_items_to_the_users(user_id)  
        # Ignores things with which the user has previously behaved  
        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))  
         
        dataframe_recommendations = pd.DataFrame(similar_items_filtered, columns=['contentId', 'recStrength']).head(topn)  
  
        if verbose:  
            if self.items_df is None:  
                raise Exception('"items_df" is required in verbose mode')  
  
            dataframe_recommendationsdataframe_recommendations = dataframe_recommendations.merge(self.items_df, how = 'left',  
                                                          left_on = 'contentId',  
                                                          right_on = 'contentId')[['recStrength', 'contentId', 'title', 'url', 'lang']]  
  
  
        return dataframe_recommendations  
     
content_based_recommender_model = ContentBasedRecommender(df_shared)  

In [48]:
print('Evaluating The Content-Based Filtering model...')  
metrics_cb_global, dataframe_cb_result_detailed = model_evaluator.model_evaluation(content_based_recommender_model)  
print('\nGlobal metrics:\n%s' % metrics_cb_global)  
dataframe_cb_result_detailed.head(10)  

Evaluating The Content-Based Filtering model...


since Python 3.9 and will be removed in a subsequent version.
  sample_non_interacted_items = random.sample(items_not_interacted, SS)
since Python 3.9 and will be removed in a subsequent version.
  sample_non_interacted_items = random.sample(items_not_interacted, SS)
since Python 3.9 and will be removed in a subsequent version.
  sample_non_interacted_items = random.sample(items_not_interacted, SS)
since Python 3.9 and will be removed in a subsequent version.
  sample_non_interacted_items = random.sample(items_not_interacted, SS)
since Python 3.9 and will be removed in a subsequent version.
  sample_non_interacted_items = random.sample(items_not_interacted, SS)
since Python 3.9 and will be removed in a subsequent version.
  sample_non_interacted_items = random.sample(items_not_interacted, SS)
since Python 3.9 and will be removed in a subsequent version.
  sample_non_interacted_items = random.sample(items_not_interacted, SS)
since Python 3.9 and will be removed in a subsequent version.


1399 users processed

Global metrics:
{'modelName': 'Content-Based', 'recall@5': 0.06863357919739968, 'recall@10': 0.11788973621702713}


since Python 3.9 and will be removed in a subsequent version.
  sample_non_interacted_items = random.sample(items_not_interacted, SS)
since Python 3.9 and will be removed in a subsequent version.
  sample_non_interacted_items = random.sample(items_not_interacted, SS)
since Python 3.9 and will be removed in a subsequent version.
  sample_non_interacted_items = random.sample(items_not_interacted, SS)
since Python 3.9 and will be removed in a subsequent version.
  sample_non_interacted_items = random.sample(items_not_interacted, SS)
since Python 3.9 and will be removed in a subsequent version.
  sample_non_interacted_items = random.sample(items_not_interacted, SS)
since Python 3.9 and will be removed in a subsequent version.
  sample_non_interacted_items = random.sample(items_not_interacted, SS)


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
11,3,9,192,0.015625,0.046875,3609194402293569455
86,5,9,134,0.037313,0.067164,-2626634673110551643
29,6,16,130,0.046154,0.123077,-1032019229384696495
81,16,26,117,0.136752,0.222222,-1443636648652872475
55,1,1,87,0.011494,0.011494,-2979881261169775358
65,9,11,80,0.1125,0.1375,-3596626804281480007
10,2,3,73,0.027397,0.041096,1116121227607581999
5,4,10,69,0.057971,0.144928,692689608292948411
45,4,7,68,0.058824,0.102941,-9016528795238256703
90,3,7,68,0.044118,0.102941,3636910968448833585
