In [1]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import warnings

In [2]:
# Ignore warnings for cleaner output

warnings.filterwarnings('ignore')

In [3]:
# Data input
# List of Views, Bookmarks, Likes, Follows, and Comments on news articles. Each article and user has a unique Id

shared_articles_df = pd.read_csv('shared_articles.csv')
shared_articles_df = shared_articles_df[shared_articles_df['eventType'] == 'CONTENT SHARED']
shared_articles_df.head(10)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en
5,1459194522,CONTENT SHARED,-2826566343807132236,4340306774493623681,8940341205206233829,,,,HTML,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,en
6,1459194557,CONTENT SHARED,-2148899391355011268,4340306774493623681,8940341205206233829,,,,HTML,http://www.newsbtc.com/2016/03/28/banks-need-c...,Banks Need To Collaborate With Bitcoin and Fin...,It will take time until banks come around to t...,en
7,1459194599,CONTENT SHARED,4119190424078847945,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/blockchai...,Blockchain Technology Could Put Bank Auditors ...,When most people think about computers and rob...,en
8,1459194751,CONTENT SHARED,-7926018713416777892,4340306774493623681,8940341205206233829,,,,HTML,https://news.bitcoin.com/conglomerates-intervi...,Why Decentralized Conglomerates Will Scale Bet...,"Bitcoin.com spoke with the OpenLedger CEO, Ron...",en
9,1459194842,CONTENT SHARED,3353902017498793780,4340306774493623681,8940341205206233829,,,,HTML,https://www.cryptocoinsnews.com/ethereum-rise-...,The Rise And Growth of Ethereum Gets Mainstrea...,"Ethereum, considered by many to be the most pr...",en
10,1459210504,CONTENT SHARED,-9157338616628196758,5206835909720479405,-7864441319395545950,,,,HTML,http://economia.ig.com.br/2016-03-27/situacao-...,Situação financeira ruim de varejistas pressio...,A queda nas vendas e a deterioração na situaçã...,pt


In [4]:
users_interactions_df = pd.read_csv('users_interactions.csv')
users_interactions_df.head(10)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,
5,1465413742,VIEW,310515487419366995,-8763398617720485024,1395789369402380392,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,MG,BR
6,1465415950,VIEW,-8864073373672512525,3609194402293569455,1143207167886864524,,,
7,1465415066,VIEW,-1492913151930215984,4254153380739593270,8743229464706506141,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR
8,1465413762,VIEW,310515487419366995,344280948527967603,-3167637573980064150,,,
9,1465413771,VIEW,3064370296170038610,3609194402293569455,1143207167886864524,,,


In [5]:
# Data Munging
# We want to appropriotaley weight content interactions that show a higher levels of interest than others

event_type_weight = {
    'VIEW': 1.0,
    'BOOKMARK': 1.5, 
    'LIKE': 2.0, 
    'FOLLOW': 3.0,
    'COMMENT CREATED': 4.0,  
}

users_interactions_df['eventWeight'] = users_interactions_df['eventType'].apply(lambda x: event_type_weight[x])

In [6]:
# This removes any users with <4 interactions as their data is not sufficent enough to anaylzed with confidence

users_interactions_count_df = users_interactions_df.groupby(['personId', 'contentId']).size().groupby('personId').size()
print('Number of logged users: %d' % len(users_interactions_count_df))
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 4].reset_index()[['personId']]
print('Number of logged users with at least 4 interactions: %d' % len(users_with_enough_interactions_df))

# Creating a new data frame of interactions from users with >= 4 interactions

print('Number of total interactions: %d' % len(users_interactions_df))
users_interactions_4_df = users_interactions_df.merge(users_with_enough_interactions_df, 
    how = 'right',
    left_on = 'personId',
    right_on = 'personId')
print('Number of interactions from users with at least 4 interactions: %d' % len(users_interactions_4_df))

Number of logged users: 1895
Number of logged users with at least 4 interactions: 1249
Number of total interactions: 72312
Number of interactions from users with at least 4 interactions: 70549


In [7]:
# List of every single user with 4 interactions by id

curated_user_list = users_interactions_4_df['personId']
curated_user_list = curated_user_list.drop_duplicates()
curated_user_list.head

<bound method NDFrame.head of 0       -9223121837663643404
63      -9212075797126931087
72      -9207251133131336884
86      -9199575329909162940
101     -9196668942822132778
                ...         
70335    9165571805999894845
70343    9187866633451383747
70369    9191849144618614467
70388    9199170757466086545
70414    9210530975708218054
Name: personId, Length: 1249, dtype: int64>

In [8]:
# We want to consider multiple user interactions with the same article as one weight.  Aditionally we want to smooth this weight
# to prevent overfitting around the heaviest weighted interactions

def log_user_preference(x):
    return math.log(1+x, 2)
    
interactions_final_df = users_interactions_4_df \
    .groupby(['personId', 'contentId'])['eventWeight'].sum() \
    .apply(log_user_preference).reset_index()
print('Number of unique user/item interactions: %d' % len(interactions_final_df))
interactions_final_df.head(10)

Number of unique user/item interactions: 39542


Unnamed: 0,personId,contentId,eventWeight
0,-9223121837663643404,-8949113594875411859,1.0
1,-9223121837663643404,-8377626164558006982,1.0
2,-9223121837663643404,-8208801367848627943,1.0
3,-9223121837663643404,-8187220755213888616,1.0
4,-9223121837663643404,-7423191370472335463,3.169925
5,-9223121837663643404,-7331393944609614247,1.0
6,-9223121837663643404,-6872546942144599345,1.0
7,-9223121837663643404,-6728844082024523434,1.0
8,-9223121837663643404,-6590819806697898649,1.0
9,-9223121837663643404,-6558712014192834002,1.584963


In [9]:
# Presorting data to increase the speed of the algorithim's searches

interactions_final_indexed_df = interactions_final_df.set_index('personId')
interactions_final_indexed_df.head(10)

Unnamed: 0_level_0,contentId,eventWeight
personId,Unnamed: 1_level_1,Unnamed: 2_level_1
-9223121837663643404,-8949113594875411859,1.0
-9223121837663643404,-8377626164558006982,1.0
-9223121837663643404,-8208801367848627943,1.0
-9223121837663643404,-8187220755213888616,1.0
-9223121837663643404,-7423191370472335463,3.169925
-9223121837663643404,-7331393944609614247,1.0
-9223121837663643404,-6872546942144599345,1.0
-9223121837663643404,-6728844082024523434,1.0
-9223121837663643404,-6590819806697898649,1.0
-9223121837663643404,-6558712014192834002,1.584963


In [10]:
# Natural Language Processing
# We hope to pair articles together based on similar use of words.  The less frequent a word is the more heavily weighted it
# will be.

#Additionally stop words have been removed to remove their influence

stopwords_list = stopwords.words('english') + stopwords.words('portuguese') + stopwords.words('spanish')

In [11]:
# Creates a vectorizer filled by the main unigrams (group of one word) and bigrams (group of two words) found in the original
# shared article data frame while ignoring stopwords

vectorizer = TfidfVectorizer(analyzer='word',
     ngram_range=(1, 2),
     min_df=0.003,
     max_df=0.5,
     max_features=4000,
     stop_words=stopwords_list)

content_ids = shared_articles_df['contentId'].tolist()
similarity_matrix = vectorizer.fit_transform(shared_articles_df['title'] + "" + shared_articles_df['text'])
similarity_feature_names = vectorizer.get_feature_names_out()
similarity_matrix

<3047x4000 sparse matrix of type '<class 'numpy.float64'>'
	with 594098 stored elements in Compressed Sparse Row format>

In [12]:
# Searches item profiles

def find_item_profile(item_id):
    idx = content_ids.index(item_id)
    item_profile = similarity_matrix[idx:idx+1]
    return item_profile

# Creates a list of item profiles

def find_item_profiles(id_list):
    item_profiles_list = [find_item_profile(x) for x in id_list]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

# Creates indivdual profile with help from find_item_profiles() and find_item_profile()

def build_individual_profile(person_id, shared_interactions_indexed_df):
    interactions_person_df = shared_interactions_indexed_df.loc[person_id]
    user_item_profiles = find_item_profiles(interactions_person_df['contentId'])   
    user_item_strengths = np.array(interactions_person_df['eventWeight']).reshape(-1,1)
    #Weighted average of item profiles by the interactions strength
    user_item_strengths_weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)
    user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths_weighted_avg)
    return user_profile_norm

# Collects and loops build_individual_profile()

def build_individual_profiles(): 
    shared_interactions_indexed_df = interactions_final_df[interactions_final_df['contentId'].isin(shared_articles_df['contentId'])].set_index('personId')
    user_profiles = {}
    for person_id in shared_interactions_indexed_df.index.unique():
        user_profiles[person_id] = build_individual_profile(person_id, shared_interactions_indexed_df)
    return user_profiles

In [13]:
# Builds all user TF-IDF profiles

user_profiles = build_individual_profiles()
len(user_profiles)

1249

In [14]:
# Example user TF-IDF vector output

myprofile = user_profiles[2766187446275090740]
print(myprofile.shape)
pd.DataFrame(sorted(zip(similarity_feature_names, 
    user_profiles[2766187446275090740].flatten().tolist()), key=lambda x: -x[1])[:15],
    columns=['keyword', 'similarity score'])

(1, 4000)


Unnamed: 0,keyword,similarity score
0,java,0.250929
1,code,0.22606
2,google,0.211709
3,angular,0.161134
4,javascript,0.159476
5,typescript,0.127432
6,cloud,0.114579
7,use,0.11005
8,web,0.107291
9,reactive,0.107039


In [15]:
# Example user TF-IDF vector output

myprofile = user_profiles[6879394870211872116]
print(myprofile.shape)
pd.DataFrame(sorted(zip(similarity_feature_names, 
    user_profiles[6879394870211872116].flatten().tolist()), key=lambda x: -x[1])[:15],
    columns=['keyword', 'similarity score'])

(1, 4000)


Unnamed: 0,keyword,similarity score
0,netflix,0.221427
1,máquina,0.185055
2,visa,0.180379
3,aplicativo,0.173755
4,cartão,0.171495
5,nubank,0.167431
6,criou,0.164137
7,produto,0.158815
8,ter,0.156644
9,journal,0.152879


In [16]:
# Example user TF-IDF vector output

myprofile = user_profiles[-8550167523008133722]
print(myprofile.shape)
pd.DataFrame(sorted(zip(similarity_feature_names, 
    user_profiles[-8550167523008133722].flatten().tolist()), key=lambda x: -x[1])[:15],
    columns=['keyword', 'similarity score'])

(1, 4000)


Unnamed: 0,keyword,similarity score
0,google,0.233387
1,data,0.154513
2,cloud,0.134048
3,docker,0.132203
4,like,0.098627
5,machine,0.098031
6,apple,0.096434
7,code,0.091267
8,use,0.091208
9,learning,0.089051


In [17]:
# Example user TF-IDF vector output

myprofile = user_profiles[6013226412048763966]
print(myprofile.shape)
pd.DataFrame(sorted(zip(similarity_feature_names, 
    user_profiles[6013226412048763966].flatten().tolist()), key=lambda x: -x[1])[:15],
    columns=['keyword', 'similarity score'])

(1, 4000)


Unnamed: 0,keyword,similarity score
0,coaching,0.517868
1,diz,0.207178
2,pessoas,0.188951
3,leadership,0.13104
4,life,0.117635
5,leaders,0.115194
6,fazer,0.114823
7,lean,0.110962
8,equipe,0.105999
9,líderes,0.092964


In [18]:
# Sorts the most similar articles via TF-IDF and outputs as list of top 10 by contentId

def recommendation_list(user_id):
    cosine_similarities = cosine_similarity(user_profiles[user_id], similarity_matrix)
    similar_indices = cosine_similarities.argsort().flatten()[-1000:]
    similar_items = sorted([(content_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
    top10recommended = similar_items[0:10]
    id_recommended = pd.DataFrame(top10recommended)
    id_recommended = id_recommended.iloc[: , :1] 
    return id_recommended

In [19]:
# Creates a list of all interacted items to be compared with recommended results
def get_items_interacted(person_id, interactions_df):
    interacted_items = interactions_df.loc[person_id]['contentId']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

# Counts number of top ten recommendations in user's interacted list
def get_top_ten_accuracy(person_id):
    recommended_df = recommendation_list(person_id)
    interacted_df = get_items_interacted(person_id, interactions_final_indexed_df)
    count = 0
    for i in range(10):
        test_value = recommended_df.iloc[i,0]
        if test_value in interacted_df:
            count += 1
    return count

In [20]:
# Run analysis on all curated users to calculate accuracy and recall

correct = 0
count = 0
total_user_articles = 0
for i in curated_user_list:
    correct += get_top_ten_accuracy(i)
    count += 1
    total_user_articles += len(get_items_interacted(i, interactions_final_indexed_df))
guesses = count * 10
accuracy = correct / guesses
recall = correct / total_user_articles

In [21]:
# Evaluating reccomendation system

print('Number of users analyzed: %d' %count)
print('Number of correct guesses in top 10 reccomended: %d' %correct)
print('Number of total guesses in top 10 reccomended: %d' %guesses)
print('Number of total anaylzed user articles read: %d' %total_user_articles)
print('Accuracy: %f' %accuracy)
print('Recall: %f' %recall)

Number of users analyzed: 1249
Number of correct guesses in top 10 reccomended: 6366
Number of total guesses in top 10 reccomended: 12490
Number of total anaylzed user articles read: 39542
Accuracy: 0.509688
Recall: 0.160993
