In [2]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import random
import implicit
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

In [3]:
articles_df = pd.read_csv('shared_articles.csv')
interactions_df = pd.read_csv('users_interactions.csv')
articles_df.drop(['authorUserAgent', 'authorRegion', 'authorCountry'], axis=1, inplace=True)
interactions_df.drop(['userAgent', 'userRegion', 'userCountry'], axis=1, inplace=True)

In [4]:
articles_df.head(3)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en


In [5]:
articles_df['eventType'].value_counts()

CONTENT SHARED     3047
CONTENT REMOVED      75
Name: eventType, dtype: int64

In [6]:
articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']
articles_df.drop('eventType', axis=1, inplace=True)

In [7]:
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3047 entries, 1 to 3121
Data columns (total 9 columns):
timestamp          3047 non-null int64
contentId          3047 non-null int64
authorPersonId     3047 non-null int64
authorSessionId    3047 non-null int64
contentType        3047 non-null object
url                3047 non-null object
title              3047 non-null object
text               3047 non-null object
lang               3047 non-null object
dtypes: int64(4), object(5)
memory usage: 238.0+ KB


In [8]:
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72312 entries, 0 to 72311
Data columns (total 5 columns):
timestamp    72312 non-null int64
eventType    72312 non-null object
contentId    72312 non-null int64
personId     72312 non-null int64
sessionId    72312 non-null int64
dtypes: int64(4), object(1)
memory usage: 2.8+ MB


In [9]:
df = pd.merge(interactions_df[['contentId','personId', 'eventType']], articles_df[['contentId', 'title']], how = 'inner', on = 'contentId')

In [10]:
df.head(10)

Unnamed: 0,contentId,personId,eventType,title
0,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
1,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
2,-3499919498720038879,-108842214936804958,VIEW,Hiri wants to fix the workplace email problem
3,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem
4,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem
5,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem
6,-3499919498720038879,-8020832670974472349,VIEW,Hiri wants to fix the workplace email problem
7,-3499919498720038879,-8020832670974472349,VIEW,Hiri wants to fix the workplace email problem
8,-3499919498720038879,-9009798162809551896,LIKE,Hiri wants to fix the workplace email problem
9,-3499919498720038879,-9009798162809551896,VIEW,Hiri wants to fix the workplace email problem


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72269 entries, 0 to 72268
Data columns (total 4 columns):
contentId    72269 non-null int64
personId     72269 non-null int64
eventType    72269 non-null object
title        72269 non-null object
dtypes: int64(2), object(2)
memory usage: 2.8+ MB


In [12]:
df['eventType'].value_counts()

VIEW               61043
LIKE                5745
BOOKMARK            2463
COMMENT CREATED     1611
FOLLOW              1407
Name: eventType, dtype: int64

In [13]:
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 3.0, 
   'FOLLOW': 4.0,
   'COMMENT CREATED': 5.0,  
}

df['eventStrength'] = df['eventType'].apply(lambda x: event_type_strength[x])

In [14]:
df.head(10)

Unnamed: 0,contentId,personId,eventType,title,eventStrength
0,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem,1.0
1,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem,1.0
2,-3499919498720038879,-108842214936804958,VIEW,Hiri wants to fix the workplace email problem,1.0
3,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem,1.0
4,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem,1.0
5,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem,1.0
6,-3499919498720038879,-8020832670974472349,VIEW,Hiri wants to fix the workplace email problem,1.0
7,-3499919498720038879,-8020832670974472349,VIEW,Hiri wants to fix the workplace email problem,1.0
8,-3499919498720038879,-9009798162809551896,LIKE,Hiri wants to fix the workplace email problem,2.0
9,-3499919498720038879,-9009798162809551896,VIEW,Hiri wants to fix the workplace email problem,1.0


In [15]:
df = df.drop_duplicates()
grouped_df = df.groupby(['personId', 'contentId', 'title']).sum().reset_index()

In [16]:
grouped_df.sample(10)

Unnamed: 0,personId,contentId,title,eventStrength
9834,-4092545774372727680,3801658304588389996,"As Artificial Intelligence Evolves, So Does It...",4.0
9653,-4125205337625989832,-3040610224044779845,Things you probably didn't know you could do w...,6.0
15892,-1547632488317277075,4383035933260676803,"Pedágio das rodovias estaduais de SP sobe 9,32...",1.0
18196,-1032019229384696495,7277161137091492767,An overview of web service solutions in Drupal 8,10.0
9809,-4092545774372727680,174707786647990372,Bringing Pokémon GO to life on Google Cloud,1.0
36460,7020155836312304353,3467834993860080188,15 Interesting JavaScript and CSS Libraries fo...,1.0
27749,3217014177234377440,3180828616327439381,Governo define cronograma para plano nacional ...,1.0
11289,-3458422503840091409,-2386148284399181873,Vídeos de testes unitários em sistemas embarca...,1.0
4922,-6944500707172804068,-3136683414587095274,Marketplace,3.0
37713,7741213197781008422,-8723568727518869245,The Most Important Push Notification metric - ...,1.0


In [17]:
grouped_df.dtypes

personId           int64
contentId          int64
title             object
eventStrength    float64
dtype: object

In [18]:
grouped_df['title'] = grouped_df['title'].astype("category")
grouped_df['personId'] = grouped_df['personId'].astype("category")
grouped_df['contentId'] = grouped_df['contentId'].astype("category")
grouped_df['person_id'] = grouped_df['personId'].cat.codes
grouped_df['content_id'] = grouped_df['contentId'].cat.codes

In [19]:
grouped_df.head(10)

Unnamed: 0,personId,contentId,title,eventStrength,person_id,content_id
0,-9223121837663643404,-8949113594875411859,"No Brasil, '25% dos celulares ainda são 'Burro...",1.0,0,65
1,-9223121837663643404,-8377626164558006982,Bad Writing Is Destroying Your Company's Produ...,1.0,0,159
2,-9223121837663643404,-8208801367848627943,Ray Kurzweil: The world isn't getting worse - ...,1.0,0,187
3,-9223121837663643404,-8187220755213888616,Organizing for digital acceleration: Making a ...,1.0,0,195
4,-9223121837663643404,-7423191370472335463,"Espresso Intents: não é magia, é tecnologia! -...",1.0,0,313
5,-9223121837663643404,-7331393944609614247,Here's proof that Google is getting serious ab...,1.0,0,327
6,-9223121837663643404,-6872546942144599345,My experience with Google's Associate Android ...,1.0,0,385
7,-9223121837663643404,-6728844082024523434,Seniority,1.0,0,416
8,-9223121837663643404,-6590819806697898649,Listas com RecyclerView - Android Dev BR,1.0,0,442
9,-9223121837663643404,-6558712014192834002,Google's fair use victory is good for open source,1.0,0,450


In [20]:
sparse_content_person = sparse.csr_matrix((grouped_df['eventStrength'].astype(float), (grouped_df['content_id'], grouped_df['person_id'])))
sparse_person_content = sparse.csr_matrix((grouped_df['eventStrength'].astype(float), (grouped_df['person_id'], grouped_df['content_id'])))

In [21]:
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)



In [22]:
alpha = 15
data = (sparse_content_person * alpha).astype('double')

# Fit the model
model.fit(data)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [23]:
content_id = 450
n_similar = 10

person_vecs = model.user_factors
content_vecs = model.item_factors

content_norms = np.sqrt((content_vecs * content_vecs).sum(axis=1))

scores = content_vecs.dot(content_vecs[content_id]) / content_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / content_norms[content_id]), key=lambda x: -x[1])

In [24]:
for content in similar:
    idx, score = content
    print(grouped_df.title.loc[grouped_df.content_id == idx].iloc[0])

Google's fair use victory is good for open source
Google lags behind Amazon and Microsoft's cloud in one important area
Up your DevOps chops with this online Kubernetes class
Google's Cloud Dataflow stomps on Apache Spark in new benchmark tests
Building immutable entities into Google Cloud Datastore
5 Unique Features Of Google Compute Engine That No IaaS Provider Could Match
Automate deployments and traffic splitting with the App Engine Admin API
Google just open sourced something called 'Parsey McParseface,' and it could change AI forever
Google connects BigQuery to Google Drive and Sheets
An independent organization just ranked Google as the best cloud, beating Amazon


In [25]:
def recommend(person_id, sparse_person_content, person_vecs, content_vecs, num_contents=10):
    # Get the interactions scores from the sparse person content matrix
    person_interactions = sparse_person_content[person_id,:].toarray()
    # Add 1 to everything, so that articles with no interaction yet become equal to 1
    person_interactions = person_interactions.reshape(-1) + 1
    # Make articles already interacted zero
    person_interactions[person_interactions > 1] = 0
    # Get dot product of person vector and all content vectors
    rec_vector = person_vecs[person_id,:].dot(content_vecs.T).toarray()
    
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    # Content already interacted have their recommendation multiplied by zero
    recommend_vector = person_interactions * rec_vector_scaled
    # Sort the indices of the content into order of best recommendations
    content_idx = np.argsort(recommend_vector)[::-1][:num_contents]
    
    # Start empty list to store titles and scores
    titles = []
    scores = []

    for idx in content_idx:
        # Append titles and scores to the list
        titles.append(grouped_df.title.loc[grouped_df.content_id == idx].iloc[0])
        scores.append(recommend_vector[idx])

    recommendations = pd.DataFrame({'title': titles, 'score': scores})

    return recommendations

In [26]:
# Get the trained person and content vectors. We convert them to csr matrices
person_vecs = sparse.csr_matrix(model.user_factors)
content_vecs = sparse.csr_matrix(model.item_factors)

# Create recommendations for person with id 50
person_id = 50

recommendations = recommend(person_id, sparse_person_content, person_vecs, content_vecs)

print(recommendations)

                                               title     score
0  Custo do Erro - Cinco motivos para investir em...  1.000000
1  Former Google career coach shares a visual tri...  0.889816
2  Ray Kurzweil: The world isn't getting worse - ...  0.871815
3  Do You Suffer From Deployment Anxiety? - DZone...  0.834542
4  My experience with Google's Associate Android ...  0.801299
5  'The Simpsons' celebrates 600 episodes with a ...  0.794170
6  Psicóloga de Harvard diz que as pessoas julgam...  0.782687
7               Bring Your App To Life with CALayers  0.781154
8                              Microservices testing  0.780169
9  Novo workaholic trabalha, pratica esportes e t...  0.778091


In [27]:
grouped_df.loc[grouped_df['person_id'] == 50].sort_values(by=['eventStrength'], ascending=False)[['title', 'person_id', 'eventStrength']].head(10)

Unnamed: 0,title,person_id,eventStrength
1727,Acquia Engage 2016: Day One,50,3.0
1791,Um bilhão de arquivos mostram quem vence a dis...,50,3.0
1781,Acquia Engage Awards Finalists Announced,50,3.0
1778,Sharing innovation with your competitors - Dri...,50,3.0
1769,Don't document your code. Code your documentat...,50,3.0
1747,Who sponsors Drupal development? | Dries Buytaert,50,3.0
1768,Johnson & Johnson comprará grupo suíço por US$...,50,1.0
1767,Slack and Google announce partnership focused ...,50,1.0
1770,Rating the English Proficiency of Countries an...,50,1.0
1766,Infográfico: Algoritmos para Aprendizado de Má...,50,1.0


In [28]:
person_id = 2

recommendations = recommend(person_id, sparse_person_content, person_vecs, content_vecs)

print(recommendations)

                                               title     score
0                   Livro: Retrospectivas Divertidas  0.919207
1          Google Ranking Factors: The Complete List  0.901563
2  Novo workaholic trabalha, pratica esportes e t...  0.805523
3  Psicóloga de Harvard diz que as pessoas julgam...  0.770835
4  ITA está oferecendo 10 cursos gratuitos a dist...  0.739980
5           Drupal and ambitious digital experiences  0.722973
6  How to Improve 8 Major Problem Areas for Japan...  0.699353
7                    40 Basic Japanese conversations  0.675020
8                                     WeChat's world  0.662424
9                               Japanese for dummies  0.638213


In [29]:
grouped_df.loc[grouped_df['person_id'] == 2].sort_values(by=['eventStrength'], ascending=False)[['title', 'eventStrength', 'person_id']]

Unnamed: 0,title,eventStrength,person_id
51,Former Google career coach shares a visual tri...,6.0,2
48,Request lesson : How and when to use はず(=hazu)...,3.0,2
49,Aposta na inovação,3.0,2
50,"The Algorithm March, Japan's Strangely Enterta...",3.0,2
54,Como são escrita as risadas em japonês? - Suki...,3.0,2
52,A minha viagem à Maternidade #tetodomundo,1.0,2
53,Learn Hiragana: The Ultimate Guide,1.0,2


In [30]:
person_id = 1

recommendations = recommend(person_id, sparse_person_content, person_vecs, content_vecs)

print(recommendations)

                                               title     score
0  Como são escrita as risadas em japonês? - Suki...  0.859833
1  ITA está oferecendo 10 cursos gratuitos a dist...  0.781478
2                   Livro: Retrospectivas Divertidas  0.757219
3  Request lesson : How and when to use はず(=hazu)...  0.735989
4          Google Ranking Factors: The Complete List  0.723770
5  'The Simpsons' celebrates 600 episodes with a ...  0.712964
6  The Algorithm March, Japan's Strangely Enterta...  0.674670
7  NodeMCU (ESP8266) o módulo que desbanca o Ardu...  0.672613
8          A minha viagem à Maternidade #tetodomundo  0.663650
9                                          UX ou UI?  0.651298


In [31]:
grouped_df.loc[grouped_df['person_id'] == 1].sort_values(by=['eventStrength'], ascending=False)[['title', 'eventStrength', 'person_id']]

Unnamed: 0,title,eventStrength,person_id
44,Learn Hiragana: The Ultimate Guide,3.0,1
43,Firebase Test Lab for Android,1.0,1
45,"Fresco, sim! - Android Dev BR",1.0,1
46,Japanese for dummies,1.0,1
47,Firebase and Google Cloud: better together,1.0,1


In [32]:
import random

def make_train(ratings, pct_test = 0.2):
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of item,user index into list

    
    random.seed(0) # Set the random seed to zero for reproducibility
    
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of item-user pairs without replacement

    content_inds = [index[0] for index in samples] # Get the item row indices

    person_inds = [index[1] for index in samples] # Get the user column indices

    
    training_set[content_inds, person_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    
    return training_set, test_set, list(set(person_inds))

In [33]:
content_train, content_test, content_persons_altered = make_train(sparse_content_person, pct_test = 0.2)

In [34]:
def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)

In [35]:
def calc_mean_auc(training_set, altered_persons, predictions, test_set):
    store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
    popularity_auc = [] # To store popular AUC scores
    pop_contents = np.array(test_set.sum(axis = 1)).reshape(-1) # Get sum of item iteractions to find most popular
    content_vecs = predictions[1]
    for person in altered_persons: # Iterate through each user that had an item altered
        training_column = training_set[:,person].toarray().reshape(-1) # Get the training set column
        zero_inds = np.where(training_column == 0) # Find where the interaction had not yet occurred
        
        # Get the predicted values based on our user/item vectors
        person_vec = predictions[0][person,:]
        pred = person_vec.dot(content_vecs).toarray()[0,zero_inds].reshape(-1)
        
        # Get only the items that were originally zero
        # Select all ratings from the MF prediction for this user that originally had no iteraction
        actual = test_set[:,person].toarray()[zero_inds,0].reshape(-1)
        
        # Select the binarized yes/no interaction pairs from the original full data
        # that align with the same pairs in training 
        pop = pop_contents[zero_inds] # Get the item popularity for our chosen items
        
        store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
        
        popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and score
    # End users iteration
    
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))

In [36]:
calc_mean_auc(content_train, content_persons_altered,
              [person_vecs, content_vecs.T], content_test)

(0.98, 0.819)