In [761]:
import pandas as pd
import numpy as np
import math

In [762]:
# Load the shared-articles table; the trailing expression displays (rows, columns).
articles_df = pd.read_csv(filepath_or_buffer='data/shared_articles.csv')
articles_df.shape

(3122, 13)

In [763]:
# Keep only the rows whose eventType is 'CONTENT SHARED'.
articles_df = articles_df.loc[articles_df['eventType'].eq('CONTENT SHARED')]
articles_df.shape

(3047, 13)

In [764]:
# Load the raw user-interaction log and preview its first five rows.
interactions_df = pd.read_csv(filepath_or_buffer='data/users_interactions.csv')
interactions_df.head(5)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,


In [765]:
# Treat person/content identifiers as opaque string keys in every frame.
interactions_df['personId'] = interactions_df['personId'].astype(str)
interactions_df['contentId'] = interactions_df['contentId'].astype(str)
# articles_df is a boolean-filtered slice of the raw frame, so build a new
# frame with .assign() instead of writing a column into the slice (this avoids
# SettingWithCopyWarning and the pitfalls of attribute-style assignment).
articles_df = articles_df.assign(contentId=articles_df['contentId'].astype(str))

In [766]:
# Weight assigned to each interaction event type (insertion order is kept).
event_type = dict([
    ('VIEW', 1.0),
    ('LIKE', 2.0),
    ('BOOKMARK', 2.5),
    ('FOLLOW', 3.0),
    ('COMMENT CREATED', 4.0),
])

In [767]:
# Two-column lookup table (eventType, eventWeight) built from the event_type mapping.
event_weight = pd.DataFrame(
    list(event_type.items()),
    columns=['eventType', 'eventWeight'],
)
event_weight

Unnamed: 0,eventType,eventWeight
0,VIEW,1.0
1,LIKE,2.0
2,BOOKMARK,2.5
3,FOLLOW,3.0
4,COMMENT CREATED,4.0


In [768]:
# Left-join the weight table onto the interactions; with no explicit key the
# join uses the columns the two frames have in common.
interactions_df = pd.merge(interactions_df, event_weight, how='left')

In [769]:
# Average weight over all raw interaction rows.
interactions_df['eventWeight'].mean()

1.2362885828078327

In [770]:
# One row per distinct (personId, contentId) pair together with the number of
# raw interaction rows in that pair. Counting with size() keeps contentId as
# the real article id; the previous form aggregated the grouping key itself
# with as_index=False, which relies on how pandas resolves the resulting
# column-name collision.
person_content = (
    interactions_df
    .groupby(['personId', 'contentId'])
    .size()
    .reset_index(name='interactionCount')
)

In [771]:
# Per-person non-null counts of every remaining column of person_content.
person_content1 = (
    person_content
    .groupby('personId', as_index=False)
    .count()
)

In [772]:
# personIds whose contentId count is at least 5.
person_list = person_content1.loc[person_content1['contentId'].ge(5), 'personId'].tolist()

In [773]:
# Keep only interactions from the selected users. The explicit .copy() makes
# the result an independent frame, so the column assignments performed on it
# later do not write into a slice (SettingWithCopyWarning).
interactions_df = interactions_df[interactions_df['personId'].isin(person_list)].copy()

In [774]:
# Shape after keeping only the users listed in person_list.
interactions_df.shape

(69868, 9)

In [775]:
def smooth_user_preference(x):
    """Return the base-2 logarithm of ``1 + x``.

    Used to compress summed interaction weights so that very active
    user/article pairs do not dominate.

    Args:
        x: A number greater than -1.

    Returns:
        float: ``log2(1 + x)``.

    Raises:
        ValueError: If ``1 + x`` is not positive.
    """
    # math.log2 is more accurate than math.log(value, 2), which divides two
    # natural logarithms and can be off by an ulp even for exact powers of two.
    return math.log2(1 + x)

In [776]:
# Total event weight per (personId, contentId) pair, broadcast back onto every
# raw interaction row of that pair.
interactions_df['eventWeSum'] = (
    interactions_df
    .groupby(['personId', 'contentId'])['eventWeight']
    .transform('sum')
)
interactions_df.shape

(69868, 10)

In [777]:
# Log-smooth the summed weights; the function is passed directly instead of
# being wrapped in a lambda.
interactions_df['eventLog'] = interactions_df['eventWeSum'].apply(smooth_user_preference)
interactions_df.shape

(69868, 11)

In [778]:
# Collapse to one row per (personId, contentId): latest timestamp and the mean
# of eventLog. Scalar aggregations keep the columns flat, so only 'timestamp'
# needs renaming.
interactions_df = (
    interactions_df
    .groupby(['personId', 'contentId'])
    .agg({'timestamp': 'max', 'eventLog': 'mean'})
    .reset_index()
    .rename(columns={'timestamp': 'timeMax'})
)
interactions_df.head()

Unnamed: 0,personId,contentId,timeMax,eventLog
0,-1007001694607905623,-5065077552540450930,1470395911,1.0
1,-1007001694607905623,-6623581327558800021,1487240080,1.0
2,-1007001694607905623,-793729620925729327,1472834892,1.0
3,-1007001694607905623,1469580151036142903,1487240062,1.0
4,-1007001694607905623,7270966256391553686,1485994342,1.584963


In [779]:
# Average of timeMax (the latest interaction timestamp of each personId/contentId pair).
interactions_df['timeMax'].mean()

1470605340.0403006

In [780]:
# Cut-off on timeMax: pairs strictly before it go to train, all others to test.
split_ts = 1475519545
interactions_train_df = interactions_df[interactions_df['timeMax'] < split_ts].copy()
# Use >= (not >) so a pair whose last interaction falls exactly on the cut-off
# is not silently dropped from both sets.
interactions_test_df = interactions_df[interactions_df['timeMax'] >= split_ts].copy()
interactions_train_df.shape

(29325, 4)

In [781]:
# Per-user list of contentIds from the training split, indexed by personId.
final_df = (
    interactions_train_df
    .groupby('personId')['contentId'].agg(lambda x: list(x))
    .to_frame('true_train')
)

# Per-user list of contentIds from the test split; assignment aligns on the
# personId index, so users without test rows get a missing value here.
final_df['true_test'] = (
    interactions_test_df
    .groupby('personId')['contentId'].agg(lambda x: list(x))
)

# Replace missing entries with an empty list. A type check is used instead of
# `x is np.NaN`: that alias was removed in NumPy 2.0, and an identity test only
# matches one particular NaN object.
final_df['true_test'] = [x if isinstance(x, list) else [] for x in final_df['true_test']]
final_df = final_df.reset_index()
final_df.sort_index(ascending=True).head()

Unnamed: 0,personId,true_train,true_test
0,-1007001694607905623,"[-5065077552540450930, -793729620925729327]","[-6623581327558800021, 1469580151036142903, 72..."
1,-1032019229384696495,"[-1006791494035379303, -1039912738963181810, -...","[-1415040208471067980, -2555801390963402198, -..."
2,-108842214936804958,"[-1196068832249300490, -133139342397538859, -1...","[-2780168264183400543, -3060116862184714437, -..."
3,-1130272294246983140,"[-1150591229250318592, -1196068832249300490, -...","[-1606980109000976010, -1663441888197894674, -..."
4,-1160159014793528221,"[-133139342397538859, -387651900461462767, 377...",[-3462051751080362224]


In [782]:
# Sum of eventLog per article over the training split, largest first.
rating_articles = (
    interactions_train_df
    .groupby('contentId')['eventLog']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
rating_articles.head()

Unnamed: 0,contentId,eventLog
0,-6783772548752091658,231.177195
1,-133139342397538859,228.024567
2,-8208801367848627943,189.937683
3,8224860111193157980,186.04468
4,7507067965574797372,179.094002


---

## Second pass: the same pipeline rebuilt from the raw CSV files

In [783]:
# Re-read the articles and keep only the 'CONTENT SHARED' rows in one chain.
articles_df1 = (
    pd.read_csv('data/shared_articles.csv')
    .loc[lambda frame: frame['eventType'] == 'CONTENT SHARED']
)
articles_df1.shape

(3047, 13)

In [784]:
# Re-read the interaction log and treat identifiers as opaque string keys.
interactions_df1 = pd.read_csv('data/users_interactions.csv')
interactions_df1['personId'] = interactions_df1['personId'].astype(str)
interactions_df1['contentId'] = interactions_df1['contentId'].astype(str)
# articles_df1 is a boolean-filtered slice, so create a new frame with
# .assign() rather than writing a column into it (avoids SettingWithCopyWarning).
articles_df1 = articles_df1.assign(contentId=articles_df1['contentId'].astype(str))

# Weight assigned to each interaction event type.
event_type_strength = {
    'VIEW': 1.0,
    'LIKE': 2.0,
    'BOOKMARK': 2.5,
    'FOLLOW': 3.0,
    'COMMENT CREATED': 4.0,
}
# Direct dictionary lookup: an event type missing from the mapping raises
# KeyError instead of silently producing a missing weight.
interactions_df1['eventStrength'] = interactions_df1['eventType'].apply(lambda x: event_type_strength[x])
interactions_df1['eventStrength'].mean()

1.2362885828078327

In [785]:
# Number of distinct contentIds each person interacted with: first collapse to
# one entry per (personId, contentId) pair, then count the entries per person.
users_interactions_count_df = (
    interactions_df1
    .groupby(['personId', 'contentId'])
    .size()
    .groupby(level='personId')
    .size()
)

# Single-column frame of the personIds with at least 5 distinct articles.
users_with_enough_interactions_df = (
    users_interactions_count_df
    .loc[users_interactions_count_df >= 5]
    .reset_index()
    [['personId']]
)

print(len(users_with_enough_interactions_df))

1140


In [786]:
# Keep only interactions made by the selected users. Series.isin on the
# explicit personId column replaces np.in1d, which is deprecated in NumPy 2.0
# and was being handed a whole DataFrame to flatten implicitly.
interactions_from_selected_users_df = interactions_df1.loc[
    interactions_df1['personId'].isin(users_with_enough_interactions_df['personId'])
]
print(interactions_from_selected_users_df.shape)

(69868, 9)


In [787]:
def smooth_user_preference(x):
    """Return the base-2 logarithm of ``1 + x``.

    Used to compress summed interaction strengths so that very active
    user/article pairs do not dominate.

    Args:
        x: A number greater than -1.

    Returns:
        float: ``log2(1 + x)``.

    Raises:
        ValueError: If ``1 + x`` is not positive.
    """
    # math.log2 is more accurate than math.log(value, 2), which divides two
    # natural logarithms and can be off by an ulp even for exact powers of two.
    return math.log2(1 + x)
    
# Log-smoothed total strength per (personId, contentId) pair, kept as a
# one-column frame indexed by the pair.
interactions_full_df = (
    interactions_from_selected_users_df
    .groupby(['personId', 'contentId'])['eventStrength']
    .sum()
    .apply(smooth_user_preference)
    .to_frame()
)
print(interactions_full_df.shape)

# Latest interaction timestamp of each pair; assignment aligns on the pair index.
interactions_full_df['last_timestamp'] = (
    interactions_from_selected_users_df
    .groupby(['personId', 'contentId'])['timestamp']
    .max()
)

interactions_full_df = interactions_full_df.reset_index()
interactions_full_df['last_timestamp'].mean()

(39106, 1)


1470605340.0403006

In [788]:
# Cut-off on last_timestamp: strictly earlier pairs form the training set,
# pairs at or after it form the test set.
split_ts = 1475519545

interactions_train_df = (
    interactions_full_df
    .loc[lambda frame: frame['last_timestamp'] < split_ts]
    .copy()
)
interactions_test_df = (
    interactions_full_df
    .loc[lambda frame: frame['last_timestamp'] >= split_ts]
    .copy()
)

print(interactions_train_df.shape[0])

29325


In [789]:
# Per-user list of contentIds from the training split, indexed by personId.
final_df = (
    interactions_train_df
    .groupby('personId')['contentId'].agg(lambda x: list(x))
    .to_frame('true_train')
)

# Per-user list of contentIds from the test split; assignment aligns on the
# personId index, so users without test rows get a missing value here.
final_df['true_test'] = (
    interactions_test_df
    .groupby('personId')['contentId'].agg(lambda x: list(x))
)

# Replace missing entries with an empty list. A type check is used instead of
# `x is np.NaN`: that alias was removed in NumPy 2.0, and an identity test only
# matches one particular NaN object.
final_df['true_test'] = [x if isinstance(x, list) else [] for x in final_df['true_test']]
final_df.sort_index(ascending=True).head()

Unnamed: 0_level_0,true_train,true_test
personId,Unnamed: 1_level_1,Unnamed: 2_level_1
-1007001694607905623,"[-5065077552540450930, -793729620925729327]","[-6623581327558800021, 1469580151036142903, 72..."
-1032019229384696495,"[-1006791494035379303, -1039912738963181810, -...","[-1415040208471067980, -2555801390963402198, -..."
-108842214936804958,"[-1196068832249300490, -133139342397538859, -1...","[-2780168264183400543, -3060116862184714437, -..."
-1130272294246983140,"[-1150591229250318592, -1196068832249300490, -...","[-1606980109000976010, -1663441888197894674, -..."
-1160159014793528221,"[-133139342397538859, -387651900461462767, 377...",[-3462051751080362224]


In [790]:
# contentIds ordered by their summed eventStrength in the training split,
# highest first, as a NumPy array.
popular = (
    interactions_train_df
    .groupby('contentId')['eventStrength']
    .sum()
    .reset_index()
    .sort_values(by='eventStrength', ascending=False)
    ['contentId']
    .values
)
popular[0]

'-6783772548752091658'

In [791]:
# Number of recommendations generated per user.
top_k = 10

# For every user: the top_k most popular articles that are not already in the
# user's training list. np.isin replaces np.in1d, which is deprecated in
# NumPy 2.0; the mask it produces is the same.
final_df['popular'] = final_df['true_train'].apply(
    lambda seen: popular[~np.isin(popular, seen)][:top_k]
)
def calc_precision(column, df=None, k=10):
    """Average per-user hit rate of the recommendations stored in ``column``.

    For each row the score is::

        |set(true_test) & set(row[column])| / min(len(true_test) + 0.001, k)

    The ``0.001`` term keeps the denominator non-zero for users with an empty
    ``true_test`` list (their score is 0).

    Args:
        column: Name of the column holding each user's recommended contentIds.
        df: Frame with a ``true_test`` column and ``column``. Defaults to the
            notebook-level ``final_df``.
        k: Cap on the denominator; defaults to 10, the previously hard-coded
            value.

    Returns:
        float: Mean of the per-row scores, or NaN when the frame has no rows.
    """
    frame = final_df if df is None else df
    if len(frame) == 0:
        # apply(axis=1) on an empty frame does not yield a scalar mean.
        return float('nan')

    def _row_score(row):
        # Hits: recommended items that appear in the user's test list.
        hits = len(set(row['true_test']).intersection(set(row[column])))
        return hits / min(len(row['true_test']) + 0.001, float(k))

    return frame.apply(_row_score, axis=1).mean()
# Score the recommendations stored in final_df['popular'].
calc_precision('popular')

0.006454207722621089