# Baseline - Extract the most popular tweets for each language

In [1]:
import sys
sys.path.append('../..')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

from utils.cuda_cluster import *
from utils.dataset import read_data, factorize_small_cardinality

import core.config as conf


In [2]:
# Display the cluster client summary (scheduler address, dashboard URL, workers).
# NOTE(review): `client` is not defined in this notebook; presumably it is
# exported by the star import from utils.cuda_cluster - confirm.
client

0,1
Client  Scheduler: tcp://127.0.0.1:46507  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 33.47 GB


## 1. Load data & preprocessing

In [3]:
# Load a single shard of the raw data; use the commented-out glob pattern
# below to read the whole dataset instead.
# data_path = conf.raw_data_path + '*' # for all dataset
shard_name = 'part-00175'
data_path = conf.raw_data_path + shard_name
ori_df = read_data(data_path)

number of rows: 3033347


In [4]:
# Columns needed for the popularity baseline: the engaging user, the tweet and
# its language, plus the four raw engagement timestamps.
selected_columns = [
    'enaging_user_id',
    'language',
    'tweet_id',
    'reply_timestamp',
    'retweet_timestamp',
    'retweet_with_comment_timestamp',
    'like_timestamp',
]
df = ori_df[selected_columns]

In [5]:
# Raw timestamp column -> name of the binary flag derived from it.
# Insertion order matters: it fixes the order in which the flag columns are
# added and the timestamp columns are dropped.
timestamp_to_flag = {
    'reply_timestamp': 'is_reply',
    'retweet_timestamp': 'is_retweet',
    'retweet_with_comment_timestamp': 'is_comment',
    'like_timestamp': 'is_like',
}

# Each flag is 1 when the timestamp is > 0 and 0 otherwise. `.compute()`
# materializes the column before the element-wise map is applied.
for timestamp_col, flag_col in timestamp_to_flag.items():
    df[flag_col] = df[timestamp_col].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)

# is_positive: how many of the four engagement flags are set on the row.
df['is_positive'] = df[['is_like', 'is_retweet', 'is_reply', 'is_comment']].sum(axis=1).astype(np.uint8)
# is_negative: 1 when none of the four flags is set, else 0.
df['is_negative'] = df['is_positive'].compute().applymap(lambda x: 1 if x == 0 else 0).astype(np.int32)

# The raw timestamps are not used once the flags exist.
for timestamp_col in timestamp_to_flag:
    df = df.drop(timestamp_col, axis=1)

In [6]:
# Encode the string ids as integer codes. Judging by the preview of `df` shown
# right after this cell, each call adds a `<column>_encode` column.
# `idx_to_tweet` is indexed by code further down to recover the original tweet id.
# NOTE(review): factorize_small_cardinality comes from utils.dataset; its exact
# contract (return types, code ordering) is not visible here - confirm.
df, idx_to_tweet = factorize_small_cardinality(df, 'tweet_id')
df, idx_to_language = factorize_small_cardinality(df, 'language')

In [7]:
# Preview the first rows to check the derived flag and *_encode columns.
df.head()

Unnamed: 0_level_0,enaging_user_id,language,tweet_id,is_reply,is_retweet,is_comment,is_like,is_positive,is_negative,tweet_id_encode,language_encode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,411C3FA9B6AB5CA95192D875CDC22823,B8B04128918BBF54E2E178BFF1ABA833,C8F345CF8BC7A86E34572072ECFBBEC4,0,0,0,1,1,0,2214622,48
2,E764026AB0E38A5C2FF19921D73B6C18,9FCF19233EAD65EA6E32C2E6DC03A444,C1E31636C343B780BA776E4B73147028,0,0,0,0,0,1,2136815,43
3,455134BAAD3EAC4093393EC233FBAEF9,B0FA488F2911701DD8EC5B1EA5E322D8,B436C84E80C2430BA9DE41FDF04C73BF,1,0,0,0,1,0,1986297,46
4,92D70497B86CAFBA5C51E331084462AD,1F73BB863A39DB62B4A55B7E558DB1E8,033FFA42C8AD502057AE96C8B4B812BE,0,0,0,1,1,0,35685,5
5,DC1C8A9412B9E266A4C3D4CAF6DB06CB,E7F038DE3EAD397AEC9193686C911677,84F2E902BA3CF3B34B8D056F6F78D488,0,0,0,0,0,1,1465500,61


In [8]:
# Sum every engagement counter per (language, tweet) pair, then index the
# result by language so one language's tweets can be selected with .loc.
group_keys = ['language_encode', 'tweet_id_encode']
counter_columns = ['is_reply', 'is_retweet', 'is_comment', 'is_like', 'is_positive', 'is_negative']

engagements = (
    df.groupby(group_keys)[counter_columns]
    .sum()
    .reset_index()
    .set_index('language_encode', drop=True)
)

# engagements.loc[23].head()

In [9]:
# Number of language codes to iterate over (codes 0 .. n_languages - 1).
# NOTE(review): hard-coded; presumably equals the number of entries in
# idx_to_language for this dataset - confirm before running on other shards.
n_languages = 66
# Slot i holds the ranked per-tweet engagement table of language i, or stays
# as the [] placeholder when that table could not be built.
language_engagements = [[] for _ in range(n_languages)]

for i in tqdm(range(n_languages)):
    try:
        tmp = engagements.loc[i]
        # Materialize, rank tweets by positive engagements (highest first) and
        # switch to a positional 0..n-1 index so row 0 is the most popular tweet.
        tmp = tmp.compute().sort_values('is_positive', ascending=False).reset_index()
        language_engagements[i] = tmp
    except Exception as err:
        # `Exception` instead of a bare `except:` keeps the loop interruptible
        # (KeyboardInterrupt / SystemExit propagate); the cause is printed so a
        # real failure is not mistaken for a missing language.
        print('There is no data on language ', str(i), f'({type(err).__name__}: {err})')

100%|██████████| 66/66 [00:23<00:00,  2.82it/s]


In [10]:
print('# the largest value among positive engagements for each language')

for i in range(n_languages):
    ranked = language_engagements[i]
    # A language without data is either the [] placeholder (no .loc attribute)
    # or an empty frame (no row labelled 0); report it instead of crashing.
    if len(ranked) == 0:
        print(f'language {i} : ', 'no data')
        continue
    # Rows are sorted by is_positive descending, so row 0 carries the maximum.
    print(f'language {i} : ', ranked.loc[0]['is_positive'])


# the largest value among positive engagements for each language
language 0 :  7
language 1 :  1
language 2 :  4
language 3 :  9
language 4 :  5
language 5 :  27
language 6 :  14
language 7 :  11
language 8 :  22
language 9 :  13
language 10 :  110
language 11 :  6
language 12 :  2
language 13 :  6
language 14 :  4
language 15 :  3
language 16 :  12
language 17 :  2
language 18 :  3
language 19 :  90
language 20 :  5
language 21 :  9
language 22 :  7
language 23 :  7
language 24 :  2
language 25 :  5
language 26 :  0
language 27 :  3
language 28 :  3
language 29 :  7
language 30 :  1
language 31 :  6
language 32 :  9
language 33 :  12
language 34 :  0
language 35 :  9
language 36 :  98
language 37 :  5
language 38 :  0
language 39 :  2
language 40 :  15
language 41 :  13
language 42 :  1
language 43 :  27
language 44 :  2
language 45 :  23
language 46 :  28
language 47 :  12
language 48 :  39
language 49 :  6
language 50 :  3
language 51 :  8
language 52 :  2
language 53 :  1
language 

In [11]:
print('# the number of rows for each language')

for i in range(n_languages):
    per_tweet = language_engagements[i]
    total_rows = len(per_tweet)
    if total_rows == 0:
        # Empty slot ([] placeholder or empty frame): nothing to filter, and a
        # list placeholder has no .loc to filter with.
        positive_rows = 0
    else:
        # NOTE(review): this counts tweets whose summed is_negative is 0, i.e.
        # tweets with no non-engaged impression; tweets that have both positive
        # and negative rows are excluded - confirm this is the intended meaning
        # of "positive rows".
        positive_rows = len(per_tweet.loc[per_tweet['is_negative'] == 0])
    print(f'language {i}')
    print(' - all rows : ', total_rows,  ',  positive rows : ', positive_rows)


# the number of rows for each language
language 0
 - all rows :  7820 ,  positive rows :  4195
language 1
 - all rows :  38 ,  positive rows :  21
language 2
 - all rows :  998 ,  positive rows :  678
language 3
 - all rows :  1208 ,  positive rows :  649
language 4
 - all rows :  3303 ,  positive rows :  1728
language 5
 - all rows :  123095 ,  positive rows :  66302
language 6
 - all rows :  6810 ,  positive rows :  3163
language 7
 - all rows :  17362 ,  positive rows :  8795
language 8
 - all rows :  20096 ,  positive rows :  10624
language 9
 - all rows :  26365 ,  positive rows :  14440
language 10
 - all rows :  180123 ,  positive rows :  87963
language 11
 - all rows :  1607 ,  positive rows :  842
language 12
 - all rows :  92 ,  positive rows :  57
language 13
 - all rows :  660 ,  positive rows :  322
language 14
 - all rows :  761 ,  positive rows :  370
language 15
 - all rows :  142 ,  positive rows :  85
language 16
 - all rows :  1374 ,  positive rows :  616
language 17

## 2. Get the popular tweets for each language

In [16]:
# How many of the highest-ranked tweets to keep for each language.
topn = 200
# One slot per language code, filled with that language's recommendations later.
recommend_list_by_language = [list() for _slot in range(n_languages)]

In [17]:
for i in range(n_languages):
    ranked = language_engagements[i]
    # Languages whose table could not be built are still the [] placeholder,
    # which has no .loc; leave their recommendation slot empty instead of crashing.
    if isinstance(ranked, list):
        continue
    # Label slices are inclusive and the index is positional (0..n-1), so
    # 0 .. topn-1 keeps the `topn` most popular tweets.
    recommend_list_by_language[i] = ranked.loc[:topn-1]

In [18]:
# tweet_id_encode => real tweet_id
for i in range(n_languages):
    tmp = recommend_list_by_language[i][['tweet_id_encode']].to_pandas()
    tmp['tweet_id'] = tmp['tweet_id_encode'].apply(lambda x: idx_to_tweet[x])
    recommend_list_by_language[i]['tweet_id'] = tmp['tweet_id']

In [19]:
# Spot-check the recommendation table of one language (code 22).
recommend_list_by_language[22]

Unnamed: 0,language_encode,tweet_id_encode,is_reply,is_retweet,is_comment,is_like,is_positive,is_negative,tweet_id
0,22,2575606,0,2,0,5,7,2,E9BD6189EA764CB96A4875C3B613C198
1,22,2560206,0,0,0,5,5,2,E8555B4FDAAE52E504D7EC17EECF4D1B
2,22,798,0,0,0,5,5,1,0012C486BAFE57C59953F9633D7881B5
3,22,2545866,0,0,0,3,3,0,E70999C8DB54BC7A94CB9872530ADBB3
4,22,2292236,0,1,0,2,3,1,D003AD3DF74E2E001B80E3A1584818AF
...,...,...,...,...,...,...,...,...,...
195,22,886076,0,0,0,1,1,0,50609560E326D5627A57D2D8445F93E6
196,22,484416,0,0,0,1,1,0,2BF6A6CCD6335115F6C2923DE01DF8FC
197,22,552421,0,0,0,1,1,0,321D9CBA3EFB36D9804AD40FE61E1ABA
198,22,649875,0,0,0,1,1,0,3AF37332E120516B424A85608BB972B5


## 3. Get user's language

## 4.  Recommend tweets to each user
- 1) Remove the tweets that the user has already read.
- 2) Fall back to the most popular tweets overall (regardless of language) if there are not enough tweets to recommend.