# Baseline - Extract most popular tweet for each langugage

In [3]:
import sys
sys.path.append('../..')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from utils.cuda_cluster import *
from utils.dataset import read_data, factorize_small_cardinality
from utils.evaluate import calculate_ctr, compute_rce, average_precision_score

import core.config as conf


In [4]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:42411  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 33.47 GB


## 1. Load data & preprocessing

In [5]:
# data_path = conf.raw_data_path + '*' # for all dataset
data_path = conf.raw_data_path + 'part-00175'
ori_df = read_data(data_path)

number of rows: 3033347


In [6]:
df = ori_df[['enaging_user_id', 'tweet_timestamp', 'language', 'tweet_id', 'reply_timestamp', 'retweet_timestamp','retweet_with_comment_timestamp', 'like_timestamp']]

In [7]:
df['is_reply'] = df['reply_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_retweet'] = df['retweet_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_comment'] = df['retweet_with_comment_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_like'] = df['like_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)

df['positive_cnt'] = df[['is_like', 'is_retweet', 'is_reply', 'is_comment']].sum(axis=1).astype(np.uint8)

df = df.drop('reply_timestamp', axis=1)
df = df.drop('retweet_timestamp', axis=1)
df = df.drop('retweet_with_comment_timestamp', axis=1)
df = df.drop('like_timestamp', axis=1)

In [8]:
df, idx_to_tweet = factorize_small_cardinality(df, 'tweet_id')
df, idx_to_language = factorize_small_cardinality(df, 'language')

In [9]:
df.head()

Unnamed: 0_level_0,enaging_user_id,tweet_timestamp,language,tweet_id,is_reply,is_retweet,is_comment,is_like,positive_cnt,tweet_id_encode,language_encode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,411C3FA9B6AB5CA95192D875CDC22823,1612993854,B8B04128918BBF54E2E178BFF1ABA833,C8F345CF8BC7A86E34572072ECFBBEC4,0,0,0,1,1,2214622,48
2,E764026AB0E38A5C2FF19921D73B6C18,1612886900,9FCF19233EAD65EA6E32C2E6DC03A444,C1E31636C343B780BA776E4B73147028,0,0,0,0,0,2136815,43
3,455134BAAD3EAC4093393EC233FBAEF9,1614019237,B0FA488F2911701DD8EC5B1EA5E322D8,B436C84E80C2430BA9DE41FDF04C73BF,1,0,0,0,1,1986297,46
4,92D70497B86CAFBA5C51E331084462AD,1612779567,1F73BB863A39DB62B4A55B7E558DB1E8,033FFA42C8AD502057AE96C8B4B812BE,0,0,0,1,1,35685,5
5,DC1C8A9412B9E266A4C3D4CAF6DB06CB,1613822114,E7F038DE3EAD397AEC9193686C911677,84F2E902BA3CF3B34B8D056F6F78D488,0,0,0,0,0,1465500,61


### train, valid data split

In [10]:
train_df, val_df = train_test_split(df.compute(), test_size=0.5, random_state=777, shuffle=False)

### Group by language

In [11]:
engagements = train_df.groupby(['language_encode', 'tweet_id_encode'])[['is_reply',	'is_retweet',	'is_comment',	'is_like',	'positive_cnt']].sum()
engagements = engagements.reset_index()
engagements = engagements.set_index('language_encode', drop=True)

# engagements.loc[23].head()

In [12]:
n_languages = 66
language_engagements = [[] for _ in range(n_languages)]

for i in tqdm(range(n_languages)):
    try:
        tmp = engagements.loc[i]
        tmp = tmp.sort_values('positive_cnt', ascending=False).reset_index()
        language_engagements[i] = tmp
    except:
        print('There is no data on language ', str(i))

 74%|███████▍  | 49/66 [00:00<00:00, 87.91it/s]There is no data on language  26
There is no data on language  34
There is no data on language  38
100%|██████████| 66/66 [00:00<00:00, 95.39it/s]


In [13]:
print('# the largest value among positive engagements for each language')

for i in range(n_languages):
    try:
        print(f'language {i} : ', language_engagements[i].loc[0]['positive_cnt'])
    except:
        print('There is no data on language ', str(i))


# the largest value among positive engagements for each language
language 0 :  5
language 1 :  1
language 2 :  3
language 3 :  4
language 4 :  4
language 5 :  15
language 6 :  6
language 7 :  5
language 8 :  14
language 9 :  7
language 10 :  53
language 11 :  4
language 12 :  2
language 13 :  2
language 14 :  2
language 15 :  3
language 16 :  4
language 17 :  2
language 18 :  3
language 19 :  47
language 20 :  3
language 21 :  5
language 22 :  4
language 23 :  4
language 24 :  2
language 25 :  3
There is no data on language  26
language 27 :  2
language 28 :  3
language 29 :  5
language 30 :  1
language 31 :  5
language 32 :  6
language 33 :  8
There is no data on language  34
language 35 :  4
language 36 :  54
language 37 :  3
There is no data on language  38
language 39 :  2
language 40 :  11
language 41 :  6
language 42 :  1
language 43 :  11
language 44 :  2
language 45 :  15
language 46 :  12
language 47 :  9
language 48 :  17
language 49 :  3
language 50 :  2
language 51 :  5
lan

In [14]:
print('# the number of rows for each language')

for i in range(n_languages):
    try:
        print(f'language {i}')
        print(' - all rows : ', len(language_engagements[i]),  ',  positive rows : ', len(language_engagements[i].loc[language_engagements[i]['positive_cnt'] != 0]))
    except:
        print('There is no data on language ', str(i))
    


# the number of rows for each language
language 0
 - all rows :  4116 ,  positive rows :  2227
language 1
 - all rows :  26 ,  positive rows :  14
language 2
 - all rows :  502 ,  positive rows :  344
language 3
 - all rows :  639 ,  positive rows :  348
language 4
 - all rows :  1635 ,  positive rows :  866
language 5
 - all rows :  63395 ,  positive rows :  35008
language 6
 - all rows :  3534 ,  positive rows :  1644
language 7
 - all rows :  8762 ,  positive rows :  4483
language 8
 - all rows :  10224 ,  positive rows :  5483
language 9
 - all rows :  13310 ,  positive rows :  7369
language 10
 - all rows :  93032 ,  positive rows :  46947
language 11
 - all rows :  835 ,  positive rows :  445
language 12
 - all rows :  45 ,  positive rows :  27
language 13
 - all rows :  309 ,  positive rows :  156
language 14
 - all rows :  378 ,  positive rows :  189
language 15
 - all rows :  70 ,  positive rows :  40
language 16
 - all rows :  744 ,  positive rows :  353
language 17
 - all ro

## 2. Get the popular tweets for each language

In [15]:
topn = 1000
recommend_list_like = [[] for _ in range(n_languages)]
recommend_list_reply = [[] for _ in range(n_languages)]
recommend_list_comment = [[] for _ in range(n_languages)]
recommend_list_retweet = [[] for _ in range(n_languages)]

In [16]:
for i in tqdm(range(n_languages)):
    try:
        recommend_list_like[i] = language_engagements[i].sort_values('is_like', ascending=False)[:topn-1]['tweet_id_encode'].to_array()
        recommend_list_reply[i] = language_engagements[i].sort_values('is_reply', ascending=False)[:topn-1]['tweet_id_encode'].to_array()
        recommend_list_comment[i] = language_engagements[i].sort_values('is_comment', ascending=False)[:topn-1]['tweet_id_encode'].to_array()
        recommend_list_retweet[i] = language_engagements[i].sort_values('is_retweet', ascending=False)[:topn-1]['tweet_id_encode'].to_array()
        
    except:
        print('There is no data on language ', str(i))

 55%|█████▍    | 36/66 [00:00<00:00, 86.54it/s]There is no data on language  26
There is no data on language  34
There is no data on language  38
100%|██████████| 66/66 [00:00<00:00, 81.65it/s]


## 3. Get user's language

In [17]:
predict_df = val_df[['enaging_user_id', 'tweet_id_encode', 'language_encode', 'is_like', 'is_reply', 'is_comment', 'is_retweet']]

## 4.  Predict engagements for each user


### 1) Like

In [18]:
predict_df['predict_like'] = 0

for i in tqdm(range(n_languages)):
    try:
        tmp = predict_df[predict_df['language_encode'] == i]
        positive = tmp[tmp['tweet_id_encode'].isin(recommend_list_like[i]) == True]        
        indexes = positive.index.to_array()
        predict_df.loc[indexes, 'predict_like'] = 1
    except:
        print('There is no data on language ', str(i))


100%|██████████| 66/66 [00:01<00:00, 54.53it/s]


### 2) Reply

In [19]:
predict_df['predict_reply'] = 0

for i in tqdm(range(n_languages)):
    try:
        tmp = predict_df[predict_df['language_encode'] == i]
        positive = tmp[tmp['tweet_id_encode'].isin(recommend_list_reply[i]) == True]        
        indexes = positive.index.to_array()
        predict_df.loc[indexes, 'predict_reply'] = 1
    except:
        print('There is no data on language ', str(i))


100%|██████████| 66/66 [00:01<00:00, 53.75it/s]


### 3) Comment

In [20]:
predict_df['predict_comment'] = 0

for i in tqdm(range(n_languages)):
    try:
        tmp = predict_df[predict_df['language_encode'] == i]
        positive = tmp[tmp['tweet_id_encode'].isin(recommend_list_comment[i]) == True]        
        indexes = positive.index.to_array()
        predict_df.loc[indexes, 'predict_comment'] = 1
    except:
        print('There is no data on language ', str(i))


100%|██████████| 66/66 [00:01<00:00, 53.25it/s]


### 4) Retweet

In [21]:
predict_df['predict_retweet'] = 0

for i in tqdm(range(n_languages)):
    try:
        tmp = predict_df[predict_df['language_encode'] == i]
        positive = tmp[tmp['tweet_id_encode'].isin(recommend_list_retweet[i]) == True]        
        indexes = positive.index.to_array()
        predict_df.loc[indexes, 'predict_retweet'] = 1
    except:
        print('There is no data on language ', str(i))


100%|██████████| 66/66 [00:01<00:00, 52.93it/s]


In [22]:
predict_df.loc[2431037]

Unnamed: 0,enaging_user_id,tweet_id_encode,language_encode,is_like,is_reply,is_comment,is_retweet,predict_like,predict_reply,predict_comment,predict_retweet
2431037,8C01B8883F6B44F1C492AA9B0D433FBD,1227935,0,0,0,0,1,1,1,1,1


## 5. Evaluate

In [23]:
rce_like = compute_rce(predict_df['predict_like'].to_array(), predict_df['is_like'].to_array())
rce_reply = compute_rce(predict_df['predict_reply'].to_array(), predict_df['is_reply'].to_array())
rce_comment = compute_rce(predict_df['predict_comment'].to_array(), predict_df['is_comment'].to_array())
rce_rewteet = compute_rce(predict_df['predict_retweet'].to_array(), predict_df['is_retweet'].to_array())
print(rce_like, rce_reply, rce_comment, rce_rewteet)

-1923.629817245273 -775.0775126143292 -973.8460577441801 -977.3809442582101


In [24]:
ap_like = average_precision_score(predict_df['predict_like'].to_array(), predict_df['is_like'].to_array())
ap_reply = average_precision_score(predict_df['predict_reply'].to_array(), predict_df['is_reply'].to_array())
ap_comment = average_precision_score(predict_df['predict_comment'].to_array(), predict_df['is_comment'].to_array())
ap_retweet = average_precision_score(predict_df['predict_retweet'].to_array(), predict_df['is_retweet'].to_array())
print(ap_like, ap_reply, ap_comment, ap_retweet)

0.01823127326780151 0.004463772316680876 0.006216952844698359 0.008158198645698103
