# Baseline - Most popular

In [8]:
import sys
sys.path.append('../..')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from utils.cuda_cluster import *
from utils.dataset import read_data, factorize_small_cardinality


import core.config as conf


In [3]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:35589  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 33.47 GB


## 1. Load data & preprocessing

In [4]:
# data_path = conf.raw_data_path + '*' # for all dataset
data_path = conf.raw_data_path + 'part-00175'
ori_df = read_data(data_path)

number of rows: 3033347


In [59]:
df = ori_df[['enaging_user_id', 'tweet_id', 'tweet_timestamp', 'reply_timestamp', 'retweet_timestamp','retweet_with_comment_timestamp', 'like_timestamp']]

In [60]:
df['is_reply'] = df['reply_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_retweet'] = df['retweet_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_comment'] = df['retweet_with_comment_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_like'] = df['like_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)

df['positive_cnt'] = df[['is_like', 'is_retweet', 'is_reply', 'is_comment']].sum(axis=1).astype(np.uint8)

df = df.drop('reply_timestamp', axis=1)
df = df.drop('retweet_timestamp', axis=1)
df = df.drop('retweet_with_comment_timestamp', axis=1)
df = df.drop('like_timestamp', axis=1)

In [61]:
df, idx_to_tweet = factorize_small_cardinality(df, 'tweet_id')

In [62]:
df = df.sort_values('tweet_timestamp').reset_index(drop=True)

In [63]:
df.head()

Unnamed: 0,enaging_user_id,tweet_id,tweet_timestamp,is_reply,is_retweet,is_comment,is_like,positive_cnt,tweet_id_encode
0,5A3E44D22DD65BA91B89A9DF7702FAC0,999FE9855F9D871EB681E165C53BD207,1612396800,0,0,0,0,0,1693166
1,F1F47A3206C362427B10F2CD35C04DF0,9FB702509229B111C50CE2D116E35048,1612396800,0,0,0,0,0,1760290
2,208C0F895C2C868C5D99642D6E201B62,5407073F17BCE73FD8F5F8C72537F19B,1612396800,0,0,0,0,0,926365
3,41193E811F3CF1915FB56BB976D85425,9B6E6F7013196B780FB034D92DC6D518,1612396800,0,0,0,1,1,1713181
4,F9B8B3C7FA390E9D8D5475F07B6D6F73,A3A2E7B7E2F1ADFA87506C8213CD754E,1612396800,0,0,0,1,1,1803744


### train, validation data split

In [195]:
train_df, val_df = train_test_split(df.compute(), test_size=0.5, random_state=777, shuffle=False)

## 2. Get popular tweets in training data
- recommend tweets received more than 'threshold' positive engagements

In [252]:
threshold = 5

In [253]:
engagements = train_df.groupby(['tweet_id_encode'])[['is_reply',	'is_retweet',	'is_comment',	'is_like',	'positive_cnt']].sum()
engagements.describe().astype(np.int32)

Unnamed: 0,is_reply,is_retweet,is_comment,is_like,positive_cnt
count,1411541,1411541,1411541,1411541,1411541
mean,0,0,0,0,0
std,0,0,0,0,0
min,0,0,0,0,0
25%,0,0,0,0,0
50%,0,0,0,0,0
75%,0,0,0,1,1
max,7,25,9,73,110


In [254]:
# sort by positive_cnt

positive_engagements = engagements.sort_values('positive_cnt', ascending=False)
positive_engagements = positive_engagements.reset_index()
recommend_df = positive_engagements[positive_engagements['positive_cnt'] >= threshold].to_pandas()
recommend_df['tweet_id'] = recommend_df['tweet_id_encode'].apply(lambda x: idx_to_tweet[x])
recommend_list = recommend_df['tweet_id_encode'].to_numpy()

print(len(recommend_list), recommend_list[:5])

2584 [1922240 2455226 2083101 1834183  960782]


## 3. Predict each engagement

In [255]:
predict_df = val_df[['enaging_user_id', 'tweet_id','tweet_id_encode']].to_pandas()
predict_df['predict_positive'] = 0

In [256]:
predict_df[predict_df['tweet_id'].isin(recommend_list) == True]

Unnamed: 0,enaging_user_id,tweet_id,tweet_id_encode,predict_positive
