## Baseline - Predict engagements by popular creator's tweet

In [1]:
import sys
sys.path.append('../..')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import feature_column as fc

from utils.cuda_cluster import *
from utils.dataset import read_data, factorize_small_cardinality, df_to_tfdataset
from utils.evaluate import calculate_ctr, compute_rce, average_precision_score

import core.config as conf


In [2]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:33513  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 33.47 GB


## 1. Load data & preprocessing

In [3]:
# data_path = conf.raw_data_path + '*' # for all dataset
data_path = conf.raw_data_path + 'part-00175'
ori_df = read_data(data_path)

number of rows: 3033347


In [4]:
ori_df.columns

Index(['text_ tokens', 'hashtags', 'tweet_id', 'present_media',
       'present_links', 'present_domains', 'tweet_type', 'language',
       'tweet_timestamp', 'engaged_with_user_id',
       'engaged_with_user_follower_count', 'engaged_with_user_following_count',
       'engaged_with_user_is_verified', 'engaged_with_user_account_creation',
       'enaging_user_id', 'enaging_user_follower_count',
       'enaging_user_following_count', 'enaging_user_is_verified',
       'enaging_user_account_creation', 'engagee_follows_engager',
       'reply_timestamp', 'retweet_timestamp',
       'retweet_with_comment_timestamp', 'like_timestamp'],
      dtype='object')

In [5]:
df = ori_df[['enaging_user_id', 'tweet_id', 'language', 'engaged_with_user_follower_count', 'reply_timestamp', 'retweet_timestamp','retweet_with_comment_timestamp', 'like_timestamp']]

In [6]:
df['is_reply'] = df['reply_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_retweet'] = df['retweet_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_comment'] = df['retweet_with_comment_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_like'] = df['like_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)

df['positive_cnt'] = df[['is_like', 'is_retweet', 'is_reply', 'is_comment']].sum(axis=1).astype(np.uint8)

df = df.drop('reply_timestamp', axis=1)
df = df.drop('retweet_timestamp', axis=1)
df = df.drop('retweet_with_comment_timestamp', axis=1)
df = df.drop('like_timestamp', axis=1)

In [7]:
# df, idx_to_tweet = factorize_small_cardinality(df, 'tweet_id')
df, idx_to_language = factorize_small_cardinality(df, 'language') # how language encoding??

In [8]:
df.head()

Unnamed: 0_level_0,enaging_user_id,tweet_id,language,engaged_with_user_follower_count,is_reply,is_retweet,is_comment,is_like,positive_cnt,language_encode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,411C3FA9B6AB5CA95192D875CDC22823,C8F345CF8BC7A86E34572072ECFBBEC4,B8B04128918BBF54E2E178BFF1ABA833,4753,0,0,0,1,1,48
2,E764026AB0E38A5C2FF19921D73B6C18,C1E31636C343B780BA776E4B73147028,9FCF19233EAD65EA6E32C2E6DC03A444,110643,0,0,0,0,0,43
3,455134BAAD3EAC4093393EC233FBAEF9,B436C84E80C2430BA9DE41FDF04C73BF,B0FA488F2911701DD8EC5B1EA5E322D8,4480,1,0,0,0,1,46
4,92D70497B86CAFBA5C51E331084462AD,033FFA42C8AD502057AE96C8B4B812BE,1F73BB863A39DB62B4A55B7E558DB1E8,461,0,0,0,1,1,5
5,DC1C8A9412B9E266A4C3D4CAF6DB06CB,84F2E902BA3CF3B34B8D056F6F78D488,E7F038DE3EAD397AEC9193686C911677,1308,0,0,0,0,0,61


### train, valid data split

## 2. Feature Engineering

In [33]:
real = {
    'engaged_with_user_follower_count' : fc.numeric_column('engaged_with_user_follower_count', normalizer_fn=lambda x: (x - 3.0) / 4.2)
}

sparse = {
    'language_encode' : fc.categorical_column_with_hash_bucket('language_encode', hash_bucket_size=66),
}


inputs = {
    colname : tf.keras.layers.Input(name=colname, shape=(), dtype='float32') \
          for colname in real.keys()
}
inputs.update({
    colname : tf.keras.layers.Input(name=colname, shape=( ), dtype='string') \
          for colname in sparse.keys()
})


In [34]:
inputs

{'engaged_with_user_follower_count': <KerasTensor: shape=(None,) dtype=float32 (created by layer 'engaged_with_user_follower_count')>,
 'language_encode': <KerasTensor: shape=(None,) dtype=string (created by layer 'language_encode')>}

## 3. Modeling (using wide model)

In [48]:
wide = tf.keras.layers.DenseFeatures(real.values())(inputs)
output = tf.keras.layers.Dense(1, activation='sigmoid')(wide)
model = tf.keras.Model(inputs, output)
optimizer = tf.keras.optimizers.SGD(learning_rate=1)
model.compile(optimizer=optimizer,
                  loss='mean_squared_error', #binary_crossentropy
                  metrics=['accuracy'])

In [49]:
model.summary()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
engaged_with_user_follower_coun [(None,)]            0                                            
__________________________________________________________________________________________________
language_encode (InputLayer)    [(None,)]            0                                            
__________________________________________________________________________________________________
dense_features_7 (DenseFeatures (None, 1)            0           engaged_with_user_follower_count[
                                                                 language_encode[0][0]            
__________________________________________________________________________________________________
dense_7 (Dense)                 (None, 1)            2           dense_features_7[0][0]     

In [55]:
train_df, test_df = train_test_split(df.compute(), test_size=0.2, random_state=777, shuffle=False)
# train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=777, shuffle=False)

batch_size = 128
train_ds = df_to_tfdataset(train_df, 'is_like', batch_size=batch_size) # for like engagement
# val_ds = df_to_tfdataset(val_df, 'is_like', batch_size=batch_size)
test_ds = df_to_tfdataset(test_df, 'is_like', batch_size=batch_size)

## 4. Training

In [53]:
train_ds

<BatchDataset shapes: ({enaging_user_id: (None,), tweet_id: (None,), language: (None,), engaged_with_user_follower_count: (None,), is_reply: (None,), is_retweet: (None,), is_comment: (None,), positive_cnt: (None,), language_encode: (None,)}, (None,)), types: ({enaging_user_id: tf.string, tweet_id: tf.string, language: tf.string, engaged_with_user_follower_count: tf.int32, is_reply: tf.int32, is_retweet: tf.int32, is_comment: tf.int32, positive_cnt: tf.uint8, language_encode: tf.int64}, tf.int32)>

In [51]:
history = model.fit(train_ds,
            validation_split=0.1,
            batch_size=batch_size
            epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


KeyboardInterrupt: 

In [None]:
model.evaluate(test_ds)

## 5. Predict engagements

In [141]:
predict_like = model.predict(test_ds)

In [164]:
predict_like = np.reshape(predict_like, (-1))

In [None]:
test_df['predict_like'] = predict_like

In [169]:
rce_like = compute_rce(test_df['predict_like'].to_array(), test_df['is_like'].to_array())


In [170]:
rce_like

-1931.390374453516