In [1]:
import sys
sys.path.append('../..')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import feature_column as fc
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn import preprocessing

from utils.cuda_cluster import *
from utils.dataset import read_data, factorize_small_cardinality_with_index, df_to_tfdataset, split_join, get_media_index
from utils.evaluate import calculate_ctr, compute_rce, average_precision_score
import core.config as conf

In [2]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:35423  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 33.47 GB


## Load data

In [3]:
# data_path = conf.raw_data_path + '*' # for all dataset
data_path = conf.raw_data_path + 'part-00000'
ori_df = read_data(data_path)

In [4]:
ori_df

Unnamed: 0_level_0,text_ tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,enaging_user_id,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
npartitions=16,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,object,object,object,object,object,object,object,object,int32,object,int32,int32,bool,int32,object,int32,int32,bool,int32,bool,int32,int32,int32,int32
273328,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2983644,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3033902,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


- media target encoding(photo, video, gif)
- tweet_type
- language
- tweet_timestamp
- engaged_with_user_id
- engaged_with_user_follower_count
- engaging_user_id
- engaging_user_following_count
- engagee_follows_engager
- 'reply_timestamp', 'retweet_timestamp','retweet_with_comment_timestamp', 'like_timestamp'

In [5]:
df = ori_df[['present_media', 'tweet_type', #'language', 'tweet_timestamp', 
             'engaged_with_user_follower_count', 
             'enaging_user_following_count', 'engagee_follows_engager','reply_timestamp', 'retweet_timestamp','retweet_with_comment_timestamp', 'like_timestamp']]

## Preprocessing

In [6]:
df['is_reply'] = df['reply_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_retweet'] = df['retweet_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_comment'] = df['retweet_with_comment_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_like'] = df['like_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)

In [7]:
df = df.drop('reply_timestamp', axis=1)
df = df.drop('retweet_timestamp', axis=1)
df = df.drop('retweet_with_comment_timestamp', axis=1)
df = df.drop('like_timestamp', axis=1)

In [8]:
df.head()

Unnamed: 0_level_0,present_media,tweet_type,engaged_with_user_follower_count,enaging_user_following_count,engagee_follows_engager,is_reply,is_retweet,is_comment,is_like
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,,Retweet,2473,339,False,0,0,0,0
2,,TopLevel,4418640,1190,False,0,0,0,0
3,,TopLevel,219715,1473,False,0,0,0,1
4,,Retweet,2388283,200,False,0,0,0,0
5,Photo,TopLevel,414,379,True,0,0,0,1


### Media Target Encoding

In [9]:
df['present_media'] = df['present_media'].fillna( '' )

In [10]:
df,media_index, _ = factorize_small_cardinality_with_index(df, 'present_media', 'media_type')

In [11]:
media_index = get_media_index(media_index)

In [12]:
media_index = media_index['number_of_media'].to_frame()

In [13]:
media_index

Unnamed: 0,number_of_media
0,0
1,1
2,2
3,3
4,2
5,2
6,1
7,2
8,2
9,3


In [14]:
media_index = cudf.DataFrame(media_index)
media_index = dask_cudf.from_cudf(media_index, npartitions=2)

In [15]:
df = df.merge(media_index, left_on="media_type", right_index = True, how='left')

In [16]:
df = df.drop('present_media', axis = 1)
df = df.drop('media_type', axis = 1)

### Language Encoding

In [17]:
#df, idx_to_language, _ = factorize_small_cardinality_with_index(df, 'language', 'language_encode') # how language encoding??

In [18]:
#df = df.drop('language', axis = 1)

In [19]:
#df.head()

### Tweet Type Encoding

In [20]:
df, _= factorize_small_cardinality_with_index(df, 'tweet_type', 'tweet_type_encode') # how language encoding??

In [21]:
df = df.drop('tweet_type', axis = 1)

In [22]:
df.head()

Unnamed: 0,engaged_with_user_follower_count,enaging_user_following_count,engagee_follows_engager,is_reply,is_retweet,is_comment,is_like,number_of_media,tweet_type_encode
0,3383,1290,True,0,0,0,1,0,2
1,402,222,True,0,0,0,1,0,2
2,5696,2599,True,0,0,0,1,0,2
3,2581,351,False,0,0,0,0,0,1
4,127178,137,False,0,0,0,1,0,2


In [23]:
df = df.astype(np.int64)

## Modeling

### Like

In [24]:
df_y = df[['is_reply', 'is_retweet', 'is_comment', 'is_like']]
df_x = df.drop(['is_reply', 'is_retweet', 'is_comment', 'is_like'], axis = 1)

#### Scaling

In [26]:
standard_scaler = preprocessing.StandardScaler()
standard_scaler.fit(df_x.compute().as_matrix())

StandardScaler()

In [27]:
ss = standard_scaler.transform(df_x.compute().as_matrix())

In [28]:
df_x = pd.DataFrame(ss, columns = df_x.columns)

In [29]:
X_train = df_x
y_train_like = df_y['is_like'].to_frame()

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train_like.compute(), test_size=0.2)

In [31]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.2)

In [32]:
model = Sequential([
    Dense(16, activation = 'relu', input_dim = X_test.shape[1]),
    Dense(8, activation = 'relu'),
    Dense(4, activation = 'relu'),
    Dense(2, activation = 'softmax')
])

In [33]:
model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy', # softmax : sparse_categorical_crossentropy, sigmoid : binary_crossentropy
    metrics=['accuracy'])

In [34]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                96        
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 36        
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 10        
Total params: 278
Trainable params: 278
Non-trainable params: 0
_________________________________________________________________


In [35]:
model.fit(
    x = X_train,
    y = y_train.to_pandas(),
    validation_data=(X_val, y_val.to_pandas()),
    epochs=5,
    batch_size=64
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe4d4531810>

In [36]:
model.evaluate(X_test, y_test.to_pandas())



[0.6384584307670593, 0.6250947713851929]

## Predict

In [37]:
predict_like = model.predict(X_test)

In [38]:
predict_like

array([[0.5505934 , 0.4494066 ],
       [0.5172505 , 0.48274958],
       [0.5831792 , 0.41682082],
       ...,
       [0.37265196, 0.627348  ],
       [0.49968565, 0.5003143 ],
       [0.5505934 , 0.4494066 ]], dtype=float32)

In [39]:
predict_like = list(map(lambda x: 0 if x[0] > x[1] else 1,  predict_like))

In [40]:
y_test['predict_like'] = predict_like

In [41]:
rce_like = compute_rce(y_test['predict_like'].to_array(), y_test['is_like'].to_array())
rce_like

-1826.5365109006596

In [42]:
average_precision_score(y_test['predict_like'].to_array(), y_test['is_like'].to_array())

0.29836524175170515