In [1]:
import sys
sys.path.append('../..')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import feature_column as fc
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import optimizers
from sklearn import preprocessing

from utils.cuda_cluster import *
from utils.preprocessing import read_data, factorize_small_cardinality_with_index, df_to_tfdataset, split_join, get_media_index
from utils.evaluate import calculate_ctr, compute_rce, average_precision_score
import core.config as conf

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36971 instead
  http_address["port"], self.http_server.port


In [2]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:45919  Dashboard: http://127.0.0.1:36971/status,Cluster  Workers: 1  Cores: 1  Memory: 33.47 GB


## Load data

In [3]:
# data_path = conf.raw_data_path + '*' # for all dataset
data_path = conf.data_root + 'dask_input'
df = dask_cudf.read_parquet(f'{data_path}/train-final-te-retweet_comment-1.parquet', index=False)

## Preprocessing

In [4]:
df.head()

Unnamed: 0,timestamp,creator_follower_count,creator_following_count,creator_is_verified,creator_account_creation,engager_follower_count,engager_following_count,engager_is_verified,engager_account_creation,engager_follows_creator,...,TE_creator_count_combined_tweet_type_language_retweet_comment,TE_creator_user_fer_count_delta_time_media_language_retweet_comment,TE_creator_user_fing_count_delta_time_media_language_retweet_comment,TE_creator_user_fering_count_delta_time_tweet_type_language_retweet_comment,TE_creator_user_fing_count_mode_media_language_retweet_comment,TE_creator_user_fer_count_mode_media_language_retweet_comment,TE_creator_user_fering_count_mode_tweet_type_language_retweet_comment,TE_domains_media_tweet_type_language_retweet_comment,TE_links_media_tweet_type_language_retweet_comment,TE_hashtags_media_tweet_type_language_retweet_comment
0,1612772154,389,938,0,1461219816,496,100,0,1447563660,1,...,0.007486,0.014033,0.014033,0.007486,0.013307,0.013307,0.00715,0.011783,0.011783421,0.013712
1,1614050312,17069714,189,1,1344977596,8,17,0,1546057245,0,...,0.006329,0.007339,0.007339,0.006329,0.007291,0.007291,0.006284,0.005441,0.005441219,0.005482
2,1613017908,86963,230,0,1495978710,351,314,0,1480829656,0,...,0.006126,0.00741,0.00741,0.006126,0.007375,0.007375,0.006009,0.006031,0.00603065,0.006208
3,1613668747,27544,2771,1,1270641352,284,1422,0,1231503632,0,...,0.008626,0.006266,0.006266,0.008626,0.006266,0.006266,0.008626,0.003687,,0.004379
4,1612495815,186,177,0,1591281565,423,49,0,1554115261,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
df.isnull().sum().compute()

timestamp                                                                     0
creator_follower_count                                                        0
creator_following_count                                                       0
creator_is_verified                                                           0
creator_account_creation                                                      0
                                                                          ...  
TE_creator_user_fer_count_mode_media_language_retweet_comment            146016
TE_creator_user_fering_count_mode_tweet_type_language_retweet_comment    145772
TE_domains_media_tweet_type_language_retweet_comment                      78933
TE_links_media_tweet_type_language_retweet_comment                       306753
TE_hashtags_media_tweet_type_language_retweet_comment                    340103
Length: 72, dtype: uint64

In [6]:
df = df.fillna(0)

In [7]:
df = df.astype(np.int64)

## Sampling

In [8]:
df_positive = df[df['retweet_comment']==1]

In [9]:
df_negative = df[df['retweet_comment']==0]

In [10]:
print(len(df_positive))
print(len(df_negative))

21659
3057388


In [11]:
df_negative = df_negative.compute().sample(n = len(df_positive))

In [12]:
df = dask.dataframe.concat([df_positive, df_negative])

In [13]:
df.compute()

Unnamed: 0,timestamp,creator_follower_count,creator_following_count,creator_is_verified,creator_account_creation,engager_follower_count,engager_following_count,engager_is_verified,engager_account_creation,engager_follows_creator,...,TE_creator_count_combined_tweet_type_language_retweet_comment,TE_creator_user_fer_count_delta_time_media_language_retweet_comment,TE_creator_user_fing_count_delta_time_media_language_retweet_comment,TE_creator_user_fering_count_delta_time_tweet_type_language_retweet_comment,TE_creator_user_fing_count_mode_media_language_retweet_comment,TE_creator_user_fer_count_mode_media_language_retweet_comment,TE_creator_user_fering_count_mode_tweet_type_language_retweet_comment,TE_domains_media_tweet_type_language_retweet_comment,TE_links_media_tweet_type_language_retweet_comment,TE_hashtags_media_tweet_type_language_retweet_comment
69,1613498594,289,253,0,1581181521,17,4,0,1607221233,1,...,0,0,0,0,0,0,0,0,0,0
331,1613058613,6462,4631,0,1324081619,325,358,0,1587427115,1,...,0,0,0,0,0,0,0,0,0,0
374,1613411963,8881,3250,0,1563687084,4,23,0,1580669919,0,...,0,0,0,0,0,0,0,0,0,0
442,1613587544,1334,546,0,1579228215,323,616,0,1491698512,1,...,0,0,0,0,0,0,0,0,0,0
763,1613566800,937627,694,1,1256238476,186,1318,0,1401386342,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2640044,1612416097,1264,185,0,1599979406,52,153,0,1294106634,0,...,0,0,0,0,0,0,0,0,0,0
2640045,1614115354,736,250,0,1426946983,771,498,0,1359958688,1,...,0,0,0,0,0,0,0,0,0,0
2640046,1613172788,24209,2270,0,1353351890,567,747,0,1218832659,1,...,0,0,0,0,0,0,0,0,0,0
2640047,1613255350,90611,550,1,1317831770,128,539,0,1296972731,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df = df.sample(frac=1)

In [15]:
df = df.compute().reset_index(drop=True)

## Scaling

In [16]:
df_y = df[['retweet_comment']]
df_x = df.drop(['retweet_comment'], axis = 1)

In [17]:
X_train = df_x
y_train = df_y['retweet_comment'].to_frame()

In [20]:
standard_scaler = preprocessing.StandardScaler()
standard_scaler.fit(X_train.as_matrix())

StandardScaler()

In [21]:
ss = standard_scaler.transform(X_train.as_matrix())

In [22]:
X_train = pd.DataFrame(ss, columns = df_x.columns)

## Split data

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2)

In [24]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.2)

## Modeling

In [25]:
model = Sequential([
    Dense(64, activation = 'relu', input_dim = X_test.shape[1]),
    Dense(32, activation = 'relu'),
    Dense(16, activation = 'relu'),
    Dense(2, activation = 'softmax')
])

In [26]:
model.compile(
    optimizer = optimizers.Adam(learning_rate = 0.07),
    loss = 'sparse_categorical_crossentropy', # softmax : sparse_categorical_crossentropy, sigmoid : binary_crossentropy
    metrics=['accuracy']
    )

In [27]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                4608      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 34        
Total params: 7,250
Trainable params: 7,250
Non-trainable params: 0
_________________________________________________________________


In [28]:
model.fit(
    x = X_train,
    y = y_train.to_pandas(),
    validation_data=(X_val, y_val.to_pandas()),
    epochs=5,
    batch_size=64
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f25135202d0>

In [29]:
model.evaluate(X_test, y_test.to_pandas())



[0.29838839173316956, 0.8942432403564453]

## Predict

In [30]:
predict_re_comment = model.predict(X_test)

In [31]:
predict_re_comment

array([[0.0116921 , 0.9883079 ],
       [0.0116921 , 0.9883079 ],
       [0.86369497, 0.13630499],
       ...,
       [0.0116921 , 0.9883079 ],
       [0.86369497, 0.13630499],
       [0.86369497, 0.13630499]], dtype=float32)

In [32]:
predict_re_comment = list(map(lambda x: 0 if x[0] > x[1] else 1,  predict_re_comment))

In [33]:
y_test['predict_retweet_comment'] = predict_re_comment

In [34]:
y_test

Unnamed: 0,retweet_comment,predict_retweet_comment
15126,1,1
6116,1,1
23612,0,0
39704,0,0
3684,1,1
...,...,...
32957,0,0
6506,1,0
9793,1,1
22474,0,0


In [35]:
rce_like = compute_rce(y_test['predict_retweet_comment'].to_array(), y_test['retweet_comment'].to_array())
rce_like

-426.9939199969997

In [36]:
average_precision_score(y_test['predict_retweet_comment'].to_array(), y_test['retweet_comment'].to_array())

0.7911027815378592