In [1]:
import sys
sys.path.append('../..')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import feature_column as fc
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn import preprocessing

from utils.cuda_cluster import *
from utils.preprocessing import read_data, factorize_small_cardinality_with_index, df_to_tfdataset, split_join, get_media_index
from utils.evaluate import calculate_ctr, compute_rce, average_precision_score
import core.config as conf

Perhaps you already have a cluster running?
Hosting the HTTP server on port 45937 instead
  http_address["port"], self.http_server.port


In [2]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:46743  Dashboard: http://127.0.0.1:45937/status,Cluster  Workers: 1  Cores: 1  Memory: 33.47 GB


## Load data

In [3]:
# data_path = conf.raw_data_path + '*' # for all dataset
data_path = conf.data_root + 'dask_input'
df = dask_cudf.read_parquet(f'{data_path}/train-final-te-like-1.parquet', index=False)

## Preprocessing

In [4]:
df.head()

Unnamed: 0,timestamp,creator_follower_count,creator_following_count,creator_is_verified,creator_account_creation,engager_follower_count,engager_following_count,engager_is_verified,engager_account_creation,engager_follows_creator,...,TE_creator_count_combined_tweet_type_language_like,TE_creator_user_fer_count_delta_time_media_language_like,TE_creator_user_fing_count_delta_time_media_language_like,TE_creator_user_fering_count_delta_time_tweet_type_language_like,TE_creator_user_fing_count_mode_media_language_like,TE_creator_user_fer_count_mode_media_language_like,TE_creator_user_fering_count_mode_tweet_type_language_like,TE_domains_media_tweet_type_language_like,TE_links_media_tweet_type_language_like,TE_hashtags_media_tweet_type_language_like
0,1612587832,3885,3051,0,1568428850,634,531,0,1590421816,1,...,0.47231,0.353791,0.353791,0.47231,0.354039,0.354039,0.471939,0.435391,0.435391,0.423956
1,1613743226,226443,0,0,1597609757,633,151,0,1541162905,0,...,0.474047,0.355,0.355,0.474047,0.355901,0.355901,0.473944,0.432625,0.432625,0.434331
2,1613369374,1353309,537,1,1299819150,2304,844,0,1278714864,0,...,0.527204,0.48851,0.48851,0.527204,0.487882,0.487882,0.526625,0.597564,0.597564,0.601652
3,1614038010,226308,9,0,1468552079,85,808,0,1387326144,0,...,0.524442,0.475492,0.475492,0.523578,0.475356,0.475356,0.524442,0.597564,0.597564,0.601652
4,1612525320,131219,1023,0,1517575079,141,340,0,1560679572,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
df.isnull().sum().compute()

timestamp                                                          0
creator_follower_count                                             0
creator_following_count                                            0
creator_is_verified                                                0
creator_account_creation                                           0
                                                               ...  
TE_creator_user_fer_count_mode_media_language_like            146676
TE_creator_user_fering_count_mode_tweet_type_language_like    146222
TE_domains_media_tweet_type_language_like                      79182
TE_links_media_tweet_type_language_like                       308353
TE_hashtags_media_tweet_type_language_like                    340310
Length: 72, dtype: uint64

In [6]:
df = df.fillna(0)

In [7]:
df = df.astype(np.int64)

## Modeling

In [8]:
df_y = df[['like']]
df_x = df.drop(['like'], axis = 1)

In [9]:
X_train = df_x
y_train = df_y['like'].to_frame()

## Scaling

In [10]:
standard_scaler = preprocessing.StandardScaler()
standard_scaler.fit(X_train.compute().as_matrix())

StandardScaler()

In [11]:
ss = standard_scaler.transform(X_train.compute().as_matrix())

In [12]:
X_train = pd.DataFrame(ss, columns = df_x.columns)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train.compute().to_pandas(), test_size=0.2)

In [15]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.2)

In [16]:
model = Sequential([
    Dense(64, activation = 'relu', input_dim = X_test.shape[1]),
    Dense(32, activation = 'relu'),
    Dense(16, activation = 'relu'),
    Dense(2, activation = 'softmax')
])

In [17]:
model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy', # softmax : sparse_categorical_crossentropy, sigmoid : binary_crossentropy
    metrics=['accuracy'])

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                4608      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 34        
Total params: 7,250
Trainable params: 7,250
Non-trainable params: 0
_________________________________________________________________


In [20]:
model.fit(
    x = X_train,
    y = y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=64
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f74f82a6190>

In [21]:
model.evaluate(X_test, y_test)



[0.5901522636413574, 0.6842719316482544]

## Predict

In [22]:
predict_like = model.predict(X_test)

In [23]:
predict_like

array([[0.50866526, 0.49133483],
       [0.6966353 , 0.3033647 ],
       [0.75899947, 0.24100056],
       ...,
       [0.38811415, 0.61188585],
       [0.67156655, 0.3284335 ],
       [0.66411144, 0.33588853]], dtype=float32)

In [24]:
predict_like = list(map(lambda x: 0 if x[0] > x[1] else 1,  predict_like))

In [25]:
y_test['predict_like'] = predict_like

In [26]:
rce_like = compute_rce(y_test['predict_like'], y_test['like'])
rce_like

-1523.3029648909544

In [27]:
average_precision_score(y_test['predict_like'], y_test['like'])

0.47035601549592054