In [25]:
import sys
sys.path.append('../..')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import feature_column as fc
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import optimizers
from sklearn import preprocessing
from sklearn.utils import shuffle
from utils.gpu.cuda_cluster import *
from utils.gpu.preprocessing import read_data, factorize_small_cardinality_with_index, df_to_tfdataset, split_join, get_media_index
from utils.evaluate import calculate_ctr, compute_rce, average_precision_score
import core.config as conf

In [2]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:40897  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 33.47 GB


## Load data

In [3]:
# data_path = conf.raw_data_path + '*' # for all dataset
data_path = conf.data_root + 'nvidia_models'
df = dask_cudf.read_parquet(f'{data_path}/train-final-te-reply-1.parquet', index=False)

## Preprocessing

In [4]:
df.head()

Unnamed: 0,timestamp,creator_follower_count,creator_following_count,creator_is_verified,creator_account_creation,engager_follower_count,engager_following_count,engager_is_verified,engager_account_creation,engager_follows_creator,...,TE_creator_count_combined_tweet_type_language_reply,TE_creator_user_fer_count_delta_time_media_language_reply,TE_creator_user_fing_count_delta_time_media_language_reply,TE_creator_user_fering_count_delta_time_tweet_type_language_reply,TE_creator_user_fing_count_mode_media_language_reply,TE_creator_user_fer_count_mode_media_language_reply,TE_creator_user_fering_count_mode_tweet_type_language_reply,TE_domains_media_tweet_type_language_reply,TE_links_media_tweet_type_language_reply,TE_hashtags_media_tweet_type_language_reply
0,1612587832,3885,3051,0,1568428850,634,531,0,1590421816,1,...,0.041906,0.018245,0.018245,0.041906,0.018103,0.018103,0.041833,0.027013,0.027013,0.028409
1,1613743226,226443,0,0,1597609757,633,151,0,1541162905,0,...,0.041548,0.017228,0.017228,0.041548,0.017068,0.017068,0.041559,0.025484,0.025484,0.027091
2,1613369374,1353309,537,1,1299819150,2304,844,0,1278714864,0,...,0.027891,0.0121,0.0121,0.027891,0.012476,0.012476,0.028071,0.017977,0.017977,0.01929
3,1614038010,226308,9,0,1468552079,85,808,0,1387326144,0,...,0.0295,0.011919,0.011919,0.029697,0.011114,0.011114,0.0295,0.017977,0.017977,0.01929
4,1612525320,131219,1023,0,1517575079,141,340,0,1560679572,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
df.isnull().sum().compute()

timestamp                                                           0
creator_follower_count                                              0
creator_following_count                                             0
creator_is_verified                                                 0
creator_account_creation                                            0
                                                                ...  
TE_creator_user_fer_count_mode_media_language_reply            146676
TE_creator_user_fering_count_mode_tweet_type_language_reply    146222
TE_domains_media_tweet_type_language_reply                      79182
TE_links_media_tweet_type_language_reply                       308353
TE_hashtags_media_tweet_type_language_reply                    340310
Length: 72, dtype: uint64

In [6]:
df = df.fillna(0)

In [7]:
df = df.astype(np.int64)

## Sampling

In [8]:
df_positive = df[df['reply']==1]

In [9]:
df_negative = df[df['reply']==0]

In [10]:
print(len(df_positive))
print(len(df_negative))

90546
2989617


In [11]:
df_negative = df_negative.compute().sample(n = len(df_positive))

In [12]:
df = dask.dataframe.concat([df_positive, df_negative])

In [27]:
df = shuffle(df)

In [30]:
df = df.reset_index(drop=True)

## Scaling

In [31]:
df_y = df[['reply']]
df_x = df.drop(['reply'], axis = 1)

In [32]:
X_train = df_x
y_train = df_y['reply'].to_frame()

In [33]:
standard_scaler = preprocessing.StandardScaler()
standard_scaler.fit(X_train.as_matrix())

StandardScaler()

In [34]:
ss = standard_scaler.transform(X_train.as_matrix())

In [35]:
X_train = pd.DataFrame(ss, columns = df_x.columns)

## Split data

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2)

In [37]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.2)

## Modeling

In [38]:
model = Sequential([
    Dense(64, activation = 'relu', input_dim = X_test.shape[1]),
    Dense(32, activation = 'relu'),
    Dense(16, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])

In [39]:
model.compile(
    optimizer = optimizers.Adam(learning_rate = 0.05),
    loss = 'binary_crossentropy', # softmax : sparse_categorical_crossentropy, sigmoid : binary_crossentropy
    metrics=['accuracy']
)

In [40]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                4608      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 7,233
Trainable params: 7,233
Non-trainable params: 0
_________________________________________________________________


In [41]:
model.fit(
    x = X_train,
    y = y_train.to_pandas(),
    validation_data=(X_val, y_val.to_pandas()),
    epochs=4,
    batch_size=64
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7feac44530d0>

In [42]:
model.evaluate(X_test, y_test.to_pandas())



[0.3231922388076782, 0.8414149880409241]

## Predict

In [43]:
predict_reply = model.predict(X_test)

In [44]:
y_test['predict'] = predict_reply.tolist()

In [45]:
rce_like = compute_rce(predict_reply, y_test['reply'].to_array())
rce_like

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


nan

In [55]:
average_precision_score(y_test['reply'].to_array(), predict_reply)

0.8316263203235412