In [2]:
import sys
sys.path.append('../..')

import torch
import tqdm
import numpy as np
import pandas as pd
from tensorflow.keras import optimizers
from deepctr.models import DeepFM
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from sklearn import preprocessing
from utils.preprocessing import read_data
import core.config as conf

from utils.evaluate import calculate_ctr, compute_rce, average_precision_score

In [3]:
# Locate the 'dask_input' directory under the configured data root and read the
# training parquet file from it.
parquet_name = 'train-final-te-like-1.parquet'
data_path = conf.data_root + 'dask_input'
df = pd.read_parquet('/'.join((data_path, parquet_name)))

In [4]:
# Columns that must not be fed to the model. Each name appears exactly once
# (the earlier list repeated a few entries and carried empty-string placeholders).
DONT_USE = [
    'timestamp', 'creator_account_creation', 'engager_account_creation', 'engage_time',
    'fold', 'tweet_id',
    'tr', 'dt_day',
    'engager_user_id', 'creator_user_id', 'engager_is_verified',
    'elapsed_time',
    'links', 'domains', 'hashtags0', 'hashtags1',
    'hashtags', 'tweet_hash', 'dt_second', 'id',
    'tw_hash0',
    'tw_hash1',
    'tw_rt_uhash',
    'same_language', 'nan_language', 'language',
    'tw_hash', 'tw_freq_hash', 'tw_first_word', 'tw_second_word', 'tw_last_word', 'tw_llast_word',
    'ypred', 'creator_count_combined',
    'creator_user_fer_count_delta_time', 'creator_user_fing_count_delta_time',
    'creator_user_fering_count_delta_time',
    'creator_user_fing_count_mode', 'creator_user_fer_count_mode',
    'creator_user_fering_count_mode',
]

# Set lookup keeps the column filter O(1) per column; the original column order
# of the frame is preserved in `features`.
_excluded_columns = set(DONT_USE)
features = [c for c in df.columns if c not in _excluded_columns]

print('Using %i features:'%(len(features)))
np.asarray(features)

Using 42 features:


array(['creator_follower_count', 'creator_following_count',
       'creator_is_verified', 'engager_follower_count',
       'engager_following_count', 'engager_follows_creator', 'reply',
       'retweet', 'retweet_comment', 'like', 'tweet_type', 'media',
       'len_hashtags', 'len_domains', 'len_links', 'dt_dow', 'dt_hour',
       'count_ats', 'count_char', 'count_words', 'tw_len',
       'TE_engager_user_id_tweet_type_language_like',
       'TE_tw_first_word_tweet_type_language_like',
       'TE_tw_last_word_tweet_type_language_like',
       'TE_tw_hash0_tweet_type_language_like',
       'TE_tw_hash1_tweet_type_language_like',
       'TE_tw_rt_uhash_tweet_type_language_like',
       'TE_creator_user_id_like', 'TE_engager_user_id_like',
       'TE_tw_hash_like', 'TE_tw_freq_hash_like',
       'TE_media_tweet_type_language_creator_is_verified_engager_is_verified_engager_follows_creator_like',
       'TE_creator_count_combined_tweet_type_language_like',
       'TE_creator_user_fer_count_

In [5]:
# Keep only the selected columns. The explicit .copy() makes the result an
# independent frame, so the column assignments performed in later cells do not
# raise pandas' SettingWithCopyWarning (pandas versions without copy-on-write).
df = df[features].copy()

In [6]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

Unnamed: 0,creator_follower_count,creator_following_count,creator_is_verified,engager_follower_count,engager_following_count,engager_follows_creator,reply,retweet,retweet_comment,like,...,TE_creator_count_combined_tweet_type_language_like,TE_creator_user_fer_count_delta_time_media_language_like,TE_creator_user_fing_count_delta_time_media_language_like,TE_creator_user_fering_count_delta_time_tweet_type_language_like,TE_creator_user_fing_count_mode_media_language_like,TE_creator_user_fer_count_mode_media_language_like,TE_creator_user_fering_count_mode_tweet_type_language_like,TE_domains_media_tweet_type_language_like,TE_links_media_tweet_type_language_like,TE_hashtags_media_tweet_type_language_like
0,3885,3051,0,634,531,1,0,1,0,0,...,0.47231,0.353791,0.353791,0.47231,0.354039,0.354039,0.471939,0.435391,0.435391,0.423956
1,226443,0,0,633,151,0,0,1,0,1,...,0.474047,0.355,0.355,0.474047,0.355901,0.355901,0.473944,0.432625,0.432625,0.434331
2,1353309,537,1,2304,844,0,0,0,0,1,...,0.527204,0.48851,0.48851,0.527204,0.487882,0.487882,0.526625,0.597564,0.597564,0.601652
3,226308,9,0,85,808,0,0,0,0,1,...,0.524442,0.475492,0.475492,0.523578,0.475356,0.475356,0.524442,0.597564,0.597564,0.601652
4,131219,1023,0,141,340,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Print a per-column summary (column names and dtypes) of the reduced frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3080163 entries, 0 to 3080162
Data columns (total 42 columns):
 #   Column                                                                                             Dtype  
---  ------                                                                                             -----  
 0   creator_follower_count                                                                             int32  
 1   creator_following_count                                                                            int32  
 2   creator_is_verified                                                                                int8   
 3   engager_follower_count                                                                             int32  
 4   engager_following_count                                                                            int32  
 5   engager_follows_creator                                                                           

In [8]:
# Columns handled as categorical (SparseFeat) inputs.
sparse_features = [
    'creator_is_verified', 'engager_follows_creator', 'tweet_type',
    'media', 'dt_dow', 'dt_hour',
]
# Label columns; they are excluded from the model inputs.
target = ['reply', 'like', 'retweet', 'retweet_comment']
# Every remaining column is handled as a dense (DenseFeat) input.
_non_dense = set(sparse_features) | set(target)
dense_features = [col for col in df.columns if col not in _non_dense]

In [9]:
# Replace missing values: the string '-1' for sparse columns, 0 for dense ones.
# NOTE(review): '-1' is a string; if a sparse column is numeric and actually
# contains NaN this would mix types in that column — confirm the dtypes.
for columns, fill_value in ((sparse_features, '-1'), (dense_features, 0)):
    df[columns] = df[columns].fillna(fill_value)

In [10]:
# Binarise the label columns: 1 where the stored value is > 0, otherwise 0.
# A vectorised comparison replaces the element-wise applymap(lambda ...) call,
# which runs a Python function per cell and is deprecated in recent pandas.
# The result is the same int64 0/1 frame (values that fail `> 0`, including NaN, map to 0).
df[target] = (df[target] > 0).astype('int64')

In [11]:
# Preview the first five rows after filling missing values and binarising labels.
df.head(n=5)

Unnamed: 0,creator_follower_count,creator_following_count,creator_is_verified,engager_follower_count,engager_following_count,engager_follows_creator,reply,retweet,retweet_comment,like,...,TE_creator_count_combined_tweet_type_language_like,TE_creator_user_fer_count_delta_time_media_language_like,TE_creator_user_fing_count_delta_time_media_language_like,TE_creator_user_fering_count_delta_time_tweet_type_language_like,TE_creator_user_fing_count_mode_media_language_like,TE_creator_user_fer_count_mode_media_language_like,TE_creator_user_fering_count_mode_tweet_type_language_like,TE_domains_media_tweet_type_language_like,TE_links_media_tweet_type_language_like,TE_hashtags_media_tweet_type_language_like
0,3885,3051,0,634,531,1,0,1,0,0,...,0.47231,0.353791,0.353791,0.47231,0.354039,0.354039,0.471939,0.435391,0.435391,0.423956
1,226443,0,0,633,151,0,0,1,0,1,...,0.474047,0.355,0.355,0.474047,0.355901,0.355901,0.473944,0.432625,0.432625,0.434331
2,1353309,537,1,2304,844,0,0,0,0,1,...,0.527204,0.48851,0.48851,0.527204,0.487882,0.487882,0.526625,0.597564,0.597564,0.601652
3,226308,9,0,85,808,0,0,0,0,1,...,0.524442,0.475492,0.475492,0.523578,0.475356,0.475356,0.524442,0.597564,0.597564,0.601652
4,131219,1023,0,141,340,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Preprocessing

In [12]:
# Re-encode each sparse column as integer ids, fitting a fresh LabelEncoder per column.
for feat in sparse_features:
    lbe = LabelEncoder().fit(df[feat])
    df[feat] = lbe.transform(df[feat])

In [13]:
# Rescale every dense column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split further down, so held-out rows influence the min/max used
# for scaling — confirm this is acceptable.
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(df[dense_features])
df[dense_features] = mms.transform(df[dense_features])

## Generate Feature Columns

In [14]:
# Sparse columns get a 4-dimensional embedding; the vocabulary size is the
# largest encoded id in the column plus one.
_sparse_columns = [
    SparseFeat(feat, vocabulary_size=df[feat].max() + 1, embedding_dim=4)
    for feat in sparse_features
]
# Dense columns are declared as 1-dimensional inputs.
_dense_columns = [DenseFeat(feat, 1) for feat in dense_features]

fixlen_feature_columns = _sparse_columns + _dense_columns

In [15]:
# The linear part and the DNN part of the model share the same feature columns.
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

# Input names derived from both column lists; used below to build the model inputs.
_all_feature_columns = linear_feature_columns + dnn_feature_columns
feature_names = get_feature_names(_all_feature_columns)

## Split Dataset

In [16]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# itself reproducible across kernel restarts (previously every run produced a
# different partition, so the reported metrics could not be compared).
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [17]:
# Build the per-input arrays for both splits: one entry per model input name.
train_model_input = {}
test_model_input = {}
for name in feature_names:
    train_model_input[name] = train[name].values
    test_model_input[name] = test[name].values

## Modeling

In [18]:
# Instantiate DeepFM for a binary task, with the shared feature columns feeding
# both the linear and the DNN side.
model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
)

In [19]:
# Compile with the Adam optimizer; binary cross-entropy is both the loss and
# the tracked metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy'],
)

In [20]:
# Train on the 'like' label for 5 epochs, with 20% of the training rows set
# aside by Keras for validation.
_like_train_labels = train['like'].values
history = model.fit(
    train_model_input,
    _like_train_labels,
    batch_size=256,
    epochs=5,
    verbose=1,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [31]:
# Score the held-out split with the trained model.
pred_ans = model.predict(x=test_model_input, batch_size=256)

In [37]:
# Display the predictions returned by model.predict for the test split.
pred_ans

array([[0.50470215],
       [0.2477827 ],
       [0.0630095 ],
       ...,
       [0.18208307],
       [0.6332116 ],
       [0.6054634 ]], dtype=float32)

In [34]:
# Evaluate the 'like' predictions with the project's compute_rce helper.
_like_truth_rce = test['like']
rce_like = compute_rce(pred_ans, _like_truth_rce)
rce_like

10.657167919927124

In [35]:
# Evaluate the 'like' predictions with average_precision_score (imported from
# utils.evaluate); the true labels go first, the predictions second.
_like_truth_ap = test['like']
ap_like = average_precision_score(_like_truth_ap, pred_ans)
ap_like

0.6191048938871294