In [2]:
import sys
sys.path.append('../../..')

import tqdm
import numpy as np
import pandas as pd
from tensorflow.keras import optimizers
from deepctr.models import DeepFM
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from sklearn import preprocessing
from utils.preprocessing import read_data
import core.config as conf

from utils.evaluate import calculate_ctr, compute_rce, average_precision_score

In [3]:
# Name of the label column this notebook trains on; it is used to pick the
# target vector out of `df`.
TARGET = 'like'

In [4]:
# Read the file named `train` under the dataset directory configured in
# core.config (imported as `conf`); it is parsed as CSV.
df = pd.read_csv(f'{conf.dataset_path}/train')

In [5]:
# Label column names; they are appended to DONT_USE so that no label ends up
# in the feature matrix.
label_names = ['reply', 'retweet', 'comment', 'like']

# Columns that must not be used as model inputs. A name that does not exist in
# `df` is harmless here: RMV below keeps only the names actually present.
DONT_USE = [
    'tweet_timestamp', 'creator_account_creation', 'engager_account_creation', 'engage_time',
    'fold', 'tweet_id',
    'tr', 'dt_day',
    'engager_id', 'creator_id', 'engager_is_verified',
    'elapsed_time',
    'links', 'domains', 'hashtags0', 'hashtags1',
    'hashtags', 'tweet_hash', 'dt_second', 'id',
    'tw_hash0',
    'tw_hash1',
    'tw_rt_uhash',
    'same_language', 'nan_language', 'language',
    'tw_hash', 'tw_freq_hash', 'tw_first_word', 'tw_second_word', 'tw_last_word', 'tw_llast_word',
    'ypred', 'creator_count_combined',
    'creator_user_fer_count_delta_time', 'creator_user_fing_count_delta_time',
    'creator_user_fering_count_delta_time',
    'creator_user_fing_count_mode', 'creator_user_fer_count_mode',
    'creator_user_fering_count_mode',
]
DONT_USE += label_names
DONT_USE += conf.labels

# label_names and conf.labels may name the same column; drop repeated entries
# (keeping first-seen order) so each column is listed for removal only once.
DONT_USE = list(dict.fromkeys(DONT_USE))

# Only the excluded columns that really exist in the loaded frame.
RMV = [c for c in DONT_USE if c in df.columns]

In [6]:
# Feature matrix: every column of `df` except the excluded ones collected in RMV.
X_train = df.drop(columns=RMV)
# Target vector: the single label column selected by TARGET.
y_train = df[TARGET]

### scaling

In [7]:
 scaling_columns = [
     # Columns that get standardized with StandardScaler.
     'creator_following_count',
     'creator_follower_count',
     'engager_follower_count',
     'engager_following_count',
     'dt_dow',
     'dt_hour',
     'len_domains',
     'creator_main_language',
     'engager_main_language',
     'engager_feature_number_of_previous_like_engagement',
     'engager_feature_number_of_previous_reply_engagement',
     'engager_feature_number_of_previous_retweet_engagement',
     'engager_feature_number_of_previous_comment_engagement',
     'number_of_engagements_positive',
     # Disabled entries (kept for reference):
     #   creator_feature_number_of_previous_{like,reply,retweet,comment}_engagement,
     #   creator_number_of_engagements_positive, len_text_tokens,
     #   len_text_tokens_unique, cnt_mention
     'number_of_tweet_engagements',
 ]

In [14]:
# Give the frame a fresh 0..n-1 index before the scaled values are written back.
X_train = X_train.reset_index(drop=True)

# Standardize the selected columns and keep the result as a plain array.
# NOTE(review): the scaler is fitted on every row, i.e. before the
# train/valid/test split that happens later in the notebook - confirm intended.
standard_scaler = preprocessing.StandardScaler()
sc = standard_scaler.fit_transform(X_train[scaling_columns])

In [15]:
# Write the standardized values back into the frame. The temporary DataFrame
# is built on X_train's own index: column assignment from a DataFrame aligns
# on index labels, so a frame with a default 0..n-1 index would yield NaN for
# every non-matching label, and the fillna(0) below would then hide that by
# turning those cells into zeros.
X_train[scaling_columns] = pd.DataFrame(sc, columns=scaling_columns, index=X_train.index)

# Replace every remaining missing value in the feature matrix with 0.
X_train = X_train.fillna(0)

## sparse & dense features

In [17]:
# Display the feature columns that remain in X_train.
X_train.columns

Index(['tweet_type', 'creator_follower_count', 'creator_following_count',
       'engager_follower_count', 'engager_following_count', 'media', 'dt_dow',
       'dt_hour', 'len_domains', 'creator_main_language',
       'engager_main_language', 'creator_and_engager_have_same_main_language',
       'is_tweet_in_creator_main_language',
       'is_tweet_in_engager_main_language',
       'engager_feature_number_of_previous_like_engagement',
       'engager_feature_number_of_previous_reply_engagement',
       'engager_feature_number_of_previous_retweet_engagement',
       'engager_feature_number_of_previous_comment_engagement',
       'number_of_engagements_positive', 'number_of_engagements_ratio_like',
       'number_of_engagements_ratio_reply',
       'number_of_engagements_ratio_retweet',
       'number_of_engagements_ratio_comment', 'number_of_tweet_engagements'],
      dtype='object')

In [18]:
# Preview the first rows of the feature matrix.
X_train.head()

Unnamed: 0,tweet_type,creator_follower_count,creator_following_count,engager_follower_count,engager_following_count,media,dt_dow,dt_hour,len_domains,creator_main_language,...,engager_feature_number_of_previous_like_engagement,engager_feature_number_of_previous_reply_engagement,engager_feature_number_of_previous_retweet_engagement,engager_feature_number_of_previous_comment_engagement,number_of_engagements_positive,number_of_engagements_ratio_like,number_of_engagements_ratio_reply,number_of_engagements_ratio_retweet,number_of_engagements_ratio_comment,number_of_tweet_engagements
0,0,-0.168011,-0.103682,-0.065392,-0.180827,0,0.018588,1.18286,-0.340453,-0.05882,...,-0.352543,-0.252742,-0.216313,-0.157483,-0.384874,1.0,0.0,0.0,0.0,-0.118908
1,3,-0.152424,-0.083075,-0.053885,-0.137149,1,0.538678,-1.182958,-0.340453,-0.720478,...,-0.359098,-0.209647,-0.216313,-0.157483,-0.388705,0.8,0.2,0.0,0.0,-0.116953
2,3,-0.021888,-0.081112,0.099652,2.309918,1,0.018588,1.478588,-0.340453,0.602837,...,3.85139,-0.166552,-0.123877,0.002552,3.332522,0.991273,0.001027,0.007187,0.000513,-0.089579
3,3,-0.168009,-0.099573,-0.052077,-0.220191,4,-0.501503,-1.478685,-0.340453,-0.803185,...,0.927865,-0.080362,-0.189903,-0.157483,0.752752,0.986689,0.006656,0.006656,0.0,-0.118908
4,3,-0.15688,-0.08237,-0.046323,0.335225,0,-1.021593,1.478588,2.558924,-0.05882,...,-0.265144,-0.080362,-0.15689,-0.157483,-0.283369,0.783333,0.066667,0.15,0.0,-0.0974


In [22]:
# Columns handled as categorical inputs (they become SparseFeat columns later).
sparse_features = ['tweet_type', 'creator_main_language', 'engager_main_language', 'media']
# Label names that must never be picked up as dense features.
target = ['reply', 'like', 'retweet', 'retweet_comment']
# Everything else in X_train is treated as a dense (numeric) feature.
dense_features = [
    col
    for col in X_train.columns
    if not (col in sparse_features or col in target)
]

In [24]:
# Fill missing values per feature group: 0 for dense columns, the placeholder
# '-1' for sparse columns. The two groups are disjoint, so order is irrelevant.
X_train[dense_features] = X_train[dense_features].fillna(value=0)
X_train[sparse_features] = X_train[sparse_features].fillna(value='-1')

In [25]:
# Cast the sparse columns to float32 so they share one numeric dtype before
# they are label-encoded.
X_train[sparse_features] = X_train[sparse_features].astype(np.float32)

In [26]:
# Preview the frame again after the sparse columns were cast to float32.
X_train.head()

Unnamed: 0,tweet_type,creator_follower_count,creator_following_count,engager_follower_count,engager_following_count,media,dt_dow,dt_hour,len_domains,creator_main_language,...,engager_feature_number_of_previous_like_engagement,engager_feature_number_of_previous_reply_engagement,engager_feature_number_of_previous_retweet_engagement,engager_feature_number_of_previous_comment_engagement,number_of_engagements_positive,number_of_engagements_ratio_like,number_of_engagements_ratio_reply,number_of_engagements_ratio_retweet,number_of_engagements_ratio_comment,number_of_tweet_engagements
0,0.0,-0.168011,-0.103682,-0.065392,-0.180827,0.0,0.018588,1.18286,-0.340453,-0.05882,...,-0.352543,-0.252742,-0.216313,-0.157483,-0.384874,1.0,0.0,0.0,0.0,-0.118908
1,3.0,-0.152424,-0.083075,-0.053885,-0.137149,1.0,0.538678,-1.182958,-0.340453,-0.720478,...,-0.359098,-0.209647,-0.216313,-0.157483,-0.388705,0.8,0.2,0.0,0.0,-0.116953
2,3.0,-0.021888,-0.081112,0.099652,2.309918,1.0,0.018588,1.478588,-0.340453,0.602837,...,3.85139,-0.166552,-0.123877,0.002552,3.332522,0.991273,0.001027,0.007187,0.000513,-0.089579
3,3.0,-0.168009,-0.099573,-0.052077,-0.220191,4.0,-0.501503,-1.478685,-0.340453,-0.803185,...,0.927865,-0.080362,-0.189903,-0.157483,0.752752,0.986689,0.006656,0.006656,0.0,-0.118908
4,3.0,-0.15688,-0.08237,-0.046323,0.335225,0.0,-1.021593,1.478588,2.558924,-0.05882,...,-0.265144,-0.080362,-0.15689,-0.157483,-0.283369,0.783333,0.066667,0.15,0.0,-0.0974


## preprocessing

In [28]:
# Replace each sparse column by integer codes 0..n_classes-1.
# Every fitted encoder is kept in `label_encoders` (keyed by column name) so
# the same value -> code mapping can be re-applied to other data later;
# previously only the encoder of the last column survived the loop.
label_encoders = {}
for feat in sparse_features:
    lbe = LabelEncoder()
    X_train[feat] = lbe.fit_transform(X_train[feat])
    label_encoders[feat] = lbe

In [29]:
# Rescale every dense column into the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows, before the train/valid/test
# split performed later in the notebook - confirm this is intended.
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(X_train[dense_features])
X_train[dense_features] = mms.transform(X_train[dense_features])

In [30]:
# One SparseFeat per categorical column (vocabulary size = largest code + 1,
# embedding dimension 4) followed by one 1-dimensional DenseFeat per dense column.
sparse_feature_columns = [
    SparseFeat(name, vocabulary_size=X_train[name].max() + 1, embedding_dim=4)
    for name in sparse_features
]
dense_feature_columns = [DenseFeat(name, 1) for name in dense_features]
fixlen_feature_columns = sparse_feature_columns + dense_feature_columns

In [38]:
# The linear part and the DNN part of the model share the same feature columns.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns

# Names of the model inputs, derived from both column lists.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

## data split

In [34]:
# Sequential (non-shuffled) split by row position:
#   rows [0, train_size)                       -> train
#   rows [train_size, train_size + valid_size) -> validation
#   remaining rows                             -> test
train_size = 3000000
valid_size = 200000  # was `200000 1`, which is a SyntaxError
valid_end = train_size + valid_size

# NOTE: X_train / y_train are rebound to their own training slice, so this
# cell must be run only once per kernel session (a second run would slice the
# already-truncated data). The right-hand sides are evaluated before any name
# is rebound, so all three slices come from the full data.
X_train, X_valid, X_test = X_train.iloc[:train_size], X_train.iloc[train_size:valid_end], X_train.iloc[valid_end:]
y_train, y_valid, y_test = y_train.iloc[:train_size], y_train.iloc[train_size:valid_end], y_train.iloc[valid_end:]

In [40]:
# The model takes a dict mapping each input name to a NumPy array of that column.
train_model_input = {col: X_train[col].to_numpy() for col in feature_names}
test_model_input = {col: X_test[col].to_numpy() for col in feature_names}

## Model

In [39]:
# Build a DeepFM model for binary classification from the shared feature columns.
model = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    task='binary',
)

In [41]:
# Adam optimizer; binary cross-entropy is used as the loss and also reported as a metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy'],
)

In [43]:
# Train for 10 epochs with mini-batches of 256 rows; Keras holds out 20% of the
# training arrays for validation via `validation_split`.
# NOTE(review): X_valid / y_valid built in the split cell are not used here -
# confirm whether `validation_data` was intended instead of `validation_split`.
history = model.fit(
    x=train_model_input,
    y=y_train.values,
    batch_size=256,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)

KeyError: 'like'