In [1]:
import sys
sys.path.append('../../..')

import tqdm
import numpy as np
import pandas as pd
from tensorflow.keras import optimizers
from deepctr.models import DeepFM
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from sklearn import preprocessing
from utils.preprocessing import read_data
import core.config as conf
import tensorflow as tf
from tensorflow.keras import backend as K


from utils.evaluate import calculate_ctr, compute_rce, average_precision_score

In [3]:
data_path = conf.data_root + 'nvidia_models'
df = pd.read_parquet(f'{data_path}/train-final-te-like-1.parquet')

In [4]:
DONT_USE = ['timestamp','creator_account_creation','engager_account_creation','engage_time',
            'creator_account_creation', 'engager_account_creation',
            'fold','tweet_id', 
            'tr','dt_day','','',
            'engager_user_id','creator_user_id','engager_is_verified',
            'elapsed_time',
            'links','domains','hashtags0','hashtags1',
            'hashtags','tweet_hash','dt_second','id',
            'tw_hash0',
            'tw_hash1',
            'tw_rt_uhash',
            'same_language', 'nan_language','language',
            'tw_hash', 'tw_freq_hash','tw_first_word', 'tw_second_word', 'tw_last_word', 'tw_llast_word',
            'ypred','creator_count_combined','creator_user_fer_count_delta_time','creator_user_fing_count_delta_time','creator_user_fering_count_delta_time','creator_user_fing_count_mode','creator_user_fer_count_mode','creator_user_fering_count_mode'
           ]

features = [c for c in df.columns if c not in DONT_USE]

print('Using %i features:'%(len(features)))
np.asarray(features)

Using 42 features:


array(['creator_follower_count', 'creator_following_count',
       'creator_is_verified', 'engager_follower_count',
       'engager_following_count', 'engager_follows_creator', 'reply',
       'retweet', 'retweet_comment', 'like', 'tweet_type', 'media',
       'len_hashtags', 'len_domains', 'len_links', 'dt_dow', 'dt_hour',
       'count_ats', 'count_char', 'count_words', 'tw_len',
       'TE_engager_user_id_tweet_type_language_like',
       'TE_tw_first_word_tweet_type_language_like',
       'TE_tw_last_word_tweet_type_language_like',
       'TE_tw_hash0_tweet_type_language_like',
       'TE_tw_hash1_tweet_type_language_like',
       'TE_tw_rt_uhash_tweet_type_language_like',
       'TE_creator_user_id_like', 'TE_engager_user_id_like',
       'TE_tw_hash_like', 'TE_tw_freq_hash_like',
       'TE_media_tweet_type_language_creator_is_verified_engager_is_verified_engager_follows_creator_like',
       'TE_creator_count_combined_tweet_type_language_like',
       'TE_creator_user_fer_count_

In [5]:
df = df[features]

In [6]:
df.head()

Unnamed: 0,creator_follower_count,creator_following_count,creator_is_verified,engager_follower_count,engager_following_count,engager_follows_creator,reply,retweet,retweet_comment,like,...,TE_creator_count_combined_tweet_type_language_like,TE_creator_user_fer_count_delta_time_media_language_like,TE_creator_user_fing_count_delta_time_media_language_like,TE_creator_user_fering_count_delta_time_tweet_type_language_like,TE_creator_user_fing_count_mode_media_language_like,TE_creator_user_fer_count_mode_media_language_like,TE_creator_user_fering_count_mode_tweet_type_language_like,TE_domains_media_tweet_type_language_like,TE_links_media_tweet_type_language_like,TE_hashtags_media_tweet_type_language_like
0,3885,3051,0,634,531,1,0,1,0,0,...,0.47231,0.353791,0.353791,0.47231,0.354039,0.354039,0.471939,0.435391,0.435391,0.423956
1,226443,0,0,633,151,0,0,1,0,1,...,0.474047,0.355,0.355,0.474047,0.355901,0.355901,0.473944,0.432625,0.432625,0.434331
2,1353309,537,1,2304,844,0,0,0,0,1,...,0.527204,0.48851,0.48851,0.527204,0.487882,0.487882,0.526625,0.597564,0.597564,0.601652
3,226308,9,0,85,808,0,0,0,0,1,...,0.524442,0.475492,0.475492,0.523578,0.475356,0.475356,0.524442,0.597564,0.597564,0.601652
4,131219,1023,0,141,340,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3080163 entries, 0 to 3080162
Data columns (total 42 columns):
 #   Column                                                                                             Dtype  
---  ------                                                                                             -----  
 0   creator_follower_count                                                                             int32  
 1   creator_following_count                                                                            int32  
 2   creator_is_verified                                                                                int8   
 3   engager_follower_count                                                                             int32  
 4   engager_following_count                                                                            int32  
 5   engager_follows_creator                                                                           

In [8]:
sparse_features = ['creator_is_verified', 'engager_follows_creator', 'tweet_type', 'media', 'dt_dow', 'dt_hour']
sparse_features += df.columns.values.tolist()[21:]
target = ['reply', 'like', 'retweet', 'retweet_comment']  
dense_features = [feat for feat in df.columns if (feat not in sparse_features) and (feat not in target)]

In [9]:
df[sparse_features] = df[sparse_features].fillna('-1', )
df[dense_features] = df[dense_features].fillna(0,)

In [10]:
df[target] = df[target].applymap(lambda x : 1 if x > 0 else 0)

In [11]:
df.head()

Unnamed: 0,creator_follower_count,creator_following_count,creator_is_verified,engager_follower_count,engager_following_count,engager_follows_creator,reply,retweet,retweet_comment,like,...,TE_creator_count_combined_tweet_type_language_like,TE_creator_user_fer_count_delta_time_media_language_like,TE_creator_user_fing_count_delta_time_media_language_like,TE_creator_user_fering_count_delta_time_tweet_type_language_like,TE_creator_user_fing_count_mode_media_language_like,TE_creator_user_fer_count_mode_media_language_like,TE_creator_user_fering_count_mode_tweet_type_language_like,TE_domains_media_tweet_type_language_like,TE_links_media_tweet_type_language_like,TE_hashtags_media_tweet_type_language_like
0,3885,3051,0,634,531,1,0,1,0,0,...,0.47231,0.353791,0.353791,0.47231,0.354039,0.354039,0.471939,0.435391,0.435391,0.423956
1,226443,0,0,633,151,0,0,1,0,1,...,0.474047,0.355,0.355,0.474047,0.355901,0.355901,0.473944,0.432625,0.432625,0.434331
2,1353309,537,1,2304,844,0,0,0,0,1,...,0.527204,0.48851,0.48851,0.527204,0.487882,0.487882,0.526625,0.597564,0.597564,0.601652
3,226308,9,0,85,808,0,0,0,0,1,...,0.524442,0.475492,0.475492,0.523578,0.475356,0.475356,0.524442,0.597564,0.597564,0.601652
4,131219,1023,0,141,340,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Preprocessing

In [13]:
df[sparse_features] = df[sparse_features].astype(np.float32)

In [14]:
for feat in sparse_features :
    lbe = LabelEncoder()
    df[feat] = lbe.fit_transform(df[feat])

In [15]:
mms = MinMaxScaler(feature_range = (0, 1))
df[dense_features] = mms.fit_transform(df[dense_features])

## Negative Sampling 

In [16]:
df_positive = df[df['like']==1]
df_negative = df[df['like']==0]
print(len(df_positive))
print(len(df_negative))

1222441
1857722


In [17]:
df_negative = df_negative.sample(n = len(df_positive))

In [18]:
df = pd.concat([df_positive, df_negative])

In [19]:
df = df.sample(frac=1)

In [20]:
df = df.reset_index(drop=True)

In [21]:
df

Unnamed: 0,creator_follower_count,creator_following_count,creator_is_verified,engager_follower_count,engager_following_count,engager_follows_creator,reply,retweet,retweet_comment,like,...,TE_creator_count_combined_tweet_type_language_like,TE_creator_user_fer_count_delta_time_media_language_like,TE_creator_user_fing_count_delta_time_media_language_like,TE_creator_user_fering_count_delta_time_tweet_type_language_like,TE_creator_user_fing_count_mode_media_language_like,TE_creator_user_fer_count_mode_media_language_like,TE_creator_user_fering_count_mode_tweet_type_language_like,TE_domains_media_tweet_type_language_like,TE_links_media_tweet_type_language_like,TE_hashtags_media_tweet_type_language_like
0,5.781469e-05,0.000257,0,0.000049,0.000363,0,0,0,0,1,...,2016,2001,2001,1660,2065,2065,1634,6921,3728,6035
1,1.031410e-05,0.000576,0,0.000062,0.000345,1,0,0,0,1,...,283,993,993,243,1069,1069,237,1390,396,2402
2,1.067931e-04,0.002560,0,0.000024,0.000130,1,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
3,1.077062e-05,0.000059,0,0.000022,0.000461,1,1,0,0,1,...,2217,648,648,1812,662,662,1804,5790,2844,3979
4,9.880803e-06,0.000009,0,0.000003,0.000138,0,0,0,0,1,...,626,2410,2410,534,2383,2383,520,3247,1274,2188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2444877,1.346390e-01,0.000066,1,0.000171,0.000863,0,0,0,0,1,...,2140,796,796,1749,797,797,1738,6796,3620,4746
2444878,6.809011e-07,0.000102,0,0.000010,0.000228,0,0,0,0,0,...,2078,693,693,1703,713,713,1687,2321,0,4713
2444879,2.028776e-05,0.000343,0,0.000277,0.001435,1,0,0,0,1,...,0,0,0,0,0,0,0,7435,4172,5661
2444880,3.242018e-06,0.000094,0,0.000007,0.000153,1,0,0,0,1,...,550,2544,2544,473,2550,2550,459,3551,1443,2394


## Generate Feature Columns

In [22]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size = df[feat].max() +1, embedding_dim = 4) for feat in sparse_features]  + [DenseFeat(feat, 1,) for feat in dense_features]

In [23]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

## Split Dataset

In [24]:
train, test = train_test_split(df, test_size = 0.2)

In [25]:
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

## Modeling

In [169]:

def f1(y_true, y_pred):
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return K.mean(f1)

def f1_loss(y_true, y_pred):
    
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)


In [26]:
# optimizer = tf.keras.optimizers.Adam(lr=0.05, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model = DeepFM(linear_feature_columns, dnn_feature_columns, task = 'binary')
model.compile('adam', 'binary_crossentropy', metrics=['accuracy', 'binary_crossentropy'])
# model.compile(optimizer, f1_loss, metrics=['accuracy', f1])

In [27]:
history = model.fit(train_model_input, train['like'].values.astype(np.float32),
                    batch_size = 256,
                    epochs = 20,
                    verbose = 1,
                    validation_split = 0.2,)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [28]:
pred_ans = model.predict(test_model_input, batch_size = 256)

In [29]:
pred_ans.reshape(-1)

array([0.5420783 , 0.5551176 , 0.7504901 , ..., 0.3383872 , 0.40947878,
       0.39408386], dtype=float32)

In [30]:
pred = pred_ans.reshape(-1)

In [31]:
float_formatter = "{:.5f}".format
pred_reply = []
for i in range(len(pred)):
    pred_reply.append(float_formatter(pred[i]))
    # print(float_formatter(pred[i]))

In [32]:
pred_reply = np.array(pred_reply)
pred_reply = pred_reply.astype(np.float64)

In [33]:
pd.set_option("display.max_rows", 101)

test['pred'] = pred_reply
test[['reply', 'pred']].head(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,reply,pred
296706,0,0.54208
477813,0,0.55512
64815,0,0.75049
1677126,0,0.69816
596992,0,0.97602
1452891,0,0.25776
1080660,0,0.20211
1420636,0,0.71872
1384579,0,0.54993
1756308,0,0.29975


In [36]:
rce_like = compute_rce(pred_reply, test['like'])
rce_like

8.2642759018702

In [37]:
ap_like = average_precision_score(test['like'], pred_reply)
ap_like

0.6854284712142407