In [2]:
import sys
sys.path.append('../../..')

import tqdm
import numpy as np
import pandas as pd
from tensorflow.keras import optimizers
from deepctr.models import DeepFM
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from sklearn import preprocessing
from utils.preprocessing import read_data
import core.config as conf
import tensorflow as tf
from tensorflow.keras import backend as K


from utils.evaluate import calculate_ctr, compute_rce, average_precision_score

In [112]:
data_path = conf.data_root + 'nvidia_models'
df = pd.read_parquet(f'{data_path}/train-final-te-retweet-1.parquet')

In [113]:
DONT_USE = ['timestamp','creator_account_creation','engager_account_creation','engage_time',
            'creator_account_creation', 'engager_account_creation',
            'fold','tweet_id', 
            'tr','dt_day','','',
            'engager_user_id','creator_user_id','engager_is_verified',
            'elapsed_time',
            'links','domains','hashtags0','hashtags1',
            'hashtags','tweet_hash','dt_second','id',
            'tw_hash0',
            'tw_hash1',
            'tw_rt_uhash',
            'same_language', 'nan_language','language',
            'tw_hash', 'tw_freq_hash','tw_first_word', 'tw_second_word', 'tw_last_word', 'tw_llast_word',
            'ypred','creator_count_combined','creator_user_fer_count_delta_time','creator_user_fing_count_delta_time','creator_user_fering_count_delta_time','creator_user_fing_count_mode','creator_user_fer_count_mode','creator_user_fering_count_mode'
           ]

features = [c for c in df.columns if c not in DONT_USE]

print('Using %i features:'%(len(features)))
np.asarray(features)

Using 42 features:


array(['creator_follower_count', 'creator_following_count',
       'creator_is_verified', 'engager_follower_count',
       'engager_following_count', 'engager_follows_creator', 'reply',
       'retweet', 'retweet_comment', 'like', 'tweet_type', 'media',
       'len_hashtags', 'len_domains', 'len_links', 'dt_dow', 'dt_hour',
       'count_ats', 'count_char', 'count_words', 'tw_len',
       'TE_engager_user_id_tweet_type_language_retweet',
       'TE_tw_first_word_tweet_type_language_retweet',
       'TE_tw_last_word_tweet_type_language_retweet',
       'TE_tw_hash0_tweet_type_language_retweet',
       'TE_tw_hash1_tweet_type_language_retweet',
       'TE_tw_rt_uhash_tweet_type_language_retweet',
       'TE_creator_user_id_retweet', 'TE_engager_user_id_retweet',
       'TE_tw_hash_retweet', 'TE_tw_freq_hash_retweet',
       'TE_media_tweet_type_language_creator_is_verified_engager_is_verified_engager_follows_creator_retweet',
       'TE_creator_count_combined_tweet_type_language_retweet'

In [114]:
df = df[features]

In [115]:
df.head()

Unnamed: 0,creator_follower_count,creator_following_count,creator_is_verified,engager_follower_count,engager_following_count,engager_follows_creator,reply,retweet,retweet_comment,like,...,TE_creator_count_combined_tweet_type_language_retweet,TE_creator_user_fer_count_delta_time_media_language_retweet,TE_creator_user_fing_count_delta_time_media_language_retweet,TE_creator_user_fering_count_delta_time_tweet_type_language_retweet,TE_creator_user_fing_count_mode_media_language_retweet,TE_creator_user_fer_count_mode_media_language_retweet,TE_creator_user_fering_count_mode_tweet_type_language_retweet,TE_domains_media_tweet_type_language_retweet,TE_links_media_tweet_type_language_retweet,TE_hashtags_media_tweet_type_language_retweet
0,615,22,0,201,97,0,0,1,0,0,...,0.08669,0.074553,0.074553,0.08669,0.074605,0.074605,0.086573,0.086066,0.086066,0.084328
1,737043,389,1,442,503,0,0,0,0,0,...,0.065006,0.068069,0.068069,0.065006,0.067918,0.067918,0.064688,0.064897,,
2,2243,446,0,236,114,0,0,1,0,0,...,0.093806,0.049899,0.049899,0.094561,0.051428,0.051428,0.093806,0.083682,0.083682,
3,12374,352,0,1117,1118,1,0,0,0,1,...,0.048169,0.069458,0.069458,0.048169,0.069164,0.069164,0.047914,0.054768,0.054768,0.083439
4,66757,6945,0,2912,42,1,0,0,0,0,...,0.048169,0.045479,0.045479,0.048169,0.045439,0.045439,0.047914,0.02366,0.02366,0.026696


In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3079047 entries, 0 to 3079046
Data columns (total 42 columns):
 #   Column                                                                                                Dtype  
---  ------                                                                                                -----  
 0   creator_follower_count                                                                                int32  
 1   creator_following_count                                                                               int32  
 2   creator_is_verified                                                                                   int8   
 3   engager_follower_count                                                                                int32  
 4   engager_following_count                                                                               int32  
 5   engager_follows_creator                                                      

In [117]:
sparse_features = ['creator_is_verified', 'engager_follows_creator', 'tweet_type', 'media', 'dt_dow', 'dt_hour']
sparse_features += df.columns.values.tolist()[21:]
target = ['reply', 'like', 'retweet', 'retweet_comment']  
dense_features = [feat for feat in df.columns if (feat not in sparse_features) and (feat not in target)]

In [118]:
df[sparse_features] = df[sparse_features].fillna('-1', )
df[dense_features] = df[dense_features].fillna(0,)

In [119]:
df[target] = df[target].applymap(lambda x : 1 if x > 0 else 0)

In [120]:
df.head()

Unnamed: 0,creator_follower_count,creator_following_count,creator_is_verified,engager_follower_count,engager_following_count,engager_follows_creator,reply,retweet,retweet_comment,like,...,TE_creator_count_combined_tweet_type_language_retweet,TE_creator_user_fer_count_delta_time_media_language_retweet,TE_creator_user_fing_count_delta_time_media_language_retweet,TE_creator_user_fering_count_delta_time_tweet_type_language_retweet,TE_creator_user_fing_count_mode_media_language_retweet,TE_creator_user_fer_count_mode_media_language_retweet,TE_creator_user_fering_count_mode_tweet_type_language_retweet,TE_domains_media_tweet_type_language_retweet,TE_links_media_tweet_type_language_retweet,TE_hashtags_media_tweet_type_language_retweet
0,615,22,0,201,97,0,0,1,0,0,...,0.0866901,0.0745531,0.0745531,0.0866901,0.074605,0.074605,0.0865726,0.086066,0.086066,0.0843278
1,737043,389,1,442,503,0,0,0,0,0,...,0.0650061,0.0680689,0.0680689,0.0650061,0.0679184,0.0679184,0.0646877,0.0648966,-1.0,-1.0
2,2243,446,0,236,114,0,0,1,0,0,...,0.0938059,0.0498988,0.0498988,0.0945607,0.0514282,0.0514282,0.0938059,0.0836817,0.0836817,-1.0
3,12374,352,0,1117,1118,1,0,0,0,1,...,0.0481695,0.069458,0.069458,0.0481695,0.0691641,0.0691641,0.047914,0.054768,0.054768,0.0834385
4,66757,6945,0,2912,42,1,0,0,0,0,...,0.0481695,0.0454793,0.0454793,0.0481695,0.0454394,0.0454394,0.047914,0.0236601,0.0236601,0.0266961


## Preprocessing

In [121]:
df[sparse_features] = df[sparse_features].astype(np.float32)

In [122]:
for feat in sparse_features :
    lbe = LabelEncoder()
    df[feat] = lbe.fit_transform(df[feat])

In [123]:
mms = MinMaxScaler(feature_range = (0, 1))
df[dense_features] = mms.fit_transform(df[dense_features])

## Negative Sampling 

In [124]:
df_positive = df[df['retweet']==1]
df_negative = df[df['retweet']==0]
print(len(df_positive))
print(len(df_negative))

269540
2809507


In [40]:
df_negative = df_negative.sample(n = len(df_positive))

In [41]:
df = pd.concat([df_positive, df_negative])

In [42]:
df = df.sample(frac=1)

In [43]:
df = df.reset_index(drop=True)

In [44]:
df

Unnamed: 0,creator_follower_count,creator_following_count,creator_is_verified,engager_follower_count,engager_following_count,engager_follows_creator,reply,retweet,retweet_comment,like,...,TE_creator_count_combined_tweet_type_language_reply,TE_creator_user_fer_count_delta_time_media_language_reply,TE_creator_user_fing_count_delta_time_media_language_reply,TE_creator_user_fering_count_delta_time_tweet_type_language_reply,TE_creator_user_fing_count_mode_media_language_reply,TE_creator_user_fer_count_mode_media_language_reply,TE_creator_user_fering_count_mode_tweet_type_language_reply,TE_domains_media_tweet_type_language_reply,TE_links_media_tweet_type_language_reply,TE_hashtags_media_tweet_type_language_reply
0,0.000008,0.000019,0,0.000041,0.008112,0,0,0,0,1,...,1329,1375,1375,1138,1363,1363,1133,3416,2367,2557
1,0.000065,0.000174,0,0.000510,0.004588,0,0,0,0,1,...,1329,1375,1375,1138,1363,1363,1133,3416,2367,2557
2,0.000006,0.000209,0,0.000150,0.005763,1,1,0,0,0,...,1283,1300,1300,1101,1289,1289,1092,3369,2327,2523
3,0.000008,0.000210,0,0.000324,0.002283,1,1,0,0,1,...,1126,1163,1163,953,1155,1155,960,3270,0,2564
4,0.060340,0.000551,1,0.000003,0.000598,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181461,0.000048,0.001546,0,0.000437,0.008239,1,0,0,0,0,...,1126,1163,1163,953,1155,1155,960,1907,0,2245
181462,0.000003,0.000079,0,0.000415,0.002405,1,0,1,1,0,...,191,1350,1350,164,1320,1320,160,708,518,570
181463,0.000040,0.000019,0,0.000098,0.002178,0,0,0,0,0,...,991,840,840,833,818,818,822,2376,1536,1681
181464,0.000002,0.000011,0,0.000010,0.000537,0,1,0,0,1,...,1329,1375,1375,1138,1363,1363,1133,3416,2367,0


## Generate Feature Columns

In [45]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size = df[feat].max() +1, embedding_dim = 4) for feat in sparse_features]  + [DenseFeat(feat, 1,) for feat in dense_features]

In [46]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

## Split Dataset

In [47]:
train, test = train_test_split(df, test_size = 0.2)

In [48]:
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

## Modeling

In [49]:

def f1(y_true, y_pred):
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return K.mean(f1)

def f1_loss(y_true, y_pred):
    
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)


In [99]:

def mAP(y_true, y_pred):
    return tf.reduce_mean(tf.metrics.sparse_average_precision_at_k(tf.cast(y_true, tf.int64), y_pred, 1)[0])

In [100]:
# optimizer = tf.keras.optimizers.Adam(lr=0.05, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model = DeepFM(linear_feature_columns, dnn_feature_columns, task = 'binary')
model.compile('adam', 'binary_crossentropy', metrics=['accuracy', 'binary_crossentropy', tf.keras.metrics.kl_divergence])
# model.compile(optimizer, f1_loss, metrics=['accuracy', f1])

In [101]:
history = model.fit(train_model_input, train['reply'].values.astype(np.float32),
                    batch_size = 256,
                    epochs = 50,
                    verbose = 1,
                    validation_split = 0.2,)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [102]:
pred_ans = model.predict(test_model_input, batch_size = 256)

In [103]:
pred_ans.reshape(-1)

array([9.0173162e-14, 1.0000000e+00, 7.9729486e-01, ..., 1.6947210e-02,
       5.7837087e-01, 6.8724453e-03], dtype=float32)

In [104]:
pred = pred_ans.reshape(-1)

In [105]:
float_formatter = "{:.5f}".format
pred_reply = []
for i in range(len(pred)):
    pred_reply.append(float_formatter(pred[i]))
    # print(float_formatter(pred[i]))

In [106]:
pred_reply = np.array(pred_reply)
pred_reply = pred_reply.astype(np.float64)

In [107]:
pd.set_option("display.max_rows", 101)

test['pred'] = pred_reply
test[['reply', 'pred']].head(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,reply,pred
154290,0,0.0
152055,1,1.0
165721,0,0.79729
163370,0,0.83698
66775,1,0.64947
41774,0,0.0
129568,0,0.99999
33101,1,1.0
148242,1,0.99642
157910,0,0.31408


In [108]:
rce_like = compute_rce(pred_reply, test['reply'])
rce_like

-474.17020469502324

In [109]:
ap_like = average_precision_score(test['reply'], pred_reply)
ap_like

0.6136799571285597