In [54]:
import sys
sys.path.append('../../..')

import tqdm
import numpy as np
import pandas as pd
from tensorflow.keras import optimizers
from deepctr.models import DeepFM
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from sklearn import preprocessing
from utils.preprocessing import read_data
import core.config as conf
import tensorflow as tf
from tensorflow.keras import backend as K


from utils.evaluate import calculate_ctr, compute_rce, average_precision_score

In [21]:
data_path = conf.data_root + 'nvidia_models'
df = pd.read_parquet(f'{data_path}/train-final-te-reply-1.parquet')

In [22]:
DONT_USE = ['timestamp','creator_account_creation','engager_account_creation','engage_time',
            'creator_account_creation', 'engager_account_creation',
            'fold','tweet_id', 
            'tr','dt_day','','',
            'engager_user_id','creator_user_id','engager_is_verified',
            'elapsed_time',
            'links','domains','hashtags0','hashtags1',
            'hashtags','tweet_hash','dt_second','id',
            'tw_hash0',
            'tw_hash1',
            'tw_rt_uhash',
            'same_language', 'nan_language','language',
            'tw_hash', 'tw_freq_hash','tw_first_word', 'tw_second_word', 'tw_last_word', 'tw_llast_word',
            'ypred','creator_count_combined','creator_user_fer_count_delta_time','creator_user_fing_count_delta_time','creator_user_fering_count_delta_time','creator_user_fing_count_mode','creator_user_fer_count_mode','creator_user_fering_count_mode'
           ]

features = [c for c in df.columns if c not in DONT_USE]

print('Using %i features:'%(len(features)))
np.asarray(features)

Using 42 features:


array(['creator_follower_count', 'creator_following_count',
       'creator_is_verified', 'engager_follower_count',
       'engager_following_count', 'engager_follows_creator', 'reply',
       'retweet', 'retweet_comment', 'like', 'tweet_type', 'media',
       'len_hashtags', 'len_domains', 'len_links', 'dt_dow', 'dt_hour',
       'count_ats', 'count_char', 'count_words', 'tw_len',
       'TE_engager_user_id_tweet_type_language_reply',
       'TE_tw_first_word_tweet_type_language_reply',
       'TE_tw_last_word_tweet_type_language_reply',
       'TE_tw_hash0_tweet_type_language_reply',
       'TE_tw_hash1_tweet_type_language_reply',
       'TE_tw_rt_uhash_tweet_type_language_reply',
       'TE_creator_user_id_reply', 'TE_engager_user_id_reply',
       'TE_tw_hash_reply', 'TE_tw_freq_hash_reply',
       'TE_media_tweet_type_language_creator_is_verified_engager_is_verified_engager_follows_creator_reply',
       'TE_creator_count_combined_tweet_type_language_reply',
       'TE_creator_use

In [23]:
df = df[features]

In [24]:
df.head()

Unnamed: 0,creator_follower_count,creator_following_count,creator_is_verified,engager_follower_count,engager_following_count,engager_follows_creator,reply,retweet,retweet_comment,like,...,TE_creator_count_combined_tweet_type_language_reply,TE_creator_user_fer_count_delta_time_media_language_reply,TE_creator_user_fing_count_delta_time_media_language_reply,TE_creator_user_fering_count_delta_time_tweet_type_language_reply,TE_creator_user_fing_count_mode_media_language_reply,TE_creator_user_fer_count_mode_media_language_reply,TE_creator_user_fering_count_mode_tweet_type_language_reply,TE_domains_media_tweet_type_language_reply,TE_links_media_tweet_type_language_reply,TE_hashtags_media_tweet_type_language_reply
0,615,22,0,201,97,0,0,1,0,0,...,0.006714,0.031688,0.031688,0.006714,0.031773,0.031773,0.006774,0.00728,0.00728,0.007575
1,737043,389,1,442,503,0,0,0,0,0,...,0.050184,0.039951,0.039951,0.050184,0.040103,0.040103,0.050486,0.02184,,
2,2243,446,0,236,114,0,0,1,0,0,...,0.002921,0.027356,0.027356,0.002754,0.027915,0.027915,0.002921,0.005243,0.005243,
3,12374,352,0,1117,1118,1,0,0,0,1,...,0.035443,0.027843,0.027843,0.035443,0.028063,0.028063,0.035467,0.039339,0.039339,0.028081
4,66757,6945,0,2912,42,1,0,0,0,0,...,0.035443,0.027264,0.027264,0.035443,0.027248,0.027248,0.035467,0.045184,0.045184,0.043042


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3079047 entries, 0 to 3079046
Data columns (total 42 columns):
 #   Column                                                                                              Dtype  
---  ------                                                                                              -----  
 0   creator_follower_count                                                                              int32  
 1   creator_following_count                                                                             int32  
 2   creator_is_verified                                                                                 int8   
 3   engager_follower_count                                                                              int32  
 4   engager_following_count                                                                             int32  
 5   engager_follows_creator                                                                    

In [153]:
sparse_features = ['creator_is_verified', 'engager_follows_creator', 'tweet_type', 'media', 'dt_dow', 'dt_hour']
sparse_features += df.columns.values.tolist()[21:]
target = ['reply', 'like', 'retweet', 'retweet_comment']  
dense_features = [feat for feat in df.columns if (feat not in sparse_features) and (feat not in target)]

In [154]:
df[sparse_features] = df[sparse_features].fillna('-1', )
df[dense_features] = df[dense_features].fillna(0,)

In [155]:
df[target] = df[target].applymap(lambda x : 1 if x > 0 else 0)

In [156]:
df.head()

Unnamed: 0,creator_follower_count,creator_following_count,creator_is_verified,engager_follower_count,engager_following_count,engager_follows_creator,reply,retweet,retweet_comment,like,...,TE_creator_count_combined_tweet_type_language_reply,TE_creator_user_fer_count_delta_time_media_language_reply,TE_creator_user_fing_count_delta_time_media_language_reply,TE_creator_user_fering_count_delta_time_tweet_type_language_reply,TE_creator_user_fing_count_mode_media_language_reply,TE_creator_user_fer_count_mode_media_language_reply,TE_creator_user_fering_count_mode_tweet_type_language_reply,TE_domains_media_tweet_type_language_reply,TE_links_media_tweet_type_language_reply,TE_hashtags_media_tweet_type_language_reply
0,6.469334e-05,0.001072,0,1.56319e-06,0.000766,0,1,0,0,0,...,0.290946,0.251246,0.251246,0.311728,0.230496,0.230496,0.289187,0.133815,0.210848,0.117511
1,0.0001669446,0.000552,0,5.040806e-05,0.002342,1,0,0,0,1,...,0.225487,0.228282,0.228282,0.241593,0.210392,0.210392,0.219806,0.140428,0.221268,0.115781
2,9.51714e-07,3e-05,0,3.512354e-06,0.00049,1,0,0,0,0,...,0.210791,0.148512,0.148512,0.225847,0.140353,0.140353,0.210791,0.092878,0.146345,0.067859
3,0.001060813,0.000273,0,2.508824e-07,0.001785,0,0,0,0,1,...,0.285699,0.313002,0.313002,0.306106,0.282465,0.282465,0.271594,0.21071,0.33201,0.0
4,1.354065e-05,0.000437,0,2.246362e-05,0.003667,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Preprocessing

In [157]:
for feat in sparse_features :
    lbe = LabelEncoder()
    df[feat] = lbe.fit_transform(df[feat])

In [158]:
mms = MinMaxScaler(feature_range = (0, 1))
df[dense_features] = mms.fit_transform(df[dense_features])

## Negative Sampling 

In [159]:
df_positive = df[df['reply']==1]
df_negative = df[df['reply']==0]
print(len(df_positive))
print(len(df_negative))

90733
90733


In [160]:
df_negative = df_negative.sample(n = len(df_positive))

In [161]:
df = pd.concat([df_positive, df_negative])

In [162]:
df = df.sample(frac=1)

In [163]:
df = df.reset_index(drop=True)

In [164]:
df

Unnamed: 0,creator_follower_count,creator_following_count,creator_is_verified,engager_follower_count,engager_following_count,engager_follows_creator,reply,retweet,retweet_comment,like,...,TE_creator_count_combined_tweet_type_language_reply,TE_creator_user_fer_count_delta_time_media_language_reply,TE_creator_user_fing_count_delta_time_media_language_reply,TE_creator_user_fering_count_delta_time_tweet_type_language_reply,TE_creator_user_fing_count_mode_media_language_reply,TE_creator_user_fer_count_mode_media_language_reply,TE_creator_user_fering_count_mode_tweet_type_language_reply,TE_domains_media_tweet_type_language_reply,TE_links_media_tweet_type_language_reply,TE_hashtags_media_tweet_type_language_reply
0,0.000010,0.000253,0,0.000031,0.003775,0,0,0,0,1,...,208,1349,1349,176,1316,1316,171,702,516,578
1,0.000008,0.000135,0,0.000382,0.006268,1,0,0,0,1,...,131,275,275,103,250,250,98,326,261,325
2,0.000192,0.000176,0,0.000009,0.005227,0,0,0,0,0,...,605,1432,1432,518,1454,1454,483,2101,1325,1389
3,0.000101,0.001007,0,0.000094,0.002662,0,0,0,0,1,...,200,1334,1334,170,1302,1302,166,701,515,576
4,0.000001,0.000104,0,0.000234,0.007102,1,1,0,0,0,...,0,0,0,0,0,0,0,2303,1473,1651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181461,0.000003,0.000103,0,0.000236,0.005941,1,0,0,0,1,...,662,506,506,568,498,498,567,1822,1137,1267
181462,0.000016,0.000486,0,0.000275,0.007369,1,1,0,0,0,...,947,570,570,806,517,517,794,2508,1622,1992
181463,0.001647,0.000206,0,0.008900,0.233451,1,0,0,0,1,...,754,679,679,644,664,664,638,1822,1137,1267
181464,0.000021,0.000216,0,0.000646,0.016129,1,1,0,0,0,...,1525,1606,1606,1303,1577,1577,1289,3778,2607,2953


## Generate Feature Columns

In [165]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size = df[feat].max() +1, embedding_dim = 4) for feat in sparse_features]  + [DenseFeat(feat, 1,) for feat in dense_features]

In [166]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

## Split Dataset

In [167]:
train, test = train_test_split(df, test_size = 0.2)

In [168]:
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

## Modeling

In [169]:

def f1(y_true, y_pred):
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return K.mean(f1)

def f1_loss(y_true, y_pred):
    
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)


In [176]:
# optimizer = tf.keras.optimizers.Adam(lr=0.05, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model = DeepFM(linear_feature_columns, dnn_feature_columns, task = 'binary')
model.compile('adam', 'binary_crossentropy', metrics=['accuracy', 'binary_crossentropy'])
# model.compile(optimizer, f1_loss, metrics=['accuracy', f1])

In [177]:
history = model.fit(train_model_input, train['reply'].values.astype(np.float32),
                    batch_size = 256,
                    epochs = 100,
                    verbose = 1,
                    validation_split = 0.2,)

6 - val_binary_crossentropy: 0.6165
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [178]:
pred_ans = model.predict(test_model_input, batch_size = 256)

In [187]:
pred_ans.reshape(-1)

array([6.3042974e-01, 1.0000000e+00, 7.3082703e-01, ..., 8.7730626e-08,
       9.7275877e-01, 9.8262084e-01], dtype=float32)

In [196]:
pred = pred_ans.reshape(-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [239]:
float_formatter = "{:.5f}".format
pred_reply = []
for i in range(len(pred)):
    pred_reply.append(float_formatter(pred[i]))
    # print(float_formatter(pred[i]))

In [240]:
pred_reply = np.array(pred_reply)
pred_reply = pred_reply.astype(np.float64)

In [241]:
pd.set_option("display.max_rows", 101)

test['pred'] = pred_reply
test[['reply', 'pred']].head(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,reply,pred
33469,1,0.63043
35913,1,1.0
6047,1,0.73083
176012,1,0.99974
154969,1,0.64813
146939,1,0.98798
13744,0,0.01696
37972,0,0.03941
69769,0,0.01566
118679,1,1.0


In [242]:
rce_like = compute_rce(pred_reply, test['reply'])
rce_like

-581.0918912052389

In [243]:
ap_like = average_precision_score(test['reply'], pred_reply)
ap_like

0.6079216453871057