In [1]:
import sys
sys.path.append('../../..')

import tqdm
import pandas as pd
from tensorflow.python.keras.models import  save_model,load_model
from tensorflow.keras import optimizers
from deepctr.models import DeepFM
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from sklearn import preprocessing
from utils.preprocessing import read_data
import core.config as conf
import numpy as np

from utils.evaluate import calculate_ctr, compute_rce, average_precision_score

In [2]:
data_path = conf.raw_data_path
df = pd.read_parquet(data_path + 'part-00000')

In [3]:
DONT_USE = ['text_tokens', 'hashtags', 'tweet_id', 'engaged_with_user_id', 'enaging_user_id', 'language','present_links', 'present_domains', 'id']
features = [c for c in df.columns if c not in DONT_USE]

In [4]:
df = df[features]

In [5]:
df.head()

Unnamed: 0,text_ tokens,present_media,tweet_type,tweet_timestamp,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,101\t56898\t137\t16925\t10731\t11481\t13980\t1...,,Retweet,1613237034,2473,662,False,1261859734,169,339,False,1520886748,False,0,0,0,0
1,101\t10159\t11322\t58550\t10836\t10126\t25900\...,,TopLevel,1613748600,4418640,228,True,1266804490,393,1190,False,1237570695,False,0,0,0,0
2,101\t10159\t56314\t14364\t10109\t14657\t11359\...,,TopLevel,1613386238,219715,3685,True,1202617218,629,1473,False,1263176351,False,0,0,0,1613388292
3,101\t56898\t137\t10224\t10731\t64194\t41939\t3...,,Retweet,1613708640,2388283,13511,True,1251645191,123,200,False,1268276559,False,0,0,0,0
4,101\t7143\t1938\t113\t100\t10097\t100\t10097\t...,Photo,TopLevel,1612586018,414,720,False,1578273274,134,379,False,1483862063,True,0,0,0,1612587384


In [6]:
sparse_features = ['present_media', 'tweet_type', 'engaged_with_user_is_verified', 'enaging_user_is_verified', 'engagee_follows_engager']
dense_features = ['tweet_timestamp', 'engaged_with_user_follower_count', 'engaged_with_user_following_count', 'engaged_with_user_account_creation', 'enaging_user_follower_count', 'enaging_user_following_count', 'enaging_user_account_creation']

In [7]:
target = ['like_timestamp', 'reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp']

In [8]:
df[target] = df[target].applymap(lambda x : 1 if x > 0 else 0)

In [9]:
df.head()

Unnamed: 0,text_ tokens,present_media,tweet_type,tweet_timestamp,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,101\t56898\t137\t16925\t10731\t11481\t13980\t1...,,Retweet,1613237034,2473,662,False,1261859734,169,339,False,1520886748,False,0,0,0,0
1,101\t10159\t11322\t58550\t10836\t10126\t25900\...,,TopLevel,1613748600,4418640,228,True,1266804490,393,1190,False,1237570695,False,0,0,0,0
2,101\t10159\t56314\t14364\t10109\t14657\t11359\...,,TopLevel,1613386238,219715,3685,True,1202617218,629,1473,False,1263176351,False,0,0,0,1
3,101\t56898\t137\t10224\t10731\t64194\t41939\t3...,,Retweet,1613708640,2388283,13511,True,1251645191,123,200,False,1268276559,False,0,0,0,0
4,101\t7143\t1938\t113\t100\t10097\t100\t10097\t...,Photo,TopLevel,1612586018,414,720,False,1578273274,134,379,False,1483862063,True,0,0,0,1


## Preprocessing

## Negative Sampling

In [12]:
df_positive = df[df['like_timestamp']==1]
df_negative = df[df['like_timestamp']==0]
print(len(df_positive))
print(len(df_negative))

1207588
1826314


In [13]:
df_negative = df_negative.sample(n = len(df_positive), random_state=777)

In [14]:
df = pd.concat([df_positive, df_negative])

In [15]:
df  = df.sample(frac = 1)

## Encoding and Scaling

In [16]:
for feat in sparse_features :
    lbe = LabelEncoder()
    df[feat] = lbe.fit_transform(df[feat])

In [17]:
mms = MinMaxScaler(feature_range = (0, 1))
df[dense_features] = mms.fit_transform(df[dense_features])

## Generate Feature Columns

In [18]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size = df[feat].max() +1, embedding_dim = 4) for feat in sparse_features]  + [DenseFeat(feat, 1,) for feat in dense_features]

In [19]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

## Split Dataset

In [21]:
train, test = train_test_split(df, test_size = 0.2)

In [22]:
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

## Modeling

In [24]:
model = DeepFM(linear_feature_columns, dnn_feature_columns, task = 'binary')

In [25]:
from tensorflow.keras import backend as K

def recall(y_target, y_pred):
    # clip(t, clip_value_min, clip_value_max) : clip_value_min~clip_value_max 이외 가장자리를 깎아 낸다
    # round : 반올림한다
    y_target_yn = K.round(K.clip(y_target, 0, 1)) # 실제값을 0(Negative) 또는 1(Positive)로 설정한다
    y_pred_yn = K.round(K.clip(y_pred, 0, 1)) # 예측값을 0(Negative) 또는 1(Positive)로 설정한다

    # True Positive는 실제 값과 예측 값이 모두 1(Positive)인 경우이다
    count_true_positive = K.sum(y_target_yn * y_pred_yn) 

    # (True Positive + False Negative) = 실제 값이 1(Positive) 전체
    count_true_positive_false_negative = K.sum(y_target_yn)

    # Recall =  (True Positive) / (True Positive + False Negative)
    # K.epsilon()는 'divide by zero error' 예방차원에서 작은 수를 더한다
    recall = count_true_positive / (count_true_positive_false_negative + K.epsilon())

    # return a single tensor value
    return recall


def precision(y_target, y_pred):
    # clip(t, clip_value_min, clip_value_max) : clip_value_min~clip_value_max 이외 가장자리를 깎아 낸다
    # round : 반올림한다
    y_pred_yn = K.round(K.clip(y_pred, 0, 1)) # 예측값을 0(Negative) 또는 1(Positive)로 설정한다
    y_target_yn = K.round(K.clip(y_target, 0, 1)) # 실제값을 0(Negative) 또는 1(Positive)로 설정한다

    # True Positive는 실제 값과 예측 값이 모두 1(Positive)인 경우이다
    count_true_positive = K.sum(y_target_yn * y_pred_yn) 

    # (True Positive + False Positive) = 예측 값이 1(Positive) 전체
    count_true_positive_false_positive = K.sum(y_pred_yn)

    # Precision = (True Positive) / (True Positive + False Positive)
    # K.epsilon()는 'divide by zero error' 예방차원에서 작은 수를 더한다
    precision = count_true_positive / (count_true_positive_false_positive + K.epsilon())

    # return a single tensor value
    return precision


def f1score(y_target, y_pred):
    _recall = recall(y_target, y_pred)
    _precision = precision(y_target, y_pred)
    # K.epsilon()는 'divide by zero error' 예방차원에서 작은 수를 더한다
    _f1score = ( 2 * _recall * _precision) / (_recall + _precision+ K.epsilon())
    
    # return a single tensor value
    return _f1score

In [26]:
import tensorflow as tf

def f1(y_true, y_pred):
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return K.mean(f1)

def f1_loss(y_true, y_pred):
    
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)


In [27]:
model.compile("adam", "binary_crossentropy", 
             metrics = ['binary_crossentropy'])
        

In [28]:
y_train =  train['like_timestamp'].values

In [29]:
history = model.fit(train_model_input, y_train,
                    batch_size = 32,
                    epochs = 30,
                    verbose = 1,
                    validation_split = 0.2,)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [33]:
y_test = test['like_timestamp']

In [37]:
pred = model.predict(test_model_input, batch_size = 256)

In [39]:
rce_like = compute_rce(pred, y_test)
rce_like

5.980581884560099

In [40]:
ap_like = average_precision_score(y_test, pred)
ap_like

0.6397470597584911

In [28]:
save_model(model, 'DeepFM.h5')

In [40]:
import deepctr

In [41]:
deepctr.__version__

'0.8.5'