In [2]:
import sys
sys.path.append('../../..')

import tqdm
import pandas as pd
from tensorflow.python.keras.models import  save_model,load_model
from tensorflow.keras import optimizers
from deepctr.models import DeepFM
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from sklearn import preprocessing
from utils.preprocessing import read_data
import core.config as conf
import numpy as np

from utils.evaluate import calculate_ctr, compute_rce, average_precision_score

In [3]:
data_path = conf.raw_data_path
df = pd.read_parquet(data_path + 'part-00043')

In [4]:
DONT_USE = ['text_ tokens', 'hashtags', 'tweet_id', 'engaged_with_user_id', 'enaging_user_id', 'language','present_links', 'present_domains', 'id']
features = [c for c in df.columns if c not in DONT_USE]

In [5]:
df = df[features]

In [6]:
df.head()

Unnamed: 0,present_media,tweet_type,tweet_timestamp,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,Photo\tPhoto,Retweet,1612787398,390,322,False,1600869556,219,215,False,1594373633,True,0,0,0,0
1,,TopLevel,1614058647,86775,375,False,1437682138,42,53,False,1517645474,False,0,0,0,0
2,Video,TopLevel,1613991576,276517,230,False,1562367304,111,123,False,1572527313,False,0,0,0,1614004646
3,,TopLevel,1612602066,330,499,False,1404678822,270,241,False,1422212371,True,0,0,0,1612606852
4,,TopLevel,1613303201,1005,543,False,1480686845,413,681,False,1493199020,False,0,0,0,0


In [7]:
sparse_features = ['present_media', 'tweet_type', 'engaged_with_user_is_verified', 'enaging_user_is_verified', 'engagee_follows_engager']
dense_features = ['tweet_timestamp', 'engaged_with_user_follower_count', 'engaged_with_user_following_count', 'engaged_with_user_account_creation', 'enaging_user_follower_count', 'enaging_user_following_count', 'enaging_user_account_creation']

In [8]:
target = ['like_timestamp', 'reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp']

In [9]:
df[target] = df[target].applymap(lambda x : 1 if x > 0 else 0)

In [10]:
df.head()

Unnamed: 0,present_media,tweet_type,tweet_timestamp,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,Photo\tPhoto,Retweet,1612787398,390,322,False,1600869556,219,215,False,1594373633,True,0,0,0,0
1,,TopLevel,1614058647,86775,375,False,1437682138,42,53,False,1517645474,False,0,0,0,0
2,Video,TopLevel,1613991576,276517,230,False,1562367304,111,123,False,1572527313,False,0,0,0,1
3,,TopLevel,1612602066,330,499,False,1404678822,270,241,False,1422212371,True,0,0,0,1
4,,TopLevel,1613303201,1005,543,False,1480686845,413,681,False,1493199020,False,0,0,0,0


## Preprocessing

In [11]:
for feat in sparse_features :
    lbe = LabelEncoder()
    df[feat] = lbe.fit_transform(df[feat])

In [12]:
mms = MinMaxScaler(feature_range = (0, 1))
df[dense_features] = mms.fit_transform(df[dense_features])

## Generate Feature Columns

In [13]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size = df[feat].max() +1, embedding_dim = 4) for feat in sparse_features]  + [DenseFeat(feat, 1,) for feat in dense_features]

In [14]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

## Split Dataset

In [15]:
train, test = train_test_split(df, test_size = 0.2)

In [16]:
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

## Modeling

In [17]:
model = DeepFM(linear_feature_columns, dnn_feature_columns, task = 'binary')

In [18]:
from tensorflow.keras import backend as K

def recall(y_target, y_pred):
    # clip(t, clip_value_min, clip_value_max) : clip_value_min~clip_value_max 이외 가장자리를 깎아 낸다
    # round : 반올림한다
    y_target_yn = K.round(K.clip(y_target, 0, 1)) # 실제값을 0(Negative) 또는 1(Positive)로 설정한다
    y_pred_yn = K.round(K.clip(y_pred, 0, 1)) # 예측값을 0(Negative) 또는 1(Positive)로 설정한다

    # True Positive는 실제 값과 예측 값이 모두 1(Positive)인 경우이다
    count_true_positive = K.sum(y_target_yn * y_pred_yn) 

    # (True Positive + False Negative) = 실제 값이 1(Positive) 전체
    count_true_positive_false_negative = K.sum(y_target_yn)

    # Recall =  (True Positive) / (True Positive + False Negative)
    # K.epsilon()는 'divide by zero error' 예방차원에서 작은 수를 더한다
    recall = count_true_positive / (count_true_positive_false_negative + K.epsilon())

    # return a single tensor value
    return recall


def precision(y_target, y_pred):
    # clip(t, clip_value_min, clip_value_max) : clip_value_min~clip_value_max 이외 가장자리를 깎아 낸다
    # round : 반올림한다
    y_pred_yn = K.round(K.clip(y_pred, 0, 1)) # 예측값을 0(Negative) 또는 1(Positive)로 설정한다
    y_target_yn = K.round(K.clip(y_target, 0, 1)) # 실제값을 0(Negative) 또는 1(Positive)로 설정한다

    # True Positive는 실제 값과 예측 값이 모두 1(Positive)인 경우이다
    count_true_positive = K.sum(y_target_yn * y_pred_yn) 

    # (True Positive + False Positive) = 예측 값이 1(Positive) 전체
    count_true_positive_false_positive = K.sum(y_pred_yn)

    # Precision = (True Positive) / (True Positive + False Positive)
    # K.epsilon()는 'divide by zero error' 예방차원에서 작은 수를 더한다
    precision = count_true_positive / (count_true_positive_false_positive + K.epsilon())

    # return a single tensor value
    return precision


def f1score(y_target, y_pred):
    _recall = recall(y_target, y_pred)
    _precision = precision(y_target, y_pred)
    # K.epsilon()는 'divide by zero error' 예방차원에서 작은 수를 더한다
    _f1score = ( 2 * _recall * _precision) / (_recall + _precision+ K.epsilon())
    
    # return a single tensor value
    return _f1score

In [29]:
import tensorflow as tf

def f1(y_true, y_pred):
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return K.mean(f1)

def f1_loss(y_true, y_pred):
    
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)


In [30]:
model.compile("adam", f1_loss, metrics=['accuracy', f1])
        

In [31]:
a =  train['like_timestamp'].values

In [32]:
a = a.astype(np.float32)

In [28]:
history = model.fit(train_model_input, a,
                    batch_size = 256,
                    epochs = 5,
                    verbose = 1,
                    validation_split = 0.2,)

Epoch 1/5


AttributeError: in user code:

    /home/hyez/anaconda3/envs/dask-cudf/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    <ipython-input-20-193183065d88>:28 f1_loss  *
        f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)

    AttributeError: module 'tensorflow' has no attribute 'is_nan'


In [24]:
pred_ans = model.predict(test_model_input, batch_size = 256)

In [25]:
rce_like = compute_rce(pred_ans, test['like_timestamp'])
rce_like

5.558013052157563

In [26]:
ap_like = average_precision_score(test['like_timestamp'], pred_ans)
ap_like

0.5358848036096607

In [28]:
save_model(model, 'DeepFM.h5')

In [39]:
from deepctr.utils import custom_objects

ImportError: cannot import name 'custom_objects' from 'deepctr.utils' (/home/nyongja/anaconda3/envs/dask/lib/python3.7/site-packages/deepctr/utils.py)

In [40]:
import deepctr

In [41]:
deepctr.__version__

'0.8.5'