In [19]:
import sys
sys.path.append('../../..')

import tqdm
import pandas as pd
from tensorflow.python.keras.models import  save_model,load_model
from tensorflow.keras import optimizers
from deepctr.models import DeepFM
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from sklearn import preprocessing
from utils.preprocessing import read_data
import core.config as conf
import numpy as np
import tensorflow as tf 
from utils.evaluate import calculate_ctr, compute_rce, average_precision_score

In [2]:
data_path = conf.raw_data_path
df = pd.read_parquet(data_path + 'part-00043')

In [3]:
DONT_USE = ['text_ tokens', 'hashtags', 'tweet_id', 'engaged_with_user_id', 'enaging_user_id', 'language','present_links', 'present_domains', 'id']
features = [c for c in df.columns if c not in DONT_USE]

In [4]:
df = df[features]

In [5]:
df.head()

Unnamed: 0,present_media,tweet_type,tweet_timestamp,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,Photo\tPhoto,Retweet,1612787398,390,322,False,1600869556,219,215,False,1594373633,True,0,0,0,0
1,,TopLevel,1614058647,86775,375,False,1437682138,42,53,False,1517645474,False,0,0,0,0
2,Video,TopLevel,1613991576,276517,230,False,1562367304,111,123,False,1572527313,False,0,0,0,1614004646
3,,TopLevel,1612602066,330,499,False,1404678822,270,241,False,1422212371,True,0,0,0,1612606852
4,,TopLevel,1613303201,1005,543,False,1480686845,413,681,False,1493199020,False,0,0,0,0


In [6]:
sparse_features = ['present_media', 'tweet_type', 'engaged_with_user_is_verified', 'enaging_user_is_verified', 'engagee_follows_engager']
dense_features = ['tweet_timestamp', 'engaged_with_user_follower_count', 'engaged_with_user_following_count', 'engaged_with_user_account_creation', 'enaging_user_follower_count', 'enaging_user_following_count', 'enaging_user_account_creation']

In [7]:
target_timestamp = ['like_timestamp', 'reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp']
target = ['like', 'reply', 'retweet', 'comment']


In [8]:
df[target] = df[target_timestamp].applymap(lambda x : 1 if x > 0 else 0)

In [9]:
df.head()

Unnamed: 0,present_media,tweet_type,tweet_timestamp,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp,like,reply,retweet,comment
0,Photo\tPhoto,Retweet,1612787398,390,322,False,1600869556,219,215,False,1594373633,True,0,0,0,0,0,0,0,0
1,,TopLevel,1614058647,86775,375,False,1437682138,42,53,False,1517645474,False,0,0,0,0,0,0,0,0
2,Video,TopLevel,1613991576,276517,230,False,1562367304,111,123,False,1572527313,False,0,0,0,1614004646,1,0,0,0
3,,TopLevel,1612602066,330,499,False,1404678822,270,241,False,1422212371,True,0,0,0,1612606852,1,0,0,0
4,,TopLevel,1613303201,1005,543,False,1480686845,413,681,False,1493199020,False,0,0,0,0,0,0,0,0


## Preprocessing

In [10]:
for feat in sparse_features :
    lbe = LabelEncoder()
    df[feat] = lbe.fit_transform(df[feat])

In [11]:
mms = MinMaxScaler(feature_range = (0, 1))
df[dense_features] = mms.fit_transform(df[dense_features])

## Negative Sampling 

In [12]:
df_positive = df[df['like']==1]
df_negative = df[df['like']==0]
print(len(df_positive))
print(len(df_negative))

df_negative = df_negative.sample(n = len(df_positive))
df = pd.concat([df_positive, df_negative])
df = df.sample(frac=1)
df = df.reset_index(drop=True)

1208624
1824917


## Generate Feature Columns

In [13]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size = df[feat].max() +1, embedding_dim = 4) for feat in sparse_features]  + [DenseFeat(feat, 1,) for feat in dense_features]

In [14]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [15]:
train, test = train_test_split(df, test_size = 0.2)

In [16]:
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

## Modeling

In [25]:
model = DeepFM(linear_feature_columns, dnn_feature_columns, task = 'binary')
model.compile('adam', 'binary_crossentropy', metrics=['accuracy', 'binary_crossentropy', tf.keras.metrics.kl_divergence])


In [26]:
history = model.fit(train_model_input, train['like'].values.astype(np.float32),
                    batch_size = 256,
                    epochs = 50,
                    verbose = 1,
                    validation_split = 0.2,)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [24]:
pred_ans = model.predict(test_model_input, batch_size = 256)

In [25]:
rce_like = compute_rce(pred_ans, test['like_timestamp'])
rce_like

5.558013052157563

In [26]:
ap_like = average_precision_score(test['like_timestamp'], pred_ans)
ap_like

0.5358848036096607

In [28]:
save_model(model, 'DeepFM.h5')

In [39]:
from deepctr.utils import custom_objects

ImportError: cannot import name 'custom_objects' from 'deepctr.utils' (/home/nyongja/anaconda3/envs/dask/lib/python3.7/site-packages/deepctr/utils.py)

In [40]:
import deepctr

In [41]:
deepctr.__version__

'0.8.5'