In [27]:
import sys
sys.path.append('../..')

import tqdm
import pandas as pd
from tensorflow.python.keras.models import  save_model,load_model
from tensorflow.keras import optimizers
from deepctr.models import DeepFM
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from sklearn import preprocessing
from utils.preprocessing import read_data
import core.config as conf

from utils.evaluate import calculate_ctr, compute_rce, average_precision_score

In [2]:
data_path = conf.raw_data_path
df = pd.read_parquet(data_path + 'part-00000')

In [3]:
DONT_USE = ['text_ tokens', 'hashtags', 'tweet_id', 'engaged_with_user_id', 'enaging_user_id', 'language','present_links', 'present_domains', 'id']
features = [c for c in df.columns if c not in DONT_USE]

In [4]:
df = df[features]

In [5]:
df.head()

Unnamed: 0,present_media,tweet_type,tweet_timestamp,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,,Retweet,1613237034,2473,662,False,1261859734,169,339,False,1520886748,False,0,0,0,0
1,,TopLevel,1613748600,4418640,228,True,1266804490,393,1190,False,1237570695,False,0,0,0,0
2,,TopLevel,1613386238,219715,3685,True,1202617218,629,1473,False,1263176351,False,0,0,0,1613388292
3,,Retweet,1613708640,2388283,13511,True,1251645191,123,200,False,1268276559,False,0,0,0,0
4,Photo,TopLevel,1612586018,414,720,False,1578273274,134,379,False,1483862063,True,0,0,0,1612587384


In [6]:
sparse_features = ['present_media', 'tweet_type', 'engaged_with_user_is_verified', 'enaging_user_is_verified', 'engagee_follows_engager']
dense_features = ['tweet_timestamp', 'engaged_with_user_follower_count', 'engaged_with_user_following_count', 'engaged_with_user_account_creation', 'enaging_user_follower_count', 'enaging_user_following_count', 'enaging_user_account_creation']

In [7]:
target = ['like_timestamp', 'reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp']

In [8]:
df[target] = df[target].applymap(lambda x : 1 if x > 0 else 0)

In [9]:
df.head()

Unnamed: 0,present_media,tweet_type,tweet_timestamp,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,,Retweet,1613237034,2473,662,False,1261859734,169,339,False,1520886748,False,0,0,0,0
1,,TopLevel,1613748600,4418640,228,True,1266804490,393,1190,False,1237570695,False,0,0,0,0
2,,TopLevel,1613386238,219715,3685,True,1202617218,629,1473,False,1263176351,False,0,0,0,1
3,,Retweet,1613708640,2388283,13511,True,1251645191,123,200,False,1268276559,False,0,0,0,0
4,Photo,TopLevel,1612586018,414,720,False,1578273274,134,379,False,1483862063,True,0,0,0,1


## Preprocessing

In [10]:
for feat in sparse_features :
    lbe = LabelEncoder()
    df[feat] = lbe.fit_transform(df[feat])

In [11]:
mms = MinMaxScaler(feature_range = (0, 1))
df[dense_features] = mms.fit_transform(df[dense_features])

## Generate Feature Columns

In [12]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size = df[feat].max() +1, embedding_dim = 4) for feat in sparse_features]  + [DenseFeat(feat, 1,) for feat in dense_features]

In [13]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

## Split Dataset

In [14]:
train, test = train_test_split(df, test_size = 0.2)

In [15]:
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

## Modeling

In [16]:
model = DeepFM(linear_feature_columns, dnn_feature_columns, task = 'binary')

In [17]:
model.compile("adam", "binary_crossentropy", 
             metrics = ['binary_crossentropy'])

In [18]:
history = model.fit(train_model_input, train['like_timestamp'].values,
                    batch_size = 256,
                    epochs = 5,
                    verbose = 1,
                    validation_split = 0.2,)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
pred_ans = model.predict(test_model_input, batch_size = 256)

In [20]:
rce_like = compute_rce(pred_ans, test['like_timestamp'])
rce_like

5.45389187849592

In [21]:
ap_like = average_precision_score(test['like_timestamp'], pred_ans)
ap_like

0.5341913966290482

In [28]:
save_model(model, 'DeepFM.h5')

In [39]:
from deepctr.utils import custom_objects

ImportError: cannot import name 'custom_objects' from 'deepctr.utils' (/home/nyongja/anaconda3/envs/dask/lib/python3.7/site-packages/deepctr/utils.py)

In [40]:
import deepctr

In [41]:
deepctr.__version__

'0.8.5'