In [1]:
import os
import sys
sys.path.append('..')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import joblib
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import feature_column as fc
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn import preprocessing
from sklearn.utils import shuffle
from tensorflow.keras.models import save_model,load_model

from utils.util import *
from utils.preprocessing import *
from utils.dataiter import Dataiter
from utils.evaluate import calculate_ctr, compute_rce, average_precision_score
from utils.target_encode import MTE_one_shot

import tensorflow.keras.backend as K
import core.config as conf

## Load Data

In [2]:

# Location of the "test" split inside the mini dataset directory.
path = '{}/test'.format(conf.dataset_mini_path)
# read_data arrives through one of the star imports from utils.*.
test = read_data(path)


In [3]:
# Engagement type handed to preprocess().
# NOTE(review): preprocess() never reads its `target` argument, and the evaluation
# further down scores the 'comment' label, so this value currently has no effect.
TARGET = 'like'

## Preprocessing

In [4]:
def set_dataframe_types(df, train):
    """Add a positional ``id`` column and cast numeric columns to ``uint32``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame holding the tweet/creator/engager columns cast below.
        train: when truthy, the four ``*_timestamp`` engagement columns are
            NaN-filled with 0 and cast as well.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Row position as a compact unsigned id.
    df['id'] = np.arange(len(df), dtype=np.uint32)

    if train:
        engagement_columns = (
            'reply_timestamp',
            'retweet_timestamp',
            'comment_timestamp',
            'like_timestamp',
        )
        # A missing engagement timestamp becomes 0 ...
        for column in engagement_columns:
            df[column] = df[column].fillna(0)
        # ... which makes the unsigned integer cast possible.
        for column in engagement_columns:
            df[column] = df[column].astype(np.uint32)

    always_cast = (
        'tweet_timestamp',
        'creator_follower_count',
        'creator_following_count',
        'creator_account_creation',
        'engager_follower_count',
        'engager_following_count',
        'engager_account_creation',
    )
    for column in always_cast:
        df[column] = df[column].astype(np.uint32)

    return df



def preprocess(df, target, train):
    """Cast dtypes, drop the raw token column and run feature extraction.

    Args:
        df: raw DataFrame; must contain ``text_tokens`` and every column that
            ``set_dataframe_types`` casts.
        target: not used by this function; kept so existing callers keep working.
        train: forwarded to ``set_dataframe_types`` and ``feature_extraction``.

    Returns:
        Whatever ``feature_extraction`` returns for ``conf.used_features``.
    """
    df = set_dataframe_types(df, train)
    df = df.drop('text_tokens', axis=1)

    # feature_extraction comes from a star import (presumably utils.preprocessing).
    return feature_extraction(df, features=conf.used_features, train=train)

In [5]:
# train=True so the engagement *_timestamp columns are NaN-filled and cast
# by set_dataframe_types.
test = preprocess(test, target=TARGET, train=True)

### pickle matching

#### language

In [6]:
# Prefix for the pickled lookup tables; file names are appended with `+`,
# so it presumably ends with a path separator — TODO confirm.
pickle_path = conf.pickle_data

In [7]:
user_main_language_path = pickle_path + "user_main_language.pkl"

# Open unconditionally: a missing file raises FileNotFoundError right here instead
# of leaving `user_main_language` undefined for the cells that use it.
# NOTE: pickle.load can run arbitrary code; only load files this project produced.
with open(user_main_language_path, 'rb') as f :
    user_main_language = pickle.load(f)

In [8]:
language_dict_path = pickle_path + "language_dict.pkl"

# Open unconditionally: a missing file raises FileNotFoundError right here instead
# of leaving `language_dict` undefined for the cell that uses it.
# NOTE: pickle.load can run arbitrary code; only load files this project produced.
with open(language_dict_path , 'rb') as f :
    language_dict = pickle.load(f)

In [9]:

# Encode the tweet language through language_dict. Mapping the single column
# avoids building a full row object per record; the explicit lookup keeps the
# KeyError for a language that is missing from the dictionary.
test['language'] = test['language'].map(lambda code : language_dict[code])


In [10]:
# Release the lookup table once the language column has been encoded.
del language_dict

In [11]:

# Look up each creator's main language. Assuming `user_main_language` is a plain
# dict, ids missing from it become NaN — TODO confirm.
test['creator_main_language'] = test['creator_id'].map(user_main_language)

In [12]:

# Look up each engager's main language. Assuming `user_main_language` is a plain
# dict, ids missing from it become NaN — TODO confirm.
test['engager_main_language'] = test['engager_id'].map(user_main_language)

In [13]:
# 1 when creator and engager share a main language, else 0. The column-wise
# comparison replaces a per-row lambda; NaN never compares equal, so rows with an
# unknown language still yield 0.
test['creator_and_engager_have_same_main_language'] = (
    test['creator_main_language'] == test['engager_main_language']
).astype(np.int64)

In [14]:
# 1 when the tweet language equals the creator's main language, else 0. The
# column-wise comparison replaces a per-row lambda; NaN never compares equal, so
# rows with an unknown main language still yield 0.
test['is_tweet_in_creator_main_language'] = (
    test['creator_main_language'] == test['language']
).astype(np.int64)

In [15]:
# 1 when the tweet language equals the engager's main language, else 0. The
# column-wise comparison replaces a per-row lambda; NaN never compares equal, so
# rows with an unknown main language still yield 0.
test['is_tweet_in_engager_main_language'] = (
    test['engager_main_language'] == test['language']
).astype(np.int64)

In [16]:
# Release the per-user language table; both *_main_language columns are filled.
del user_main_language

#### engagements

In [17]:
engagement_like_path = pickle_path + "engagement-like.pkl"

# Open unconditionally: a missing file raises FileNotFoundError right here instead
# of leaving `engagement_like` undefined for the cell that uses it.
# NOTE: pickle.load can run arbitrary code; only load files this project produced.
with open(engagement_like_path , 'rb') as f :
    engagement_like = pickle.load(f)

In [18]:
# Per-engager like count. Only `engager_id` is needed, so map that column instead
# of materialising every row; the explicit lookup keeps the mapping's own
# behaviour for unknown ids (KeyError, or its default if it defines one).
test['engager_feature_number_of_previous_like_engagement'] = test['engager_id'].map(lambda uid : engagement_like[uid])
del engagement_like

In [19]:
engagement_reply_path = pickle_path + "engagement-reply.pkl"

# Open unconditionally: a missing file raises FileNotFoundError right here instead
# of leaving `engagement_reply` undefined for the cell that uses it.
# NOTE: pickle.load can run arbitrary code; only load files this project produced.
with open(engagement_reply_path , 'rb') as f :
    engagement_reply = pickle.load(f)

In [20]:
# Per-engager reply count. Only `engager_id` is needed, so map that column instead
# of materialising every row; the explicit lookup keeps the mapping's own
# behaviour for unknown ids (KeyError, or its default if it defines one).
test['engager_feature_number_of_previous_reply_engagement'] = test['engager_id'].map(lambda uid : engagement_reply[uid])
del engagement_reply

In [21]:
engagement_retweet_path = pickle_path + "engagement-retweet.pkl"

# Open unconditionally: a missing file raises FileNotFoundError right here instead
# of leaving `engagement_retweet` undefined for the cell that uses it.
# NOTE: pickle.load can run arbitrary code; only load files this project produced.
with open(engagement_retweet_path , 'rb') as f :
    engagement_retweet = pickle.load(f)

In [22]:
# Per-engager retweet count. Only `engager_id` is needed, so map that column
# instead of materialising every row; the explicit lookup keeps the mapping's own
# behaviour for unknown ids (KeyError, or its default if it defines one).
test['engager_feature_number_of_previous_retweet_engagement'] = test['engager_id'].map(lambda uid : engagement_retweet[uid])
del engagement_retweet

In [23]:
engagement_comment_path = pickle_path + "engagement-comment.pkl"

# Open unconditionally: a missing file raises FileNotFoundError right here instead
# of leaving `engagement_comment` undefined for the cell that uses it.
# NOTE: pickle.load can run arbitrary code; only load files this project produced.
with open(engagement_comment_path , 'rb') as f :
    engagement_comment = pickle.load(f)

In [24]:
# Per-engager comment count. Only `engager_id` is needed, so map that column
# instead of materialising every row; the explicit lookup keeps the mapping's own
# behaviour for unknown ids (KeyError, or its default if it defines one).
test['engager_feature_number_of_previous_comment_engagement'] = test['engager_id'].map(lambda uid : engagement_comment[uid])
del engagement_comment

In [25]:
# Total previous engagements per row: a column-wise sum of the four counters
# (same operand order as before) instead of a Python-level lambda per row.
test['number_of_engagements_positive'] = (
    test['engager_feature_number_of_previous_like_engagement']
    + test['engager_feature_number_of_previous_retweet_engagement']
    + test['engager_feature_number_of_previous_reply_engagement']
    + test['engager_feature_number_of_previous_comment_engagement']
)

In [26]:
# Share of each engagement type in the engager's total. One loop replaces four
# copy-pasted row-wise applies; the column creation order (like, reply, retweet,
# comment) is kept because downstream code relies on column order.
_total = test['number_of_engagements_positive']
for _kind in ('like', 'reply', 'retweet', 'comment'):
    _count = test[f'engager_feature_number_of_previous_{_kind}_engagement']
    # Rows with no engagements at all get a ratio of 0 instead of a division by zero.
    test[f'number_of_engagements_ratio_{_kind}'] = (_count / _total).where(_total != 0, 0)

## Split

In [27]:
label_names = ['reply', 'retweet', 'comment', 'like']

# Columns that must not be part of the feature matrix. Names that are absent
# from the frame are harmless: the list is filtered against the actual columns
# before dropping.
DONT_USE = [
    'tweet_timestamp', 'creator_account_creation', 'engager_account_creation', 'engage_time',
    'fold', 'tweet_id',
    'tr', 'dt_day',
    'engager_id', 'creator_id', 'engager_is_verified',
    'elapsed_time',
    'links', 'domains', 'hashtags0', 'hashtags1',
    'hashtags', 'tweet_hash', 'dt_second', 'id',
    'tw_hash0',
    'tw_hash1',
    'tw_rt_uhash',
    'same_language', 'nan_language', 'language',
    'tw_hash', 'tw_freq_hash', 'tw_first_word', 'tw_second_word', 'tw_last_word', 'tw_llast_word',
    'ypred', 'creator_count_combined',
    'creator_user_fer_count_delta_time', 'creator_user_fing_count_delta_time',
    'creator_user_fering_count_delta_time',
    'creator_user_fing_count_mode', 'creator_user_fer_count_mode', 'creator_user_fering_count_mode',
]
DONT_USE += label_names
DONT_USE += conf.labels
# The label lists may repeat names already present; dict.fromkeys de-duplicates
# while keeping first-seen order.
DONT_USE = list(dict.fromkeys(DONT_USE))

In [28]:
# Keep only the excluded names that really exist in the frame, so the
# subsequent drop() cannot fail on a missing label.
RMV = [name for name in DONT_USE if name in test.columns]

In [29]:
# Split column-wise into labels and features; both keep the rows of `test`.
yt_test = test.loc[:, label_names]
Xt_test = test.drop(columns=RMV)
# The combined frame is no longer needed.
del test

## Scaling

In [30]:
# Load the fitted scaler through a context manager so the file handle is closed.
# NOTE: pickle.load can run arbitrary code; only load files this project produced.
with open(conf.scaler_path + 'scaler.pkl', 'rb') as f:
    standard_scaler = pickle.load(f)

In [31]:
# Replace the inherited row labels with a clean 0..n-1 index before the scaled
# columns are written back (that assignment aligns on index labels).
Xt_test = Xt_test.reset_index(drop=True)

In [32]:
# Columns passed to the pickled scaler, in this exact order.
# NOTE(review): presumably has to match the columns the scaler was fitted on — confirm.
scaling_columns = [
    'creator_following_count',
    'creator_follower_count',
    'engager_follower_count',
    'engager_following_count',
    'dt_dow',
    'dt_hour',
    'len_domains',
    'creator_main_language',
    'engager_main_language',
    'engager_feature_number_of_previous_like_engagement',
    'engager_feature_number_of_previous_reply_engagement',
    'engager_feature_number_of_previous_retweet_engagement',
    'engager_feature_number_of_previous_comment_engagement',
    'number_of_engagements_positive',
]

In [33]:
ss = standard_scaler.transform(Xt_test[scaling_columns])
# Column assignment from a DataFrame aligns on index labels. Reusing Xt_test's own
# index makes the scaled values land row-for-row whatever the current row labels
# are; a default RangeIndex would leave NaN in any row whose label does not match.
Xt_test[scaling_columns] = pd.DataFrame(ss, columns = scaling_columns, index = Xt_test.index)

In [34]:
# Impute the remaining NaNs with the per-column mean of this same frame.
# NOTE(review): the means are computed on the evaluation data itself, not taken
# from the training set — confirm this mirrors how the model was trained.
Xt_test = Xt_test.fillna(Xt_test.mean())

## Modeling

In [37]:
from tensorflow import keras

In [60]:
# Load the trained network for the 'comment' target.
# NOTE(review): hard-coded absolute path; consider moving it into core.config.
model = keras.models.load_model('/hdd/models/ffnn_pkl/ffnn_all/ffnn--comment-130')

In [41]:
# Show the scaled feature frame (the display is abbreviated to the first and last rows).
Xt_test

Unnamed: 0,tweet_type,creator_follower_count,creator_following_count,engager_follower_count,engager_following_count,media,dt_dow,dt_hour,len_domains,creator_main_language,...,is_tweet_in_engager_main_language,engager_feature_number_of_previous_like_engagement,engager_feature_number_of_previous_reply_engagement,engager_feature_number_of_previous_retweet_engagement,engager_feature_number_of_previous_comment_engagement,number_of_engagements_positive,number_of_engagements_ratio_like,number_of_engagements_ratio_reply,number_of_engagements_ratio_retweet,number_of_engagements_ratio_comment
0,3,-0.162526,-0.082124,-0.076058,-0.379439,1,-0.983670,1.339368,2.577894,-0.557175,...,1,-0.348633,-0.247094,-0.217264,-0.166385,-0.381704,1.000000,0.000000,0.000000,0.000000
1,3,-0.167457,-0.093997,-0.002149,0.005615,1,-0.983670,1.191686,-0.338471,-0.226362,...,0,0.049752,-0.247094,-0.210600,-0.166385,-0.030256,0.994792,0.000000,0.005208,0.000000
2,3,-0.169131,-0.105358,-0.079302,-0.345956,0,-1.484162,0.305590,-0.338471,-0.722582,...,1,-0.366144,-0.247094,-0.217264,-0.166385,-0.397068,1.000000,0.000000,0.000000,0.000000
3,3,-0.163935,-0.107136,-0.077173,-0.333998,0,-1.484162,-1.318919,2.577894,0.269858,...,0,-0.363955,-0.247094,-0.217264,-0.166385,-0.395148,1.000000,0.000000,0.000000,0.000000
4,0,-0.168604,-0.035928,-0.034187,-0.048795,0,-0.983670,-1.466601,-0.338471,-0.557175,...,0,0.192033,0.760813,0.029269,1.509986,0.229009,0.782875,0.073394,0.113150,0.030581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
876157,3,-0.167974,-0.013056,0.723553,0.798445,0,-1.484162,1.487051,-0.338471,2.089331,...,0,-0.315799,0.172867,-0.170622,-0.166385,-0.320249,0.585366,0.243902,0.170732,0.000000
876158,3,-0.138012,-0.099813,-0.038952,-0.186912,1,-0.983670,0.748638,-0.338471,-0.557175,...,1,-0.328932,-0.121105,-0.210600,-0.166385,-0.356738,0.818182,0.136364,0.045455,0.000000
876159,3,-0.000808,-0.106413,-0.041182,-0.134894,0,-0.983670,-0.432823,2.577894,-0.722582,...,1,-0.348633,-0.247094,-0.217264,-0.166385,-0.381704,1.000000,0.000000,0.000000,0.000000
876160,3,-0.169115,-0.109035,-0.076970,-0.270619,1,-1.484162,0.748638,2.577894,-0.391769,...,1,-0.368333,-0.247094,-0.217264,-0.166385,-0.398989,0.000000,0.000000,0.000000,0.000000


In [61]:
# Remove the feature subset stored at position 2 of conf.drop_features.
# NOTE(review): presumably the entry belonging to the 'comment' model — confirm the index.
X_test = Xt_test.drop(columns=conf.drop_features[2])

In [62]:
# Ground-truth column for the engagement being scored.
# NOTE(review): presumably has to match the target of the loaded checkpoint — confirm.
y_test = yt_test['comment']

## Predict

In [63]:
# Score every row of X_test with the loaded model.
pred = model.predict(X_test)

In [64]:
# NOTE: compute_rce is called as (predictions, labels), the reverse of the
# (labels, predictions) order used for average_precision_score — presumably
# matching the signatures in utils.evaluate; TODO confirm.
rce = compute_rce(pred, y_test)
rce

24.91234911157657

In [65]:
# Average precision of the predicted scores against the labels
# (average_precision_score is the name imported from utils.evaluate).
average_precision_score(y_test, pred)

0.13745298017791904