In [2]:
import sys
# Extend the import path three directories up so the project-local packages
# imported below (`utils`, `core`) can be resolved.
# NOTE(review): the path is relative to the kernel's working directory --
# presumably the repository root sits three levels up; confirm before moving this notebook.
sys.path.append('../../..')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import feature_column as fc
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn import preprocessing
from tensorflow.keras.models import save_model,load_model
from utils.preprocessing import *
from utils.dataset import Dataset
import pickle

 
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs


from utils.evaluate import calculate_ctr, compute_rce, average_precision_score
import tensorflow.keras.backend as K
import core.config as conf

## Load Data & Preprocessing

In [3]:
# Project-local Dataset helper; its preprocess() method is applied to the
# train, valid and test frames loaded in the following cells.
# NOTE(review): the meaning of train=True / target_encoding=False is defined in
# utils.dataset and is not visible here -- confirm train=True is also intended
# when the same instance preprocesses the valid/test splits.
ds = Dataset(train=True, target_encoding=False)

In [4]:
# Training split: read the raw file from the mini dataset location and pass it
# straight through the shared preprocessing step.
data_path = conf.dataset_mini_path + 'train'
df = ds.preprocess(read_data(data_path))

In [5]:
# Validation split: same read-then-preprocess pipeline as the training split.
data_path = conf.dataset_mini_path + 'valid'
val_df = ds.preprocess(read_data(data_path))

In [6]:
# Test split: same read-then-preprocess pipeline as the training split.
data_path = conf.dataset_mini_path + 'test'
test_df = ds.preprocess(read_data(data_path))

In [7]:
# Show which columns the preprocessed training frame exposes.
df.columns

Index(['creator_follows_engager', 'engager_follower_count',
       'creator_account_creation', 'engager_following_count',
       'creator_is_verified', 'language', 'tweet_id', 'tweet_type',
       'creator_follower_count', 'engager_id', 'domains', 'media',
       'tweet_timestamp', 'engager_is_verified', 'creator_id',
       'creator_following_count', 'engager_account_creation', 'reply',
       'retweet', 'comment', 'like', 'dt_day', 'dt_dow', 'dt_hour',
       'len_domains'],
      dtype='object')

In [8]:
# Load the pickled lookup that later cells index by engager/creator id to get
# each user's main language.
# NOTE(review): pickle.load can execute arbitrary code while deserialising --
# only load this file if it comes from a trusted source.
pkl_path = conf.dict_path + 'user_main_language.pkl'
with open(pkl_path, 'rb') as f:
    user_to_main_language = pickle.load(f)

In [9]:
# Keys of the main-language lookup, i.e. the user ids it covers.
# NOTE(review): assuming the pickle holds a dict, this is a live keys view
# rather than a copy -- it will reflect later changes to the mapping.
users = user_to_main_language.keys()

In [10]:
# Columns fed to the model, in the order they are selected from each frame.
used_features = [
    # engager account attributes
    'engager_follower_count', 'engager_following_count',
    'engager_is_verified', 'engager_account_creation',
    # creator account attributes
    'creator_follower_count', 'creator_following_count',
    'creator_is_verified', 'creator_account_creation',
    # tweet content attributes
    'media', 'domains', 'language',
    # derived columns (dt_* and len_domains)
    'dt_day', 'dt_dow', 'dt_hour', 'len_domains',
]

In [11]:
# Per split: the model inputs (columns named in used_features) paired with the
# 'like' column as the target.
# Selecting with a list of names produces a new frame, so columns added to
# df / val_df / test_df after this cell are not picked up unless this cell is re-run
# with them listed in used_features.
X_train, Y_train = df[used_features], df['like']
X_val, Y_val = val_df[used_features], val_df['like']
X_test, Y_test = test_df[used_features], test_df['like']

In [13]:
# Attach each participant's main language (looked up by user id) and flag rows
# where engager and creator share it.
#
# Series.map with a mapping performs the lookup inside pandas instead of calling
# a Python lambda per row. Behaviour note: an id that is absent from a plain dict
# now yields NaN (and therefore False in is_same_main_language) instead of
# raising KeyError; a dict subclass defining __missing__ still supplies its default.
#
# NOTE(review): this cell adds columns to `df` in place after X_train was
# selected, so these three columns are not part of X_train as the notebook stands.
df['engager_main_language'] = df['engager_id'].map(user_to_main_language)
df['creator_main_language'] = df['creator_id'].map(user_to_main_language)
df['is_same_main_language'] = df['engager_main_language'] == df['creator_main_language']

In [14]:
# Preview the first rows to check the newly added main-language columns.
df.head()


Unnamed: 0,creator_follows_engager,engager_follower_count,creator_account_creation,engager_following_count,creator_is_verified,language,tweet_id,tweet_type,creator_follower_count,engager_id,...,retweet,comment,like,dt_day,dt_dow,dt_hour,len_domains,engager_main_language,creator_main_language,is_same_main_language
0,0,595,1525021638,1424,0,10,6238B9E15E83B6D477394E9D80B3784E,3,560,CFFD4D0D3AAF28B5397A98E1F4C92C77,...,0.0,0.0,0.0,9,1,15,0,35,9,False
1,1,514,1348805977,416,0,1,731FB90C6CFEF1B71D322106573F71DB,0,2591,E0F9A3BB331C7B613B165F32F8D273BE,...,0.0,0.0,0.0,9,1,9,0,1,1,True
2,1,376,1583939938,311,0,5,89276E5272498E858EE8AF691EBF0951,3,174,58ABF7FB24D65FB94C6487915A889460,...,0.0,0.0,1.0,18,3,15,0,4,5,False
3,1,22461,1306675315,16342,1,12,502FDBC0EB4E7AB157D38262817716EB,3,115852,76730357A84D1D04A662FDB1449F1AFF,...,0.0,0.0,0.0,8,0,2,1,1,12,False
4,0,388,1308377880,647,1,0,CF1F523F7D4D4139E5FD3EBD72F27D5A,3,1208919,9008C5AB3CB46623ADBD4C68A2A2D088,...,0.0,0.0,1.0,11,3,16,0,13,0,False
