# Import libraries and classes, set parameters

In [1]:
import re

import pandas as pd
import torch

import transformers

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
# Folder that holds the raw dataset parts. Keep the trailing slash: file
# paths are built from it by plain string concatenation.
DATA_PATH = "./data/"

# Use the GPU whenever torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(DEVICE)

In [4]:
# Column names for the header-less dataset file, listed in file order.
# The list is passed as `names=` when the TSV is read, so the order matters.
orig_features = [
    'text_tokens',    ###############
    'hashtags',       #Tweet Features
    'tweet_id',       #
    'media',          #
    'links',          #
    'domains',        #
    'tweet_type',     #
    'language',       #
    'timestamp',      ###############
    'engaged_with_user_id',              ###########################
    'engaged_with_user_follower_count',  #Engaged With User Features
    'engaged_with_user_following_count', #
    'engaged_with_user_is_verified',     #
    # Fixed typo: this entry used to read 'ngaged_with_user_account_creation'.
    'engaged_with_user_account_creation',###########################
    'engaging_user_id',                  #######################
    'engaging_user_follower_count',      #Engaging User Features
    'engaging_user_following_count',     #
    'engaging_user_is_verified',         #
    'engaging_user_account_creation',    #######################
    'engagee_follows_engager',    #################### Engagement Features
    'reply',          #Target Reply
    'retweet',        #Target Retweet
    'retweet_comment',#Target Retweet with comment
    'like',           #Target Like
                      ####################
]

# The four engagement targets are the last four columns of the file.
target_features = orig_features[-4:]

# Load and preprocess data

In [5]:
# Read the header-less, \x01-separated part file (columns named from
# orig_features) and order the rows by ascending `timestamp`.
orig_df = pd.read_csv(
    DATA_PATH+"part-00000.tsv",
    header=None,
    names=orig_features,
    sep='\x01',
).sort_values(by=['timestamp'])
orig_df.head()

Unnamed: 0,text_tokens,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,engaged_with_user_id,...,engaging_user_id,engaging_user_follower_count,engaging_user_following_count,engaging_user_is_verified,engaging_user_account_creation,engagee_follows_engager,reply,retweet,retweet_comment,like
751371,101\t191\t10123\t10575\t10124\t10106\t10105\t1...,,8020D9FFAC1532D4B538D66B04AE016F,,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612396800,30AED660ACD2D8EFD8257CB246D85371,...,BAEE880E85135F7D4CC93E32C41377C3,45,310,False,1552767471,False,,,,1612402000.0
62787,101\t10159\t84688\t10127\t13272\t37704\t101385...,,49ED4E73B4D979C5FEBBF6A4B047F868,,544D05AC7DBE0C59F878182FE35CE522,2C6143040101DE6BA7A8A4DB7E5A1436,TopLevel,B0FA488F2911701DD8EC5B1EA5E322D8,1612396800,1A1157062D4EC419710395E9175745CD,...,15BC8C370AB37837B87F3F2C0AFAC207,14,102,False,1444306290,False,,,,
808235,101\t27866\t10662\t22013\t10157\t117\t10472\t1...,,3E4B8281BBE8A7496CA4F14398E15863,,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612396800,10FC0159B65285C27D54B5CE919E9C33,...,3FBD8B6C403A094838D589EF9CB8CBE8,346,2776,False,1509048820,False,,,,1612409000.0
964028,101\t20452\t10142\t14908\t10841\t17565\t80677\...,,AC97A848CC9CE41F42D566B2C5EAAB45,Video,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612396800,E7A2CEC020385D583CA1F15C53E671DD,...,18F267984152FCED7CF751CFF291DC6F,74,140,False,1580571913,False,,,,
2162779,101\t49331\t72041\t10169\t10105\t11836\t20206\...,,2647BC8041258863486AB5DE695099B1,Video,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612396800,9A856BA94C206A677B4C5FF299C564B9,...,DACEDD69E830DF244FB0630B6A388D68,114,212,False,1539876578,False,,1612407000.0,,


In [7]:
# Tokenizer for the "bert-base-multilingual-cased" checkpoint.
bert_multilingual_tokenizer = transformers.BertTokenizer.from_pretrained(
    "bert-base-multilingual-cased"
)
# Tokenizer for the "cardiffnlp/twitter-xlm-roberta-base" (XLM-T) checkpoint.
xlm_t_tokenizer = transformers.XLMRobertaTokenizer.from_pretrained(
    "cardiffnlp/twitter-xlm-roberta-base"
)

In [8]:
def transform_tokens_count_urls(text_tokens, bert_tokenizer, xlm_t_tokenizer):
    """Re-tokenize a tweet with a second tokenizer and count its URLs.

    Parameters
    ----------
    text_tokens : str
        Tab-separated integer token ids that ``bert_tokenizer`` can decode.
        Empty fields (empty string, trailing tab) are ignored.
    bert_tokenizer
        Object exposing ``decode(list_of_ids) -> str``.
    xlm_t_tokenizer
        Object exposing ``tokenize(str)`` and ``convert_tokens_to_ids(tokens)``.

    Returns
    -------
    tuple
        ``(new_ids, link_cnt)``: the ids produced by ``xlm_t_tokenizer`` for
        the decoded text, and the number of URL starts found in that text.
    """
    # Skip blank fields so an empty string does not crash on int('').
    token_ids = [int(tok) for tok in text_tokens.split('\t') if tok.strip()]
    tweet_text = bert_tokenizer.decode(token_ids)

    # The decoded text spells URLs with spaces around punctuation
    # ("https : / / host ..."). Match only the URL *start* and use a lookahead
    # for its first segment: the previous greedy tail `(\s(\/\s)?\w+(\.)?)+`
    # also consumed ordinary words and could swallow the next "https",
    # under-counting links. A raw string avoids invalid-escape warnings.
    url_start = r"https : \/(?=\s(?:\/\s)?\w)"
    link_cnt = sum(1 for _ in re.finditer(url_start, tweet_text))

    new_ids = xlm_t_tokenizer.convert_tokens_to_ids(xlm_t_tokenizer.tokenize(tweet_text))

    return new_ids, link_cnt
    
# transform_tokens_count_urls(orig_df['text_tokens'][3], bert_multilingual_tokenizer, xlm_t_tokenizer)

In [9]:
def fix_target(df, col):
    """Binarize a target column of ``df`` in place.

    Missing values become 0 and every value greater than 0 becomes 1; other
    values are left untouched. Returns ``None``; ``df`` itself is modified.

    The result is assigned back to ``df[col]`` explicitly. Calling
    ``fillna``/``mask`` with ``inplace=True`` on the ``df[col]`` selection is
    chained assignment, which does not update ``df`` under pandas
    Copy-on-Write.
    """
    filled = df[col].fillna(0)
    df[col] = filled.mask(filled > 0, 1)
    
# Binarize every column listed in target_features (orig_df is modified in
# place by fix_target), then preview the first rows.
for col in target_features:
    fix_target(orig_df, col)

orig_df.head()

Unnamed: 0,text_tokens,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,engaged_with_user_id,...,engaging_user_id,engaging_user_follower_count,engaging_user_following_count,engaging_user_is_verified,engaging_user_account_creation,engagee_follows_engager,reply,retweet,retweet_comment,like
751371,101\t191\t10123\t10575\t10124\t10106\t10105\t1...,,8020D9FFAC1532D4B538D66B04AE016F,,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612396800,30AED660ACD2D8EFD8257CB246D85371,...,BAEE880E85135F7D4CC93E32C41377C3,45,310,False,1552767471,False,0.0,0.0,0.0,1.0
62787,101\t10159\t84688\t10127\t13272\t37704\t101385...,,49ED4E73B4D979C5FEBBF6A4B047F868,,544D05AC7DBE0C59F878182FE35CE522,2C6143040101DE6BA7A8A4DB7E5A1436,TopLevel,B0FA488F2911701DD8EC5B1EA5E322D8,1612396800,1A1157062D4EC419710395E9175745CD,...,15BC8C370AB37837B87F3F2C0AFAC207,14,102,False,1444306290,False,0.0,0.0,0.0,0.0
808235,101\t27866\t10662\t22013\t10157\t117\t10472\t1...,,3E4B8281BBE8A7496CA4F14398E15863,,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612396800,10FC0159B65285C27D54B5CE919E9C33,...,3FBD8B6C403A094838D589EF9CB8CBE8,346,2776,False,1509048820,False,0.0,0.0,0.0,1.0
964028,101\t20452\t10142\t14908\t10841\t17565\t80677\...,,AC97A848CC9CE41F42D566B2C5EAAB45,Video,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612396800,E7A2CEC020385D583CA1F15C53E671DD,...,18F267984152FCED7CF751CFF291DC6F,74,140,False,1580571913,False,0.0,0.0,0.0,0.0
2162779,101\t49331\t72041\t10169\t10105\t11836\t20206\...,,2647BC8041258863486AB5DE695099B1,Video,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612396800,9A856BA94C206A677B4C5FF299C564B9,...,DACEDD69E830DF244FB0630B6A388D68,114,212,False,1539876578,False,0.0,1.0,0.0,0.0


In [74]:
#based on https://maxhalford.github.io/blog/target-encoding/

class TargetEncoder():
    """Smoothed target encoder for categorical columns.

    For every encoded column, each category value ``c`` is replaced by

        (count_c * mean_c + w * target_mean) / (count_c + w)

    where ``count_c`` / ``mean_c`` are the number of rows and the target mean
    for that category, ``target_mean`` is the overall target mean and ``w``
    is the smoothing weight. Encoded values are written to new columns named
    ``<col>_<target>_TE``.
    """

    def __init__(self, cols, w):
        """
        Parameters
        ----------
        cols : str or list of str
            Column name(s) to encode; a single string is wrapped in a list.
        w : number
            Smoothing weight given to the overall target mean.
        """
        if isinstance(cols, str):
            self.cols = [cols]
        else:
            self.cols = cols

        self.w = w

        # Start in a well-defined "unfitted" state so that transform() called
        # before fit() returns an unchanged copy instead of raising
        # AttributeError.
        self.target = None
        self.target_mean = None
        self.maps = {}

    def fit(self, X, y):
        """Learn the smoothed per-category target means.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing both the columns to encode and the target.
        y : str
            Name of the target column inside ``X``.

        Returns
        -------
        TargetEncoder
            ``self``, to allow chaining.

        Raises
        ------
        ValueError
            If ``y`` or any of the configured columns is not in ``X``.
        """
        if y not in X.columns:
            raise ValueError('Column: {} not in dataframe'.format(y))

        # Overall target mean: the value every category is shrunk towards.
        self.target = y
        self.target_mean = X[self.target].mean()

        # One mapping (category value -> smoothed mean) per encoded column.
        self.maps = {}
        for col in self.cols:

            if col not in X.columns:
                # Leave an empty mapping behind so a later transform() is a
                # no-op rather than a partial encoding.
                self.maps = {}
                raise ValueError('Column: {} not in dataframe'.format(col))

            aggr = X.groupby(col)[self.target].agg(['count', 'mean'])
            counts = aggr['count']
            means = aggr['mean']

            self.maps[col] = (counts * means + self.w * self.target_mean) / (counts + self.w)

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with one ``<col>_<target>_TE`` column per fitted column.

        Values that have no fitted mapping (categories not seen during
        ``fit``, or missing values) fall back to the overall target mean
        instead of NaN. ``y`` is accepted for API symmetry and ignored.
        """
        X_new = X.copy()
        for col, smooth in self.maps.items():
            new_col_name = col + '_' + self.target + "_TE"
            X_new[new_col_name] = X_new[col].map(smooth).fillna(self.target_mean)
        return X_new

    def fit_transform(self, X, y):
        """Fit on ``X`` with target column ``y`` and return the encoded copy of ``X``."""
        return self.fit(X, y).transform(X, y)

In [75]:
# Ten-row toy frame for checking the encoder: two categorical columns and a
# binary target, written out row by row.
df_test = pd.DataFrame(
    data={
        'x_0': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'],
        'x_1': ['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'd'],
        'y': [1, 1, 1, 1, 0, 1, 0, 0, 0, 0],
    }
)

# Encode both categorical columns with a smoothing weight of 10.
te = TargetEncoder(cols=['x_0', 'x_1'], w=10)
df_test

Unnamed: 0,x_0,x_1,y
0,a,c,1
1,a,c,1
2,a,c,1
3,a,c,1
4,a,c,0
5,b,c,1
6,b,c,0
7,b,c,0
8,b,c,0
9,b,d,0


In [76]:
# Fit the encoder against the 'y' column of the toy frame and display the
# copy that carries the added *_y_TE columns.
new_df = te.fit_transform(df_test, 'y')
new_df

Unnamed: 0,x_0,x_1,y,x_0_y_TE,x_1_y_TE
0,a,c,1,0.6,0.526316
1,a,c,1,0.6,0.526316
2,a,c,1,0.6,0.526316
3,a,c,1,0.6,0.526316
4,a,c,0,0.6,0.526316
5,b,c,1,0.4,0.526316
6,b,c,0,0.4,0.526316
7,b,c,0,0.4,0.526316
8,b,c,0,0.4,0.526316
9,b,d,0,0.4,0.454545


In [38]:
# m = 10
# te = TargetEncoder(['engaged_with_user_id', 'engaging_user_id'], m)

In [None]:
### Next steps:
# 1.split - 60/20/20?
# 2.apply targetencoding where needed in train
# 3.transform text to correct ids for XLM-T - already have tokenizers
# 4.create dataset object
# 5.finish the wide & deep model architecture
# 6.create trainer object & train 
# 7.use the validation set to monitor loss every ??? steps (use tensorboard??)
# 8.add evaluation metrics as in contest, eval on test set

### To think about:
# - create dataset only for some users, can't train on all, too big, 
#   this way we have all the information in table for specific users
# - use hash func for user-ids?
# - reduce size of embeddings from MLM
# - 