# Extracted from text tokens
tweet_feature_mentions: list of ints (or None):
- Mentions extracted from the tweet.

tweet_feature_number_of_mentions: int:
- Number of mentions in the tweet.

tweet_feature_token_length: int:
- Number of BERT tokens in the tweet.

tweet_feature_token_length_unique: int:
- Number of unique BERT tokens in the tweet.

tweet_feature_text_token_decoded: list of str:
- Decoded BERT tokens.

tweet_feature_text_topic_word_count_adult_content: int:
- Number of 'adult content' words.

tweet_feature_text_topic_word_count_kpop: int:
- Number of 'kpop' words.

tweet_feature_text_topic_word_count_covid: int:
- Number of 'covid' words.

tweet_feature_text_topic_word_count_sport: int:
- Number of 'sport' words.

In [8]:
import sys
sys.path.append('..')

import core.config as conf
from utils.preprocessing import *
import numpy as np
from tqdm import tqdm
from datetime import datetime 
import matplotlib.pyplot as plt

import tensorflow as tf
from transformers import *
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


In [4]:
# Seed the NumPy and TensorFlow global random generators with the same fixed value.
np.random.seed(1234)
tf.random.set_seed(1234)

In [5]:
# Cased multilingual BERT tokenizer (lower-casing disabled); cache_dir is
# the local 'bert_ckpt' directory.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-multilingual-cased",
    do_lower_case=False,
    cache_dir='bert_ckpt',
)

## Load data

In [10]:
# Path to the 'train' directory of the mini dataset on the local disk.
data_path = '/hdd/twitter/dataset_mini/train'
# NOTE(review): read_data is not defined in this notebook; presumably it
# comes from the `utils.preprocessing` star import and returns a pandas
# DataFrame — confirm.
df = read_data(data_path)

In [12]:
# Raw token column: each entry is a single string of tab-separated token ids
# (see the printed column below).
text_tokens = df['text_tokens']

In [13]:
# Bare expression: the notebook renders the column as the cell output.
text_tokens

0          101\t147\t69849\t11447\t57277\t63310\t136\t106...
1          101\t56898\t137\t10799\t11039\t168\t26978\t131...
2          101\t13304\t179\t14902\t74788\t10390\t33993\t1...
3          101\t620\t55399\t17451\t69304\t82692\t572\t105...
4          101\t10747\t12723\t10124\t29956\t91518\t10142\...
                                 ...                        
4338901    101\t1972\t18825\t3425\t100\t14120\t131\t120\t...
4338902    101\t56898\t137\t25944\t22659\t11305\t131\t181...
4338903    101\t100\t108\t16062\t11281\t10115\t11274\t102...
4338904    101\t100\t27577\t30416\t10107\t10655\t11239\t1...
4338905    101\t18740\t11003\t98514\t10529\t12229\t18103\...
Name: text_tokens, Length: 4338906, dtype: object

In [18]:
# Token count per tweet: the ids are tab-separated, so the count is one more
# than the number of tab characters in the string.
df['len_text_tokens'] = df['text_tokens'].map(lambda ids: ids.count('\t') + 1)

In [20]:
# Decode each tweet's tab-separated token ids back into text, skipping
# special tokens.  The ids are parsed to int first because tokenizer.decode
# is documented to take integer ids rather than their string form.
df['decoded_text_tokens'] = df['text_tokens'].apply(
    lambda x: tokenizer.decode(
        [int(token_id) for token_id in x.split('\t')],
        skip_special_tokens=True,
    )
)

KeyboardInterrupt: 

In [19]:
# Split every tweet's tab-separated id string into a list of id strings.
# The source column is df['text_tokens'], the same frame that receives the
# decoded result below.
tokens = [token_string.split('\t') for token_string in df['text_tokens']]

# Decode tweet by tweet, skipping special tokens.  The ids are parsed to int
# because tokenizer.decode is documented to take integer ids; tqdm wraps the
# iteration so progress over the whole dataset is visible.
decoded_tokens = [
    tokenizer.decode([int(token_id) for token_id in ids], skip_special_tokens=True)
    for ids in tqdm(tokens)
]

df['decoded_text_tokens'] = decoded_tokens

Unnamed: 0,text_tokens,hashtags,tweet_id,media,links,domains,tweet_type,language,tweet_timestamp,creator_id,...,engager_following_count,engager_is_verified,engager_account_creation,creator_follows_engager,reply_timestamp,retweet_timestamp,comment_timestamp,like_timestamp,len_tokens,len_text_tokens
0,101\t147\t69849\t11447\t57277\t63310\t136\t106...,,6238B9E15E83B6D477394E9D80B3784E,Photo\tPhoto,,,TopLevel,7F4FAB1EB12CD95EDCD9DB2A6634EFCE,1612883086,F09233A58769507FD4E6FD618BCFA5B6,...,1424,False,1307991260,False,,,,,147,30
1,101\t56898\t137\t10799\t11039\t168\t26978\t131...,,731FB90C6CFEF1B71D322106573F71DB,Photo,,,Retweet,E7F038DE3EAD397AEC9193686C911677,1612864776,46BEEE566BB708F18075A60FDDAA8503,...,416,False,1336861089,True,,,,,432,81
2,101\t13304\t179\t14902\t74788\t10390\t33993\t1...,,89276E5272498E858EE8AF691EBF0951,,,,TopLevel,1F73BB863A39DB62B4A55B7E558DB1E8,1613661020,83F59BF2E0778AC4078FA3F7B71F5960,...,311,False,1602935552,True,,,,1.613662e+09,105,19
3,101\t620\t55399\t17451\t69304\t82692\t572\t105...,,502FDBC0EB4E7AB157D38262817716EB,,F65FE23F3E97EE91DB990B4E02FC8DFC,28F197E209F61EE2F1C97FBF9128976D,TopLevel,310ECD7D1E42216E3C1B31EFDDFC72A7,1612751908,C96110509F8B6256CE08A8AE85ED75DB,...,16342,False,1366022767,True,,,,,682,124
4,101\t10747\t12723\t10124\t29956\t91518\t10142\...,,CF1F523F7D4D4139E5FD3EBD72F27D5A,,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1613060168,2031F18622E7C6A30E702D33776BF2C1,...,647,False,1410270773,False,,,,1.613061e+09,365,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4338901,101\t1972\t18825\t3425\t100\t14120\t131\t120\t...,,EE63068E74D7E293457FDC2EA3527D65,Photo,,,TopLevel,E7F038DE3EAD397AEC9193686C911677,1612670742,968FAF2F52AACA58C2B41CE5B13A811D,...,1908,False,1420785574,False,,,,1.612707e+09,111,22
4338902,101\t56898\t137\t25944\t22659\t11305\t131\t181...,,F15B4615DA12447331AFEC90FA814B4D,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1612744473,54652625625729AE0E38196970594938,...,713,False,1311226875,True,,,,,127,23
4338903,101\t100\t108\t16062\t11281\t10115\t11274\t102...,360847D5E2E14DE261D275573CFCCE34,DA947A6368CF4414BEA9172003A639FD,Photo\tPhoto,,,TopLevel,B8B04128918BBF54E2E178BFF1ABA833,1612397867,614E269F8E5D731946BDBC86E9DC4F4D,...,48,False,1597880960,False,,,,1.612443e+09,415,75
4338904,101\t100\t27577\t30416\t10107\t10655\t11239\t1...,80E7062116A700EBFA5E0A99EDCDBBAD\t3AAE07FAD1D5...,730879D11261FA6700804AD49CCB59BB,Photo\tPhoto\tPhoto\tPhoto,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612956173,5A5C7CCA69CDEBBD7E81C3797DE9DD7E,...,135,False,1604003943,False,,,,1.612962e+09,420,77
