In [3]:
import pandas as pd
import transformers

import re
import gc
import glob
from collections import Counter

features = [
    'text_tokens',    ###############
    'hashtags',       #Tweet Features
    'tweet_id',       #
    'media',          #
    'links',          #
    'domains',        #
    'tweet_type',     #
    'language',       #
    'timestamp',      ###############
    'a_user_id',              ###########################
    'a_follower_count',       #Engaged With User Features
    'a_following_count',      #
    'a_is_verified',          #
    'a_account_creation',     ###########################
    'b_user_id',              #######################
    'b_follower_count',       #Engaging User Features
    'b_following_count',      #
    'b_is_verified',          #
    'b_account_creation',     #######################
    'b_follows_a',    #################### Engagement Features
    'reply',          #Target Reply
    'retweet',        #Target Retweet    
    'retweet_comment',#Target Retweet with comment
    'like',           #Target Like
                      ####################
]

DATA_PATH = "../data/"

In [4]:
df = pd.read_csv(DATA_PATH + "part-00000.tsv", header=None, names=features, sep="\x01")
df.head()

Unnamed: 0,text_tokens,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,...,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like
0,101\t56898\t137\t10148\t96858\t18193\t11211\t1...,DF367A73CB84842E4A6080F624241703,C24EA5C3E19A388895D0FA05EA9E742F,Photo,,,Retweet,E7F038DE3EAD397AEC9193686C911677,1612712747,419BAC1CC63FF819E98CFC5EFF52DFDA,...,761EDF35532C3D3758DD01D33619D243,220,638,False,1464236635,False,,,,1612721000.0
1,101\t22800\t11206\t117\t13451\t20517\t25136\t1...,,2B41BDE7839DE3117D994FDFC9BDA6E1,,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1614002023,547810FE65A44B3828FFA4587D1CA0C8,...,AE8B92F8CFE9F92C2942DEBBFE4F0FCF,392,487,False,1362687553,True,,,,
2,101\t119\t119\t119\t45857\t85270\t10169\t77086...,,BAF10FE3481E874ED104AC0950F18784,,,,Quote,488B32D24BD4BB44172EB981C1BCA6FA,1612452111,E18710A5B40F9187026AA49A220857BC,...,ED5574665DB1478CA23F13647178D9BE,1306,5000,False,1244857625,True,,,,1612458000.0
3,101\t160\t39187\t10237\t100\t11337\t100\t13028...,,BCDD3DDDA04C2E7843B3734620EE0C38,Photo,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612980390,0CC43EBBB61E292102397B8B08A73DDD,...,585F08165162A5C10DE2081ECFACCFC0,90,91,False,1551356429,False,,,,1612981000.0
4,101\t56898\t137\t40586\t10305\t53244\t22659\t1...,86458A61FFFA24A5624A9AD8AA2F0F52,43845DCDBC2A2CA15C3B18431F48F1F8,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1612834174,29766FB1458BC9BB67EB7E61CBAB957C,...,CAD64086EF823706DEF953E615DB6743,158,411,False,1417656869,False,,,,


In [5]:
df.dtypes

text_tokens            object
hashtags               object
tweet_id               object
media                  object
links                  object
domains                object
tweet_type             object
language               object
timestamp               int64
a_user_id              object
a_follower_count        int64
a_following_count       int64
a_is_verified            bool
a_account_creation      int64
b_user_id              object
b_follower_count        int64
b_following_count       int64
b_is_verified            bool
b_account_creation      int64
b_follows_a              bool
reply                 float64
retweet               float64
retweet_comment       float64
like                  float64
dtype: object

In [6]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenizer_new = transformers.XLMRobertaTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")

In [7]:
text_tokens_lst = df['text_tokens'].map(lambda x: tokenizer.decode([int(s) for s in x.split('\t')]))
text_tokens_lst_new = df['text_tokens'].map(lambda x: tokenizer_new.decode([int(s) for s in x.split('\t')]))

In [8]:
from functools import reduce
zipped = zip(text_tokens_lst, text_tokens_lst_new)
all_equal = reduce(lambda acc, x: acc and (x[0]==x[1]), zipped, True) # replace with func to stop early

print(all_equal)

False


In [9]:
example_link_comment = text_tokens_lst[3]
example_hashtag_link = text_tokens_lst[6]
example_other = text_tokens_lst[1]
print(example_link_comment)
print(example_hashtag_link)
print(example_other)

[CLS] Which [UNK] will [UNK] you [UNK] get [UNK] first? https : / / t. co / VMzG9d3XCC [SEP]
[CLS] # CivilDisobedienceMovement ¶ # JusticeForMyanmar https : / / t. co / rCiRJADZOW [SEP]
[CLS] Oh well, next stop Rotterdam @ janniksin, hope to see you play there. : ) [SEP]


In [15]:
def clean_and_count_urls(tweet, counter=None):
    str_list = []
    prev_idx = 0
    link_cnt = 0
    for url in re.finditer("(https : \/(\s(\/\s)?\w+(\.)?)+)", tweet):
        str_list.append(tweet[prev_idx:url.start()])
        str_list.append(tweet[url.start():url.end()].replace(" ", ""))
        prev_idx = url.end() + 1
        link_cnt += 1
        
    if counter is not None:
        counter[link_cnt] += 1
    if(len(str_list)==0):
        return tweet
    return "".join(str_list)



def remove_urls(tweet, counter = None):
    str_list = []
    prev_idx = 0
    link_cnt = 0
    for url in re.finditer("(https : \/(\s(\/\s)?\w+(\.)?)+)", tweet):
        str_list.append(tweet[prev_idx:url.start()])
        #str_list.append("")
        prev_idx = url.end() + 1
        link_cnt += 1
        
    if counter is not None:
        counter[link_cnt] += 1
    if(len(str_list)==0):
        return tweet
    return "".join(str_list)

def replace_urls(tweet, counter = None):
    str_list = []
    prev_idx = 0
    link_cnt = 0
    for url in re.finditer("(https : \/(\s(\/\s)?\w+(\.)?)+)", tweet):
        str_list.append(tweet[prev_idx:url.start()])
        str_list.append("[LINK]")
        prev_idx = url.end() + 1
        link_cnt += 1
        
    if counter is not None:
        counter[link_cnt] += 1
    if(len(str_list)==0):
        return tweet
    return "".join(str_list)

In [16]:
print(clean_and_count_urls(example_link_comment))
print(clean_and_count_urls(example_hashtag_link))

print(remove_urls(example_link_comment))
print(remove_urls(example_hashtag_link))

print(replace_urls(example_link_comment))
print(replace_urls(example_hashtag_link))

[CLS] Which [UNK] will [UNK] you [UNK] get [UNK] first? https://t.co/VMzG9d3XCC
[CLS] # CivilDisobedienceMovement ¶ # JusticeForMyanmar https://t.co/rCiRJADZOW
[CLS] Which [UNK] will [UNK] you [UNK] get [UNK] first? 
[CLS] # CivilDisobedienceMovement ¶ # JusticeForMyanmar 
[CLS] Which [UNK] will [UNK] you [UNK] get [UNK] first? [LINK]
[CLS] # CivilDisobedienceMovement ¶ # JusticeForMyanmar [LINK]


In [9]:
for i in range(0,25):
    print(clean_and_count_urls(text_tokens_lst[i]))

[CLS] RT @ sakuramt6 : [UNK] # みかじ 絵 https://t.co/UdWRUepkMQ
[CLS] Oh well, next stop Rotterdam @ janniksin, hope to see you play there. : ) [SEP]
[CLS]... replacing Ares with Athena [UNK] [SEP]
[CLS] Which [UNK] will [UNK] you [UNK] get [UNK] first? https://t.co/VMzG9d3XCC
[CLS] RT @ BrainzNBoobiez : Whew... this just gave me chills... # TheWaitIsOver [SEP]
[CLS] Diktaturmegafonen [SEP]
[CLS] # CivilDisobedienceMovement ¶ # JusticeForMyanmar https://t.co/rCiRJADZOW
[CLS] RT @ mystarplay : [UNK] # 스타플레이 # 3월 # 생일투표 1 / [UNK] ¶ [UNK] # 3월생일 # [UNK] ¶ ¶ [UNK] ¶ [UNK] 초대형 생일전광판 ¶ [UNK] 파르나스몰 미디어패널 ( 39개 ) ¶ [UNK] ¶ ¶ [UNK] ¶ [UNK] ¶ ¶ # starplay 앱에서 [UNK] [SEP]
[CLS] RT @ FaseehMangi : Pakistan fiscal deficit rises to 2. 5 % in six months vs 2. 3 %, some key numbers : ¶ Tax revenue [UNK] - 0. 4 % ¶ Government expenditures [UNK] [SEP]
[CLS] The Right will push conspiracy theories and fascistic fearmongering until it stops being profitable or costs them power. ¶ ¶ They [UNK] re not going to

In [10]:
cnt = Counter()

links_fixed = text_tokens_lst.map(lambda x: clean_and_count_urls(x, cnt))

In [11]:
print(cnt)

Counter({0: 1771144, 1: 1474707, 2: 49736, 3: 3857, 4: 842, 5: 291, 6: 115, 7: 50, 8: 14, 9: 9, 10: 4, 11: 1})


In [3]:
def remove_urls_and_count(tweet):
    str_list = []
    prev_idx = 0
    link_cnt = 0
    for url in re.finditer("(https : \/(\s(\/\s)?\w+(\.)?)+)", tweet):
        str_list.append(tweet[prev_idx:url.start()])
        str_list.append(tweet[url.start():url.end()].replace(" ", ""))
        prev_idx = url.end() + 1
        link_cnt += 1
        
    if(len(str_list)==0):
        return (tweet, 0)
    return ("".join(str_list), link_cnt)

#print(remove_urls_and_count(example_link_comment))

In [16]:
all_files = glob.glob(DATA_PATH+"*.tsv")
all_files.sort()

train_files = all_files[:10]
valid_files = all_files[-10:-5]
test_files = all_files[-5:]

In [7]:
valid_df = pd.concat((pd.read_csv(valid_file, header=None, names=features, sep="\x01") for valid_file in valid_files))
valid_df.to_csv(DATA_PATH+"valid_small.csv")

In [8]:
test_df = pd.concat((pd.read_csv(test_file, header=None, names=features, sep="\x01") for test_file in test_files))
test_df.to_csv(DATA_PATH+"test_small.csv")

In [12]:
del valid_df
del test_df
gc.collect()

137

In [17]:
train_df = pd.concat((pd.read_csv(train_file, header=None, names=features, sep="\x01") for train_file in train_files))

In [20]:
text_tokens_lst = train_df['text_tokens'].map(lambda x: tokenizer.decode([int(s) for s in x.split('\t')]))

In [21]:
cleaned_urls = text_tokens_lst.map(lambda x: remove_urls_and_count(x))
del text_tokens_lst

In [27]:
a, b = zip(*cleaned_urls[:5])
urls, counts = zip(*cleaned_urls)

In [28]:
del cleaned_urls

In [29]:
train_df['url_cnt'] = counts
train_df.to_csv(DATA_PATH+"train_small.csv")

In [45]:
def convert_tweet_back_to_ids(tweet):
    ids_lst = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(urls[3]))
    ids_lst = [str(id) for id in ids_lst]
    
    return '\t'.join(ids_lst)
    
train_df['text_tokens'].apply(convert_tweet_back_to_ids)
train_df.head()

Unnamed: 0,text_tokens,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,...,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,url_cnt
0,101\t56898\t137\t10148\t96858\t18193\t11211\t1...,DF367A73CB84842E4A6080F624241703,C24EA5C3E19A388895D0FA05EA9E742F,Photo,,,Retweet,E7F038DE3EAD397AEC9193686C911677,1612712747,419BAC1CC63FF819E98CFC5EFF52DFDA,...,220,638,False,1464236635,False,,,,1612721000.0,1
1,101\t22800\t11206\t117\t13451\t20517\t25136\t1...,,2B41BDE7839DE3117D994FDFC9BDA6E1,,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1614002023,547810FE65A44B3828FFA4587D1CA0C8,...,392,487,False,1362687553,True,,,,,0
2,101\t119\t119\t119\t45857\t85270\t10169\t77086...,,BAF10FE3481E874ED104AC0950F18784,,,,Quote,488B32D24BD4BB44172EB981C1BCA6FA,1612452111,E18710A5B40F9187026AA49A220857BC,...,1306,5000,False,1244857625,True,,,,1612458000.0,0
3,101\t160\t39187\t10237\t100\t11337\t100\t13028...,,BCDD3DDDA04C2E7843B3734620EE0C38,Photo,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612980390,0CC43EBBB61E292102397B8B08A73DDD,...,90,91,False,1551356429,False,,,,1612981000.0,1
4,101\t56898\t137\t40586\t10305\t53244\t22659\t1...,86458A61FFFA24A5624A9AD8AA2F0F52,43845DCDBC2A2CA15C3B18431F48F1F8,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1612834174,29766FB1458BC9BB67EB7E61CBAB957C,...,158,411,False,1417656869,False,,,,,0


In [46]:
train_df.to_csv(DATA_PATH+"train_small_no_urls.csv")