In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lib.dataset_utils import *
from sklearn.preprocessing import MultiLabelBinarizer
import seaborn as sns
import os
from transformers import BertTokenizer

In [2]:
train_df, val_df, test_df = load_dataset(DatasetEnum.TwitterData, k_hot_encode=True)
label_names = train_df.columns[1:]

### Removing links and pictures or other non insteresting tokens

The problem here is that links appear as very different sequences of tokens, so we will proceed step by step

In [3]:
train_df[train_df["text"].str.contains(r"(?: href|http|(?: ^|\b)www(?: $|\b))", regex=True)]

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise
125,i feel they are pretty safe on my blog img src...,0,0,1,0,0,0
323,i stopped feeling so exhausted a href http pro...,0,0,0,0,1,0
462,i feel so dazed a href http twitter,0,0,0,0,0,1
866,i feel unwelcome at work sometimes and think p...,0,0,0,0,1,0
967,i a href http feeling groggy,0,0,0,0,1,0
...,...,...,...,...,...,...,...
15344,i feel more confident already a href http john...,0,0,1,0,0,0
15518,i feel honoured to be asked thanks a href http...,0,0,1,0,0,0
15732,i feel like it was just a title mimm fall insp...,0,0,1,0,0,0
15779,i cant always identify with peoples struggles ...,0,0,0,0,1,0


these seems to be useful tokens to identify links or other data added to tweets
- script
- title bookmark
- img
- src
- href
- a (unfortunately)
- http
- nofollow

they always appear from one point till the end of the tokens of any record, so the important part is to understand where to split

In [4]:
occ = train_df[train_df["text"].str.contains(r"(?: nofollow)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise
4966,i feel horrible about wanting sonipro amp sour...,0,0,0,0,1,0
7195,i i feel for you rel nofollow add to delicious...,0,0,1,0,0,0
8153,i feel like an idiot around my friends target ...,0,0,0,0,1,0
9268,i feel the earth move tribute to carole king k...,0,0,0,0,1,0
9501,i feel summer session title bookmark at digg r...,0,0,0,0,1,0
9527,i sound feeling ballroom cd rel nofollow targe...,0,0,0,0,1,0
13275,i feel a change coming espa a hd target blank ...,0,0,0,0,1,0
15305,i feel that i shouldnt be his back up a rel no...,0,0,0,0,1,0


in this sample, a is always used to start metadata

In [5]:
# cut sentences from the specified tokens
tok_ids = train_df["text"].str.contains(r"(?: (?: ^|\b)nofollow(?: $|\b))", regex=True)
train_df.loc[tok_ids, 'text'] = train_df[tok_ids]['text'].str.split('a').str[0]
tok_ids = train_df["text"].str.contains(r"(?: (?: ^|\b)nofollow(?: $|\b))", regex=True)
train_df.loc[tok_ids, 'text'] = train_df[tok_ids]['text'].str.split('rel').str[0]
tok_ids = train_df["text"].str.contains(r"(?: (?: ^|\b)nofollow(?: $|\b))", regex=True)
train_df.loc[tok_ids, 'text'] = train_df[tok_ids]['text'].str.split('target').str[0]

In [6]:
occ = train_df[train_df["text"].str.contains(r"(?: \bscript\b)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise
3496,im feeling the moxie fab love cath script src ...,0,0,1,0,0,0
11750,im happy to have finished the script s its goo...,0,0,0,0,1,0
12033,i just have a weird feeling that there was not...,0,0,1,0,0,0


In [7]:
tok_ids = train_df["text"].str.contains(r"(?: script)", regex=True)
tok_ids[11750] = False
train_df.loc[tok_ids, 'text'] = train_df[tok_ids]['text'].str.split('script').str[0]

In [8]:
occ = train_df[train_df["text"].str.contains(r"(?: img)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise
125,i feel they are pretty safe on my blog img src...,0,0,1,0,0,0
1439,i feel fearless janelle mon e elle canada febr...,0,0,1,0,0,0
1542,i feel tat all of us in this world are clever ...,0,0,1,0,0,0
1726,i could just feel the joy rage coming at me fo...,0,0,1,0,0,0
2031,i use this day and night and sometimes when i ...,0,0,0,0,1,0
2279,i never feel fucked the week after i used some...,1,0,0,0,0,0
2561,i apologize to all the ppl i dragged along wit...,0,0,0,0,1,0
2754,i feel really joyful img src http s,0,0,1,0,0,0
4975,i feel more adventurous willing to take risks ...,0,0,1,0,0,0
5300,i feel studying and doing homework again after...,0,0,0,0,1,0


In [9]:
tok_ids = train_df["text"].str.contains(r"(?: img)", regex=True)
train_df.loc[tok_ids, 'text'] = train_df[tok_ids]['text'].str.split('img').str[0]
train_df.loc[tok_ids, 'text'] = train_df[tok_ids]['text'].str.split('annotation').str[0]

In [10]:
occ = train_df[train_df["text"].str.contains(r"(?: title bookmark)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise


In [11]:
tok_ids = train_df["text"].str.contains(r"(?: title bookmark)", regex=True)
train_df.loc[tok_ids, 'text'] = train_df[tok_ids]['text'].str.split('title bookmark').str[0]

In [12]:
occ = train_df[train_df["text"].str.contains(r"(?: src(?: url)?)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise
5156,im feeling generous heres a holiday classic fo...,0,0,0,1,0,0
5322,i feel like it dirty src http i,0,0,0,0,1,0
12219,i feel dirty srcurl http draftbloger,0,0,0,0,1,0
14581,i feel all kinds of dirty and not a good dirty...,0,0,0,0,1,0


In [13]:
tok_ids = train_df["text"].str.contains(r"(?: src(?: url)?)", regex=True)
train_df.loc[tok_ids, 'text'] = train_df[tok_ids]['text'].str.split('src').str[0]
train_df.loc[tok_ids, 'text'] = train_df[tok_ids]['text'].str.split('srcurl').str[0]

### Finding a good regex for links to substitute with [LINK]

In [14]:
occ = train_df[train_df["text"].str.contains(r"(?: \bhref\b)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise
323,i stopped feeling so exhausted a href http pro...,0,0,0,0,1,0
462,i feel so dazed a href http twitter,0,0,0,0,0,1
866,i feel unwelcome at work sometimes and think p...,0,0,0,0,1,0
967,i a href http feeling groggy,0,0,0,0,1,0
1162,i really feel that we are progressing towards ...,0,0,1,0,0,0
...,...,...,...,...,...,...,...
15344,i feel more confident already a href http john...,0,0,1,0,0,0
15518,i feel honoured to be asked thanks a href http...,0,0,1,0,0,0
15732,i feel like it was just a title mimm fall insp...,0,0,1,0,0,0
15779,i cant always identify with peoples struggles ...,0,0,0,0,1,0


In [15]:
train_df["text"].str.contains(r"(?: a href)", regex=True).sum()

135

In [16]:
train_df["text"].str.contains(r"(?: a href https?)", regex=True).sum()

134

In [17]:
train_df["text"].str.contains(r"(?: href https?)", regex=True).sum()

155

In [18]:
train_df["text"].str.contains(r"(?: a href http www)", regex=True).sum()

31

In [19]:
train_df["text"].str.contains(r"(?: \bwww\b)", regex=True).sum()

44

In [20]:
link_regex = r"(?: (?: (?:a |link ))?href https?(?: \w+|$))"
train_df["text"].str.contains(link_regex, regex=True).sum()

155

In [21]:
regex_ids = train_df["text"].str.contains(link_regex, regex=True)
occ = train_df["text"].str.contains(r"(?: \bhref\b)", regex=True)
(regex_ids & occ).sum()

155

In [22]:
train_df["text"] = train_df["text"].str.replace(link_regex, '[LINK]', regex=True)

### Same for validation and test

In [23]:
occ = val_df[val_df["text"].str.contains(r"(?: nofollow)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise
679,i like to show the homeowners these catalogs t...,0,0,0,0,1,0
1848,i feel like it title share on reddit reddit a ...,0,0,0,0,1,0


in this sample, a is always used to start metadata

In [24]:
# cut sentences from the specified tokens
tok_ids = val_df["text"].str.contains(r"(?: (?: ^|\b)nofollow(?: $|\b))", regex=True)
val_df.loc[tok_ids, 'text'] = val_df[tok_ids]['text'].str.split('a').str[0]

In [25]:
occ = val_df[val_df["text"].str.contains(r"(?: \bscript\b)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise


In [26]:
occ = val_df[val_df["text"].str.contains(r"(?: img)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise


In [27]:
occ = val_df[val_df["text"].str.contains(r"(?: title bookmark)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise


In [28]:
occ = val_df[val_df["text"].str.contains(r"(?: src(?: url)?)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise
101,im feeling nostalgic cant beat the corys ifram...,0,0,0,1,0,0


In [29]:
tok_ids = val_df["text"].str.contains(r"(?: src(?: url)?)", regex=True)
val_df.loc[tok_ids, 'text'] = val_df[tok_ids]['text'].str.split('src').str[0]

In [30]:
occ = val_df[val_df["text"].str.contains(r"(?: \bhref\b)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise
56,i feel sometimes i am like heartless tin woodm...,1,0,0,0,0,0
63,i feel perfect with you on facebook href http www,0,0,1,0,0,0
386,i feel so embarrassed and humiliated korean at...,0,0,0,0,1,0
403,i dolphins feel sweet taste of victory defeat ...,0,0,1,0,0,0
449,i feel much more comfortable finding those peo...,0,0,1,0,0,0
579,i feel loyal to a href http www,0,0,0,1,0,0
581,i still feel horny from that little a href htt...,0,0,0,1,0,0
666,i feel horny a class arialblue href chat,0,0,0,1,0,0
674,i feel so amazing and i m so by a href http yo...,0,0,1,0,0,0
706,i feel sorry for a href http bluestarlight,0,0,0,0,1,0


In [31]:
link_regex = r"(?: (?: (?:a |link ))?href(?: https?)?(?: \w+|$))"
val_df["text"].str.contains(link_regex, regex=True).sum()

22

In [32]:
regex_ids = val_df["text"].str.contains(link_regex, regex=True)
occ = val_df["text"].str.contains(r"(?: \bhref\b)", regex=True)
(regex_ids & occ).sum()

22

In [33]:
val_df["text"] = val_df["text"].str.replace(link_regex, '[LINK]', regex=True)

In [34]:
occ = test_df[test_df["text"].str.contains(r"(?: nofollow)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise
806,i feel the earth move tribute to carole king k...,0,0,0,0,1,0


in this sample, a is always used to start metadata

In [35]:
# cut sentences from the specified tokens
tok_ids = test_df["text"].str.contains(r"(?: (?: ^|\b)nofollow(?: $|\b))", regex=True)
test_df.loc[tok_ids, 'text'] = test_df[tok_ids]['text'].str.split('rel').str[0]

In [36]:
occ = test_df[test_df["text"].str.contains(r"(?: \bscript\b)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise
164,i just went about my script of would you like ...,0,1,0,0,0,0
270,i didnt feel i rushed things dhawan tweet scri...,1,0,0,0,0,0
907,i dropped off the script and left feeling diss...,1,0,0,0,0,0


In [37]:
tok_ids = test_df["text"].str.contains(r"(?: \bscript\b)", regex=True)
test_df.loc[tok_ids, 'text'] = test_df[tok_ids]['text'].str.split('tweet').str[0]

In [38]:
occ = test_df[test_df["text"].str.contains(r"(?: img)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise


In [39]:
occ = test_df[test_df["text"].str.contains(r"(?: title bookmark)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise
1548,i feel it like you target blank class di title...,0,0,0,0,1,0


In [40]:
tok_ids = test_df["text"].str.contains(r"(?: title bookmark)", regex=True)
test_df.loc[tok_ids, 'text'] = test_df[tok_ids]['text'].str.split('target').str[0]

In [41]:
occ = test_df[test_df["text"].str.contains(r"(?: src(?: url)?)", regex=True)]
occ#TODO what is this thing?

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise
779,i input class size medium wp image height src ...,0,0,1,0,0,0


In [42]:
occ = test_df[test_df["text"].str.contains(r"(?: \bhref\b)", regex=True)]
occ

Unnamed: 0,text,anger,fear,joy,love,sadness,surprise
33,i feel so cold a href http irish,1,0,0,0,0,0
52,i feel like i could go into any situation and ...,0,0,1,0,0,0
99,i was still feeling weepy and strung out so ma...,0,0,0,0,1,0
154,i really like this person feel that the questi...,0,0,1,0,0,0
176,i feel brave today heading to amman and beirut...,0,0,1,0,0,0
288,i feel pretty much like this scene from a href...,0,0,1,0,0,0
375,im feeling a lot less ugly duckling and a lot ...,0,0,0,0,1,0
426,i feel unprotected a class post count link hre...,0,0,0,0,1,0
511,i feel i have to give credit to jen mitchell f...,0,0,1,0,0,0
773,ive also had a nosy on the website and seeing ...,0,0,1,0,0,0


In [43]:
link_regex = r"(?: (?: (?:a |link ))?href(?: https?)?(?: \w+|$))"
test_df["text"].str.contains(link_regex, regex=True).sum()

23

In [44]:
regex_ids = test_df["text"].str.contains(link_regex, regex=True)
occ = test_df["text"].str.contains(r"(?: \bhref\b)", regex=True)
(regex_ids & occ).sum()

23

In [45]:
test_df["text"] = test_df["text"].str.replace(link_regex, '[LINK]', regex=True)

In [46]:
# remove long sequences of same character (max 5)
train_df['text'] = train_df['text'].str.replace(r'(.)\1{5,}', r'\1\1\1\1\1', regex=True)
val_df['text'] = val_df['text'].str.replace(r'(.)\1{5,}', r'\1\1\1\1\1', regex=True)#TODO test?

In [47]:
OUT_DIR = "./dataset/TwitterDataCleaned/"

In [48]:
# Save the cleaned dataset
if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)
train_df.to_csv(OUT_DIR + "train.tsv", sep='\t', index=False)
val_df.to_csv(OUT_DIR + "val.tsv", sep='\t', index=False)
test_df.to_csv(OUT_DIR + "test.tsv", sep='\t', index=False)