In [1]:
import os, re
import pandas as pd
import preprocessor as p

In [2]:
# Configure tweet-preprocessor: p.clean() will act on mentions, URLs,
# hashtags, emojis and smileys.
p.set_options(
    p.OPT.MENTION,
    p.OPT.URL,
    p.OPT.HASHTAG,
    p.OPT.EMOJI,
    p.OPT.SMILEY,
)

# Sanity check on a retweet and a plain tweet. The retweet keeps an "RT :"
# residue, which is why a dedicated wrapper is needed for retweets.
print(
    p.clean("RT @test: Foobar #awesome https://www.baz.me 22/7 is pi yup!"),
    p.clean("@test Foobar #awesome https://www.baz.me 22/7 is pi yup!"),
    sep="\n",
)

def clean(word):
    """Clean a raw tweet with tweet-preprocessor, dropping any retweet header.

    Args:
        word: The raw tweet text (a ``str``).

    Returns:
        The result of ``p.clean`` (using the options set via
        ``p.set_options``) applied to the tweet after a leading
        ``"RT @user:"`` header, if present, has been removed.
    """
    # Strip only a genuine retweet header ("RT", whitespace, "@handle",
    # optional colon). Checking just the first two characters and slicing by
    # fixed offsets would eat real text from tweets such as "RTFM ..." or
    # from retweets that have no colon after the handle.
    body = re.sub(r"^RT\s+@\w+:?\s*", "", word)
    return p.clean(body)

# With the wrapper, the retweet and the plain tweet should clean to the same text.
print(
    clean("RT @test: Foobar #awesome https://www.baz.me 22/7 is pi yup!"),
    clean("@test Foobar #awesome https://www.baz.me 22/7 is pi yup!"),
    sep="\n",
)

RT : Foobar 22/7 is pi yup!
Foobar 22/7 is pi yup!
Foobar 22/7 is pi yup!
Foobar 22/7 is pi yup!


In [3]:
# Accumulators filled by the loading loop: one list of per-emoji DataFrames
# per split, plus the row count of each of those DataFrames.
train_data, test_data, valid_data = [], [], []
size_table = {"train": [], "test": [], "valid": []}

# Build the path with os.path.join instead of a backslash literal: "\D", "\O"
# and "\e" are invalid escape sequences in a normal string, and a hard-coded
# backslash separator only resolves on Windows.
emoji_map = pd.read_csv(
    os.path.join(os.getcwd(), "Datasets", "Original", "emoji_map_1791.csv")
)

# Row positions in emoji_map of the 12 emojis used as class labels.
emoji_idx = [1381, 1424, 1392, 1447, 186, 1389, 1420, 1620, 1403, 763, 1138, 1446]

In [4]:
# Resolve each selected row position to the emoji stored in the first column
# of emoji_map; a label is the position of its emoji in this list.
emoji_list = [emoji_map.iloc[row, 0] for row in emoji_idx]
emoji_list

['😂', '😭', '😍', '🙄', '❤', '😊', '😩', '🤔', '😘', '🏽', '💯', '🙃']

In [5]:
# The third-from-last selected entry behaves differently from the others, so
# print it on its own.
# NOTE(review): it looks like a skin-tone modifier rather than a standalone
# emoji - confirm that it is meant to be a class label.
print(emoji_map.iat[emoji_idx[-3], 0])

🏽


In [6]:
def list_to_emoji(string):
    """Convert a stringified index list such as "[1381, 1424]" into "😂😭".

    Args:
        string: Text of the form "[i, j, ...]" holding row positions in
            ``emoji_map``.

    Returns:
        The emojis from the first column of ``emoji_map`` at those positions,
        concatenated into one string.
    """
    positions = [int(token) for token in string[1:-1].split(",")]
    return "".join(emoji_map.iloc[pos, 0] for pos in positions)


def emoji_to_index(emoji):
    """Return the position of ``emoji`` in ``emoji_list`` (e.g. "😂" -> 0).

    Raises:
        ValueError: If ``emoji`` is not in ``emoji_list``.
    """
    return emoji_list.index(emoji)


def _load_split(split, num):
    """Download and prepare one split ("train", "test" or "valid") of file ``num``.

    The returned DataFrame keeps only the rows whose annotation is exactly the
    ``num``-th emoji of ``emoji_list``, gains an integer ``labels`` column and
    has its ``tweets`` column passed through ``clean``.
    """
    url = (
        "https://raw.githubusercontent.com/RussellDash332/CS3244-Twemoji/main/"
        f"Datasets/{split}_text_emoji_{num}.csv"
    )
    df = pd.read_csv(url)

    # Convert "[1381, 1424]" to "😂😭"
    df["annotations"] = df["annotations"].apply(list_to_emoji)

    # Drop the rows with multiple emojis and keep the single-labelled ones.
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not raise SettingWithCopyWarning.
    df = df[df.annotations == emoji_list[num - 1]].copy()

    # Convert "😂" to 0
    df["labels"] = df["annotations"].apply(emoji_to_index)

    # Apply cleaning from twitter-preprocessor
    df["tweets"] = df["tweets"].apply(clean)
    return df


for num in range(1, 13):
    train_df = _load_split("train", num)
    test_df = _load_split("test", num)
    valid_df = _load_split("valid", num)

    train_data.append(train_df)
    test_data.append(test_df)
    valid_data.append(valid_df)

    # Record the surviving row counts for the size summary table.
    size_table["train"].append(train_df.shape[0])
    size_table["test"].append(test_df.shape[0])
    size_table["valid"].append(valid_df.shape[0])

In [7]:
# Turn the collected row counts into a table with one row per emoji file
# (1-12) and add a per-file total across the three splits.
size_table = pd.DataFrame(size_table, index=list(range(1, 13)))
size_table["sum"] = size_table[["train", "test", "valid"]].sum(axis=1)
size_table

# Notes:
# - The training split is balanced across emojis, so it is good to go.
# - Stick with this train/test/valid combination.
# - These counts were taken before duplicate removal.

Unnamed: 0,train,test,valid,sum
1,4504,1804,1782,8090
2,4240,1570,1429,7239
3,4324,1093,1172,6589
4,4609,1215,1228,7052
5,4232,731,556,5519
6,4567,743,749,6059
7,4345,1308,807,6460
8,4557,1216,1840,7613
9,4086,512,591,5189
10,4289,1724,1715,7728


In [8]:
# Column totals across all emoji files. Use the DataFrame's own sum()
# rather than passing the Python builtin through apply().
size_table.sum()

train    52888
test     13486
valid    13481
sum      79855
dtype: int64

In [9]:
# Inspect the first per-emoji training DataFrame (the one appended for num == 1).
train_data[0]

Unnamed: 0,id,annotations,tweets,labels
0,743419925738496000,😂,school is so dead o my god,0
1,742411940677492736,😂,lol I've been told mine is worse than yours bu...,0
2,744394777974628352,😂,I'm excited to hear them..... That shit is goi...,0
3,743679858199298049,😂,Damn alycia knows everything even indirect tweets,0
4,742579985588887552,😂,That sound like everything,0
...,...,...,...,...
4995,744391978171904000,😂,Always,0
4996,747994577357930497,😂,Had to flex on Tia people,0
4997,746430898464161792,😂,DJ Esco is really the coolest DJ,0
4998,742758236986576897,😂,we be so jobless,0


In [10]:
# Collapse each list of per-emoji DataFrames into a single DataFrame per split.
# From here on the three names refer to DataFrames, not lists.
train_data, test_data, valid_data = (
    pd.concat(frames) for frames in (train_data, test_data, valid_data)
)

print(
    "Before missing and duplicate removal: ",
    train_data.shape,
    test_data.shape,
    valid_data.shape,
)

Before missing and duplicate removal:  (52888, 4) (13486, 4) (13481, 4)


In [11]:
# For every split: remove fully duplicated rows (keeping the first
# occurrence), then remove rows whose cleaned tweet is an empty string.
train_data = train_data.drop_duplicates().loc[lambda frame: frame.tweets != ""]
test_data = test_data.drop_duplicates().loc[lambda frame: frame.tweets != ""]
valid_data = valid_data.drop_duplicates().loc[lambda frame: frame.tweets != ""]

print(
    "After missing and duplicate removal: ",
    train_data.shape,
    test_data.shape,
    valid_data.shape,
)

After missing and duplicate removal:  (52885, 4) (13485, 4) (13481, 4)


In [12]:
# The tweet id is not needed in the exported data, so drop that column from every split.
train_data = train_data.drop(columns=["id"])
test_data = test_data.drop(columns=["id"])
valid_data = valid_data.drop(columns=["id"])

In [13]:
# Spot-check ten rows of the combined training data; the fixed random_state
# makes the same rows come back on every run.
train_data.sample(n=10, random_state=1010)

Unnamed: 0,annotations,tweets,labels
759,😍,I appreciate all you little booty women in thi...,2
3861,😩,Currently having a music shoot on snapchat lol,6
794,❤,Miss you guys so much man,4
1555,💯,. just received his first box! Hit him up for ...,10
3497,💯,I keep telling you nighas !!!!,10
651,😩,Some Jamba Juice sounds sooo good right now,6
15,❤,My heart,4
1261,🙃,Tired all day but wide awake at night,11
3486,🤔,I gotta question,7
2249,💯,Whether you doing good or bad somebody is goin...,10


In [14]:
# Write the cleaned splits into the "Datasets" folder under the current
# working directory. os.path.join replaces the hand-written backslash paths:
# "\D" is an invalid escape sequence and a hard-coded backslash separator
# only resolves on Windows.
output_dir = os.path.join(os.getcwd(), "Datasets")
train_data.to_csv(os.path.join(output_dir, "train_text_emoji_clean.csv"), index=False)
test_data.to_csv(os.path.join(output_dir, "test_text_emoji_clean.csv"), index=False)
valid_data.to_csv(os.path.join(output_dir, "valid_text_emoji_clean.csv"), index=False)

- XLNet -> sentiment analysis / Text Classification (https://www.topbots.com/leading-nlp-language-models-2020/#language-models-2020-3)
- OpenAI’s GPT2 -> supervised learning on task-specific datasets (https://insights.daffodilsw.com/blog/top-5-nlp-language-models)
- Embedding Layers (https://machinelearningmastery.com/what-are-word-embeddings/)
- Word2Vec (https://machinelearningmastery.com/what-are-word-embeddings/)
- GloVe (https://machinelearningmastery.com/what-are-word-embeddings/)
- Parsing (https://www.analyticsvidhya.com/blog/2020/12/understanding-text-classification-in-nlp-with-movie-review-example-example/#h2_6)
- Semantic (https://www.analyticsvidhya.com/blog/2020/12/understanding-text-classification-in-nlp-with-movie-review-example-example/#h2_7)