In [1]:
import os
import pandas as pd
import preprocessor as p

In [2]:
# Setup preprocessor
p.set_options(p.OPT.MENTION, p.OPT.URL, p.OPT.HASHTAG, p.OPT.EMOJI, p.OPT.SMILEY)
print(p.clean("RT: @test Foobar #awesome https://www.baz.me 22/7 is pi yup!"))

def clean(word):
    if word[:2] == "RT":
        return p.clean(word[4:])
    return p.clean(word)

clean("RT: @test Foobar #awesome https://www.baz.me 22/7 is pi yup!")

RT: Foobar 22/7 is pi yup!


'Foobar 22/7 is pi yup!'

In [3]:
train_data, test_data, valid_data = [], [], []
size_table = {"train": [], "test": [], "valid": []}

emoji_map = pd.read_csv(os.getcwd() + "\Datasets\Original\emoji_map_1791.csv")
emoji_idx = [1381, 1424, 1392, 1447, 186, 1389, 1420, 1620, 1403, 763, 1138, 1446]

In [4]:
emoji_list = list(map(lambda x: emoji_map.iloc[x, 0], emoji_idx))
emoji_list

['😂', '😭', '😍', '🙄', '❤', '😊', '😩', '🤔', '😘', '🏽', '💯', '🙃']

In [5]:
# This particular emoji has a different behaviour.
print(emoji_map.iloc[emoji_idx[-3], 0])

🏽


In [6]:
for num in range(1, 13):
    train_df = pd.read_csv(f"https://raw.githubusercontent.com/RussellDash332/CS3244-Twemoji/main/Datasets/train_text_emoji_{num}.csv")
    test_df = pd.read_csv(f"https://raw.githubusercontent.com/RussellDash332/CS3244-Twemoji/main/Datasets/test_text_emoji_{num}.csv")
    valid_df = pd.read_csv(f"https://raw.githubusercontent.com/RussellDash332/CS3244-Twemoji/main/Datasets/valid_text_emoji_{num}.csv")

    def list_to_emoji(string):
        emojis = list(map(int, string[1:-1].split(",")))
        return str().join(sorted(map(lambda x: emoji_map.iloc[x, 0], emojis), key = lambda x: emoji_list.index(x)))

    def emoji_to_index(emojis):
        return tuple(sorted(map(lambda x: emoji_list.index(x), emojis)))

    # Convert "[1381, 1424]" to "😂😭"
    train_df["annotations"] = train_df["annotations"].apply(list_to_emoji)
    test_df["annotations"] = test_df["annotations"].apply(list_to_emoji)
    valid_df["annotations"] = valid_df["annotations"].apply(list_to_emoji)
    
    # Convert "😂😭" to (0, 1)
    train_df["labels"] = train_df["annotations"].apply(emoji_to_index)
    test_df["labels"] = test_df["annotations"].apply(emoji_to_index)
    valid_df["labels"] = valid_df["annotations"].apply(emoji_to_index)

    # Apply cleaning from twitter-preprocessor
    train_df["tweets"] = train_df["tweets"].apply(clean)
    test_df["tweets"] = test_df["tweets"].apply(clean)
    valid_df["tweets"] = valid_df["tweets"].apply(clean)

    train_data.append(train_df)
    test_data.append(test_df)
    valid_data.append(valid_df)

    # print(f"Train data size from emoji {num}:", train_df.shape)
    # print(f"Test data size from emoji {num}:", test_df.shape)
    # print(f"Valid data size from emoji {num}:", valid_df.shape)

    size_table["train"].append(train_df.shape[0])
    size_table["test"].append(test_df.shape[0])
    size_table["valid"].append(valid_df.shape[0])

In [7]:
size_table = pd.DataFrame(size_table, index=list(range(1, 13)))
size_table["sum"] = size_table.train + size_table.test + size_table.valid
size_table

# training is balanced, good to go
# stick with this t-t-v combination
# this was before duplicate removal

Unnamed: 0,train,test,valid,sum
1,5000,2000,2000,9000
2,5000,1846,1708,8554
3,5000,1272,1384,7656
4,5000,1325,1355,7680
5,5000,865,683,6548
6,5000,824,815,6639
7,5000,1481,921,7402
8,5000,1319,2000,8319
9,5000,621,713,6334
10,5000,2000,2000,9000


In [8]:
size_table.apply(sum)

train    60000
test     15252
valid    15394
sum      90646
dtype: int64

In [9]:
train_data[0]

Unnamed: 0,id,annotations,tweets,labels
0,743419925738496000,😂,school is so dead o my god,"(0,)"
1,742411940677492736,😂,lol I've been told mine is worse than yours bu...,"(0,)"
2,744394777974628352,😂,I'm excited to hear them..... That shit is goi...,"(0,)"
3,743679858199298049,😂,Damn alycia knows everything even indirect tweets,"(0,)"
4,742579985588887552,😂,x_juicebox: That sound like everything,"(0,)"
...,...,...,...,...
4995,744391978171904000,😂,Always,"(0,)"
4996,747994577357930497,😂,Had to flex on Tia people,"(0,)"
4997,746430898464161792,😂,DJ Esco is really the coolest DJ,"(0,)"
4998,742758236986576897,😂,we be so jobless,"(0,)"


In [10]:
train_data = pd.concat(train_data)
test_data = pd.concat(test_data)
valid_data = pd.concat(valid_data)

print("Before duplicate removal: ", train_data.shape, test_data.shape, valid_data.shape)

train_data = train_data[~train_data.duplicated()]
test_data = test_data[~test_data.duplicated()]
valid_data = valid_data[~valid_data.duplicated()]

print("After duplicate removal: ", train_data.shape, test_data.shape, valid_data.shape)

Before duplicate removal:  (60000, 4) (15252, 4) (15394, 4)
After duplicate removal:  (59907, 4) (15185, 4) (15333, 4)


In [11]:
train_data = train_data.drop("id", axis=1)
test_data = valid_data.drop("id", axis=1)
valid_data = valid_data.drop("id", axis=1)

In [12]:
train_data.sample(n=10, random_state=1010)

Unnamed: 0,annotations,tweets,labels
2650,😍❤,When The ticket sale start i hope i Can get To...,"(2, 4)"
3241,🙃,"""Friends can break your heart too"" if that ain...","(11,)"
4264,😍,I'm so grateful for my girl,"(2,)"
2000,💯,I deserve better and ins get it.. This shit ai...,"(10,)"
1659,😂🏽,"juahoe: im sorry ""wtf u mean you dont do blow?...","(0, 9)"
2938,🤔,well today,"(7,)"
2012,💯,DJ837: Proud of the DMV music scene right now ...,"(10,)"
305,🙄,I think everyone knows that periods and period...,"(3,)"
317,😘,Happy Birthday kiddo,"(8,)"
3834,😘,thank you!!! Love you too,"(8,)"


In [13]:
train_data.to_csv(os.getcwd() + "\Datasets\\train_text_emoji_clean.csv", index=False)
test_data.to_csv(os.getcwd() + "\Datasets\\test_text_emoji_clean.csv", index=False)
valid_data.to_csv(os.getcwd() + "\Datasets\\valid_text_emoji_clean.csv", index=False)

- XLNet -> sentiment analysis / Text Classification (https://www.topbots.com/leading-nlp-language-models-2020/#language-models-2020-3)
- OpenAI’s GPT2 -> supervised learning on task-specific datasets (https://insights.daffodilsw.com/blog/top-5-nlp-language-models)
- Embedding Layers (https://machinelearningmastery.com/what-are-word-embeddings/)
- Word2Vec (https://machinelearningmastery.com/what-are-word-embeddings/)\
- GloVe (https://machinelearningmastery.com/what-are-word-embeddings/)
- Parsing (https://www.analyticsvidhya.com/blog/2020/12/understanding-text-classification-in-nlp-with-movie-review-example-example/#h2_6)
- Semantic (https://www.analyticsvidhya.com/blog/2020/12/understanding-text-classification-in-nlp-with-movie-review-example-example/#h2_7)