In [97]:
# The basic ones
import random
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import cv2 as cv

# Almost everything from sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, mean_squared_error, accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Tensorflow if needed
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D, Activation, Dropout
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint

# NLP packages
# !pip install nltk
import nltk

# Fetch the NLTK resources used by the tokenizer, stop-word list,
# lemmatizer and POS tagger imported below. Each call is a no-op (apart from
# a log line) when the package is already present locally.
for resource in ("punkt", "stopwords", "wordnet", "averaged_perceptron_tagger"):
    nltk.download(resource)
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [98]:
# Load the per-emoji train/test/valid CSV files from the project repository.
#
# The URL template, the number of emoji files and the split names are each
# declared once, so the three splits cannot drift apart through copy-paste
# edits and the emoji count is no longer a bare magic number.
DATASET_URL = (
    "https://raw.githubusercontent.com/RussellDash332/CS3244-Twemoji/"
    "main/Datasets/{split}_text_emoji_{num}.csv"
)
NUM_EMOJIS = 12
SPLITS = ("train", "test", "valid")

# One list of DataFrames per split, ordered by emoji number (1..NUM_EMOJIS).
frames_by_split = {split: [] for split in SPLITS}
for num in range(1, NUM_EMOJIS + 1):
    for split in SPLITS:
        frames_by_split[split].append(
            pd.read_csv(DATASET_URL.format(split=split, num=num))
        )

# Same names and types the rest of the notebook expects: three lists of
# DataFrames, plus a dict mapping split name -> list of row counts per emoji.
train_data, test_data, valid_data = (frames_by_split[split] for split in SPLITS)
size_table = {
    split: [frame.shape[0] for frame in frames_by_split[split]]
    for split in SPLITS
}

In [99]:
# Turn the per-split row counts into a table with one row per emoji number
# (1 through 12) and one column per split, then display it.
emoji_numbers = list(range(1, 13))
size_table = pd.DataFrame(data=size_table, index=emoji_numbers)
size_table

Unnamed: 0,train,test,valid
1,5000,2000,2000
2,5000,1846,1708
3,5000,1272,1384
4,5000,1325,1355
5,5000,865,683
6,5000,824,815
7,5000,1481,921
8,5000,1319,2000
9,5000,621,713
10,5000,2000,2000


In [100]:
# Merge the per-emoji frames of each split into a single DataFrame per split.
# NOTE: concat keeps each file's own row labels, so index values repeat
# across emojis in the merged frames.
train_data, test_data, valid_data = (
    pd.concat(frames) for frames in (train_data, test_data, valid_data)
)

print("Before duplicate removal: ", train_data.shape, test_data.shape, valid_data.shape)

# Drop rows that are exact copies of an earlier row (first occurrence kept).
train_data = train_data.drop_duplicates()
test_data = test_data.drop_duplicates()
valid_data = valid_data.drop_duplicates()

print("After duplicate removal: ", train_data.shape, test_data.shape, valid_data.shape)

Before duplicate removal:  (60000, 3) (15252, 3) (15394, 3)
After duplicate removal:  (59907, 3) (15185, 3) (15333, 3)


In [101]:
# Preview ten randomly chosen training rows; the fixed seed keeps the
# displayed sample the same on every run.
train_data.sample(random_state=1010, n=10)

Unnamed: 0,id,annotations,tweets
2650,742489752209920004,"[186, 1392]",@zaralarsson When The ticket sale start i hope...
3241,744734671477473281,[1446],"""Friends can break your heart too"" if that ain..."
4264,747884286783553537,[1392],I'm so grateful for my girl 😍
2000,743079873007243264,[1138],I deserve better and ins get it.. This shit ai...
1659,743250169236357120,"[763, 1381]","RT @juahoe: @rainaelise__ @_avb7 im sorry 😂 ""w..."
2938,744834083419652096,[1620],@jizenaaaaa well today 🤔
2012,744824980626604032,[1138],RT @DJ837: Proud of the DMV music scene right ...
305,744214758472167424,[1447],@marissa_greenee I think everyone knows that p...
317,741586067145515008,[1403],@Olly_Medd Happy Birthday kiddo 😘
3834,744168852976709633,[1403],@_hunter_noblitt thank you!!! Love you too 😘
