In [33]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import contractions
import langid
import spacy
import re
import emoji
import nltk
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup
from spacy.language import Language
from spacy_language_detection import LanguageDetector
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, hamming_loss,accuracy_score
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from transformers import BertTokenizer, BertModel
import torch
# Lift pandas' display truncation for both columns and rows.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [2]:
# Parquet shards of the "simplified" GoEmotions configuration, keyed by split name.
splits = {
    'train': 'simplified/train-00000-of-00001.parquet',
    'validation': 'simplified/validation-00000-of-00001.parquet',
    'test': 'simplified/test-00000-of-00001.parquet',
}
HF_GO_EMOTIONS_ROOT = "hf://datasets/google-research-datasets/go_emotions/"

# Read the three splits in the order train -> validation -> test.
train_df, valid_df, test_df = (
    pd.read_parquet(HF_GO_EMOTIONS_ROOT + splits[split_name])
    for split_name in ("train", "validation", "test")
)

In [3]:
# Drop, in place, every training row that has a missing value in any column.
train_df.dropna(axis=0, how='any', inplace=True)

In [4]:
# Label names; the position in this list is used as the integer label id.
emotions = [
    'admiration',
    'amusement',
    'anger',
    'annoyance',
    'approval',
    'caring',
    'confusion',
    'curiosity',
    'desire',
    'disappointment',
    'disapproval',
    'disgust',
    'embarrassment',
    'excitement',
    'fear',
    'gratitude',
    'grief',
    'joy',
    'love',
    'nervousness',
    'optimism',
    'pride',
    'realization',
    'relief',
    'remorse',
    'sadness',
    'surprise',
    'neutral',
]

In [5]:
# Map each integer label id to its emotion name (ids follow list order).
id2label = dict(enumerate(emotions))

In [6]:
# Add one 0/1 indicator column per emotion to every split: the column is 1
# when that emotion's id appears in the row's `labels` collection.
for label_id, label_name in id2label.items():
    for frame in (train_df, valid_df, test_df):
        frame[label_name] = frame.labels.apply(
            lambda row_labels: int(label_id in row_labels)
        )

In [7]:
def demojize_text(text):
    """Return ``text`` after passing it through ``emoji.demojize``."""
    demojized = emoji.demojize(text)
    return demojized

In [8]:
def clean_text(text):
    """Normalise a piece of text into lowercase, punctuation-free words.

    Steps, in order: lowercase, strip HTML, strip URLs, strip colon-delimited
    tokens (e.g. ``:red_heart:``) and square-bracketed spans, turn newlines
    into spaces, drop words containing digits, drop punctuation, and collapse
    whitespace.

    Args:
        text: Value to clean; it is converted with ``str()`` first, so
            non-string input is accepted.

    Returns:
        The cleaned string with single spaces between words and no
        leading/trailing whitespace.
    """
    # Ensure the input is a string
    text = str(text).lower()

    # Remove HTML content first so markup cannot interfere with later patterns
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'<.*?>+', '', text)

    # Remove URLs BEFORE the colon pattern below: otherwise the colon in
    # "http://" pairs with another colon and the URL is mangled instead of removed.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove colon-delimited tokens without whitespace (the shape of demojized
    # emoji). The token may not contain spaces, so ordinary text that happens
    # to sit between two colons ("note: I said: hi") is left alone.
    text = re.sub(r':[^\s:]+:', '', text)
    # Remove square-bracketed spans such as "[name]"
    text = re.sub(r'\[.*?\]', '', text)

    # Newlines become spaces so words on adjacent lines are not glued together
    text = re.sub(r'\n', ' ', text)
    # Remove words that contain digits
    text = re.sub(r'\w*\d\w*', '', text)

    # Remove everything except word characters and whitespace
    text = re.sub(r"[^\w\s]", "", text)

    # Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [9]:
# Stopword sets already loaded from NLTK, keyed by language name.
_STOPWORDS_CACHE = {}


def _stopword_set(language='english'):
    """Return the NLTK stopword list for ``language`` as a frozenset, loading it once."""
    if language not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_CACHE[language]


def remove_stopwords(text):
    """Remove English stopwords from ``text``.

    The text is split on whitespace, tokens found in NLTK's English stopword
    list are dropped, and the remaining tokens are joined with single spaces.
    Dropped stopwords leave no empty placeholder behind, so the result has no
    leading, trailing, or doubled spaces.

    Args:
        text: Whitespace-separated string.

    Returns:
        The string without stopword tokens; empty if every token was a
        stopword or ``text`` had no tokens.
    """
    # NOTE(review): matching is exact (case-sensitive), as before; capitalised
    # stopwords are presumably kept - confirm whether lowercasing is wanted.
    stop_set = _stopword_set()  # loaded once, O(1) membership checks
    return " ".join(word for word in text.split() if word not in stop_set)

In [10]:
def text_preprocessing(df):
    """Run the text pipeline over ``df['text']`` and return ``df``.

    The column is overwritten on the passed frame after each step:
    contraction expansion, emoji demojizing, then stopword removal.
    ``clean_text`` is not part of the active pipeline.
    """
    pipeline = (contractions.fix, demojize_text, remove_stopwords)
    for step in pipeline:
        df['text'] = df['text'].apply(step)
    return df

In [11]:
# Preprocess the text column of every split (train, then test, then validation).
train_df, test_df, valid_df = (
    text_preprocessing(frame) for frame in (train_df, test_df, valid_df)
)

In [12]:
# Alternative featurizer kept for reference: TfidfVectorizer(ngram_range=(3,3))
BERT_CHECKPOINT = 'bert-base-uncased'

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [13]:
def get_bert_embeddings(text):
    """Embed a single string with the global ``tokenizer`` and ``model``.

    The text is tokenized as a one-element batch (with truncation), run
    through the model with gradients disabled, and the hidden state at
    sequence position 0 ([CLS]) is returned as a NumPy array.
    """
    encoded = tokenizer([text], padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Position 0 of every sequence is the [CLS] token, used as sentence embedding.
    cls_vectors = hidden_states[:, 0, :]
    return cls_vectors[0].numpy()

In [14]:
# Store one embedding vector per row in an `embeddings` column, for each
# split in turn (train, test, validation).
for frame in (train_df, test_df, valid_df):
    frame['embeddings'] = frame['text'].apply(get_bert_embeddings)

In [21]:
# Stack the per-row embedding vectors into one 2-D array and display it.
train_embeddings = train_df['embeddings'].values
np.stack(train_embeddings)

array([[-0.23600633,  0.26128465, -0.00110828, ...,  0.04515841,
         0.3806575 ,  0.2508166 ],
       [-0.15286319,  0.20315409,  0.10535079, ..., -0.55064994,
         0.9888965 ,  0.0919748 ],
       [-0.00429128,  0.28572923,  0.04581843, ..., -0.17329574,
         0.1313872 ,  0.14194687],
       ...,
       [ 0.01718974, -0.2551513 , -0.27539182, ..., -0.37481186,
         0.03428522,  0.33105877],
       [-0.14047493, -0.30260164,  0.05103033, ..., -0.6276323 ,
         0.30474293,  0.30814284],
       [-0.09355879,  0.05430624,  0.2569471 , ..., -0.11302563,
        -0.23879355,  0.42790604]], dtype=float32)

In [22]:
# Display the 0/1 indicator matrix: one row per example, one column per emotion.
train_label_frame = train_df[emotions]
train_label_frame.values

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [23]:
def _features_and_targets(frame):
    """Return (stacked embeddings, emotion indicator matrix) for one split."""
    features = np.stack(frame['embeddings'].values)
    targets = frame[emotions].values
    return features, targets


X_train, y_train = _features_and_targets(train_df)
X_test, y_test = _features_and_targets(test_df)

In [34]:
# One XGBoost classifier is used as the per-label base estimator.
# Alternative base estimator: LogisticRegression(max_iter=5000,class_weight='balanced')
xgb_clf = XGBClassifier(random_state=42)
base_classifier = xgb_clf

# NOTE(review): this rebinds `model`, which earlier held the BertModel that
# get_bert_embeddings reads as a global; re-running the embedding cells after
# this cell will use the classifier instead. Consider a distinct name.
model = MultiOutputClassifier(estimator=base_classifier, n_jobs=-1)

In [35]:
# Train on the training split, then predict the test split in one chain
# (fit returns the fitted estimator).
y_pred = model.fit(X_train, y_train).predict(X_test)

# Macro-averaged F1 over the emotion labels.
f1 = f1_score(y_test, y_pred, average="macro")
print("Macro F1 Score:", f1)



Macro F1 Score: 0.14626138379190903


In [36]:
# For multilabel indicator input scikit-learn reports subset accuracy here:
# a row counts as correct only when every label matches.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.21319329279528285