In [6]:
import ast
import warnings

import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

warnings.filterwarnings("ignore")

In [9]:
# Parquet shard paths (relative to the dataset repository root), keyed by split name.
splits = {
    'train': 'simplified/train-00000-of-00001.parquet',
    'validation': 'simplified/validation-00000-of-00001.parquet',
    'test': 'simplified/test-00000-of-00001.parquet',
}
# Only the 'train' shard is read here; the other two entries are not loaded by this cell.
df = pd.read_parquet(f"hf://datasets/google-research-datasets/go_emotions/{splits['train']}")

In [10]:
# Preview the first rows of the loaded frame to check its columns and value formats.
df.head()

Unnamed: 0,text,labels,id
0,My favourite food is anything I didn't have to...,[27],eebbqej
1,"Now if he does off himself, everyone will thin...",[27],ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,[2],eezlygj
3,To make her feel threatened,[14],ed7ypvh
4,Dirty Southern Wankers,[3],ed0bdzj


In [11]:
def get_single_label(labels):
    """Return the first label of a (possibly multi-label) annotation.

    Parameters
    ----------
    labels : str or indexable sequence
        Either a sequence of label ids, or the string form of a Python
        literal list such as ``"[27]"``.

    Returns
    -------
    The first element of the label sequence; any further labels are dropped.

    Raises
    ------
    ValueError, SyntaxError
        If ``labels`` is a string that is not a valid Python literal.
    IndexError
        If the label sequence is empty.
    """
    # ast.literal_eval only accepts literals, so a string read from a dataset
    # can never execute arbitrary code the way eval() would.
    label_list = ast.literal_eval(labels) if isinstance(labels, str) else labels
    return label_list[0]

In [12]:
# Collapse each row's label list to a single class id (its first entry).
df['label'] = df['labels'].map(get_single_label)

In [14]:
# Features are the raw texts; targets are the single-label column.
X, y = df['text'].values, df['label'].values

In [15]:
# Learn the label -> integer code mapping, then re-express y with those codes.
# `le` is kept so the codes can be mapped back to the original labels later.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)


In [17]:
# Both vectorizers share the same vocabulary cap and English stop-word list,
# so the two feature sets differ only in how terms are weighted.
_vectorizer_params = {'max_features': 5000, 'stop_words': 'english'}
count_vectorizer = CountVectorizer(**_vectorizer_params)
tfidf_vectorizer = TfidfVectorizer(**_vectorizer_params)

In [18]:
# Each vectorizer is fitted on the training texts only and then applied to the
# test texts, so the vocabulary never sees held-out data.
X_train_count, X_test_count = (
    count_vectorizer.fit_transform(X_train),
    count_vectorizer.transform(X_test),
)
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

In [19]:
# Candidate models, keyed by the name reported in the results table.
classifiers = {
    # One-vs-rest logistic regression. LogisticRegression's `multi_class`
    # argument is deprecated since scikit-learn 1.5 (and the warning is hidden
    # by the global warnings filter), so the one-vs-rest scheme is expressed
    # with the OneVsRestClassifier wrapper instead.
    "LogisticRegression": OneVsRestClassifier(
        LogisticRegression(max_iter=1000, random_state=52)
    ),
    # Support vector classifier with a linear kernel.
    "SVC": SVC(kernel='linear', random_state=34),
}

In [20]:
# Collects one metrics dict per (vectorizer, classifier) combination.
results = []


In [21]:
# Start from an empty list on every execution so that re-running this cell
# does not append duplicate rows to `results`.
results = []

# (name, train matrix, test matrix) for each text representation to compare.
feature_sets = [
    ("CountVectorizer", X_train_count, X_test_count),
    ("TfidfVectorizer", X_train_tfidf, X_test_tfidf),
]

for vectorizer_name, X_train_vec, X_test_vec in feature_sets:
    for clf_name, clf in classifiers.items():
        print(f"\nTraining {clf_name} with {vectorizer_name}...")

        # The same estimator object is refit for every feature set; each
        # fit() call replaces whatever the previous fit learned.
        clf.fit(X_train_vec, y_train)
        y_pred = clf.predict(X_test_vec)

        # Precision/recall/F1 are averaged over classes weighted by support;
        # zero_division=0 scores undefined per-class values as 0.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        results.append({
            'Vectorizer': vectorizer_name,
            'Classifier': clf_name,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1': f1
        })




Training LogisticRegression with CountVectorizer...

Training SVC with CountVectorizer...

Training LogisticRegression with TfidfVectorizer...

Training SVC with TfidfVectorizer...


In [22]:
# One row per (vectorizer, classifier) run; columns follow the keys of the result dicts.
results_df = pd.DataFrame(results)
# Header and table are emitted on separate lines by a single print call.
print("\nResults:", results_df, sep="\n")



Results:
        Vectorizer          Classifier  Accuracy  Precision    Recall  \
0  CountVectorizer  LogisticRegression  0.511979   0.483529  0.511979   
1  CountVectorizer                 SVC  0.486178   0.448252  0.486178   
2  TfidfVectorizer  LogisticRegression  0.496775   0.509143  0.496775   
3  TfidfVectorizer                 SVC  0.518544   0.507778  0.518544   

         F1  
0  0.464345  
1  0.453055  
2  0.432323  
3  0.459024  


In [23]:
# Take the row with the highest weighted F1; on ties the earliest row wins.
best_combo = results_df.nlargest(1, 'F1').iloc[0]
best_f1 = best_combo['F1']
print(f"\nBest combination: {best_combo['Vectorizer']} with {best_combo['Classifier']}")
print(f"Best F1-score: {best_f1:.3f}")


Best combination: CountVectorizer with LogisticRegression
Best F1-score: 0.464
