In [2]:
import torch 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import json 
import csv 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk 
from sklearn.model_selection import GridSearchCV, cross_val_score
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

nltk.download('punkt')
nltk.download('stopwords')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /home/machine73/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/machine73/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
test_data = pd.read_csv('task_0_test.csv')
X_test, y_test = test_data["text"], test_data["emotion"]

train_data = pd.read_csv('task_0_train.csv')
X_train, y_train = train_data["text"], train_data["emotion"]

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Lowercasing and removing punctuation
    tokens = [word.lower() for word in tokens if word.isalpha()]

    # Removing stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Convert back to string
    return ' '.join(tokens)

X_test = [preprocess_text(utterance) for utterance in X_test]
X_train = [preprocess_text(utterance) for utterance in X_train]

all_data, X_train_list = list(X_test), list(X_train)
all_labels, X_train_labels = list(y_test), list(y_train)

all_data.extend(X_train_list)
all_labels.extend(X_train_labels)

vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)
all_data_counts = vectorizer.fit_transform(all_data)
    

#### Grid search for best solver and best penalty

In [5]:
param_grid = {
    'solver': [ 'liblinear','saga'],
    'max_iter': [1000],
    'penalty': ['l1', 'l2', 'elasticnet']
}

clf = LogisticRegression()
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train_counts, y_train)

# Print the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

# Evaluate on the test set using the best found parameters
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test_counts)

# Print classification report and accuracy
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Best parameters: {'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation score: 0.4843845778428501
              precision    recall  f1-score   support

       anger       0.26      0.09      0.13       374
     disgust       0.20      0.03      0.05       111
        fear       0.50      0.02      0.04       106
         joy       0.49      0.24      0.33       594
     neutral       0.50      0.88      0.64      1465
     sadness       0.42      0.16      0.23       276
    surprise       0.54      0.28      0.37       447

    accuracy                           0.49      3373
   macro avg       0.42      0.24      0.25      3373
weighted avg       0.46      0.49      0.42      3373

Accuracy: 0.48739994070560333


10 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/machine73/anaconda3/envs/MT_P/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/machine73/anaconda3/envs/MT_P/lib/python3.10/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/machine73/anaconda3/envs/MT_P/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/machine73/anaconda3/envs/M

#### Grid search for best reg param

In [9]:
param_grid = {
    'C': np.logspace(-4, 4, 20),
    'multi_class':['ovr', 'multinomial']
}

clf = LogisticRegression(penalty='l1',solver='liblinear',max_iter=1000)

grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train_counts, y_train)

# Print the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

# Evaluate on the test set using the best found parameters
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test_counts)

# Print classification report and accuracy
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Best parameters: {'C': 0.615848211066026, 'multi_class': 'ovr'}
Best cross-validation score: 0.48789834422502354
              precision    recall  f1-score   support

       anger       0.31      0.07      0.12       374
     disgust       0.20      0.02      0.03       111
        fear       0.50      0.01      0.02       106
         joy       0.51      0.23      0.31       594
     neutral       0.49      0.91      0.64      1465
     sadness       0.46      0.15      0.23       276
    surprise       0.55      0.26      0.35       447

    accuracy                           0.49      3373
   macro avg       0.43      0.24      0.24      3373
weighted avg       0.47      0.49      0.41      3373

Accuracy: 0.490364660539579


100 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "/home/machine73/anaconda3/envs/MT_P/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/machine73/anaconda3/envs/MT_P/lib/python3.10/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/machine73/anaconda3/envs/MT_P/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1212, in fit
    multi_class = _check_multi_class(self.multi_class, solver, len(self.classes_))
  File "/home/mach

####  Results with best hyperparams

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

clf = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    max_iter=1000,
    C=0.6,
    multi_class='ovr'
    )  # 'saga' is a good choice for large datasets
clf.fit(X_train_counts, y_train)

y_pred = clf.predict(X_test_counts)
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

scores = cross_val_score(clf, all_data_counts, all_labels, cv=5)
print(f"Cross-validated scores: {scores}")

              precision    recall  f1-score   support

       anger       0.31      0.07      0.12       374
     disgust       0.25      0.02      0.03       111
        fear       1.00      0.01      0.02       106
         joy       0.52      0.23      0.31       594
     neutral       0.49      0.91      0.64      1465
     sadness       0.45      0.15      0.22       276
    surprise       0.55      0.26      0.35       447

    accuracy                           0.49      3373
   macro avg       0.51      0.23      0.24      3373
weighted avg       0.49      0.49      0.41      3373

Accuracy: 0.490364660539579
Cross-validated scores: [0.49596182 0.49375918 0.49706314 0.49082232 0.48806463]


### LogisticRegression with saga solver 

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

clf = LogisticRegression(solver='saga', max_iter=1000)  # 'saga' is a good choice for large datasets
clf.fit(X_train_counts, y_train)

y_pred = clf.predict(X_test_counts)
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

scores = cross_val_score(clf, all_data_counts, all_labels, cv=5)
print(f"Cross-validated scores: {scores}")

              precision    recall  f1-score   support

       anger       0.26      0.14      0.19       374
     disgust       0.24      0.04      0.06       111
        fear       0.24      0.05      0.08       106
         joy       0.45      0.27      0.34       594
     neutral       0.53      0.81      0.64      1465
     sadness       0.29      0.17      0.22       276
    surprise       0.55      0.48      0.51       447

    accuracy                           0.49      3373
   macro avg       0.37      0.28      0.29      3373
weighted avg       0.45      0.49      0.45      3373

Accuracy: 0.4939223243403498
Cross-validated scores: [0.49926579 0.49889868 0.50110132 0.49743025 0.49614396]


### Multinomial base regressor 

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

y_pred = clf.predict(X_test_counts)
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

scores = cross_val_score(clf, all_data_counts, all_labels, cv=5)
print(f"Cross-validated scores: {scores}")

              precision    recall  f1-score   support

       anger       0.32      0.11      0.17       374
     disgust       0.40      0.02      0.03       111
        fear       0.00      0.00      0.00       106
         joy       0.50      0.23      0.31       594
     neutral       0.49      0.91      0.64      1465
     sadness       0.32      0.09      0.14       276
    surprise       0.59      0.26      0.36       447

    accuracy                           0.49      3373
   macro avg       0.38      0.23      0.24      3373
weighted avg       0.46      0.49      0.41      3373

Accuracy: 0.4879928846723985
Cross-validated scores: [0.46842878 0.48017621 0.47870778 0.47870778 0.47521116]
