In [None]:

import nltk

# Make sure NLTK's "punkt" data package is available locally. The call
# returns True once the package is present (downloaded or already up to date),
# which is what the cell displays as its result.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import textblob
from textblob.download_corpora import download_all

# `download_corpora` is a submodule of textblob, not a callable attribute of
# the package: a bare `import textblob` does not load it, which is why
# `textblob.download_corpora()` raised AttributeError. Import the submodule's
# `download_all` helper explicitly and call that instead; it downloads the
# NLTK corpora TextBlob relies on.
download_all()

AttributeError: module 'textblob' has no attribute 'download_corpora'

In [None]:
# Run TextBlob's corpus downloader as a module in a shell subprocess.
# Per the recorded output it fetches brown, punkt_tab, wordnet,
# averaged_perceptron_tagger_eng, conll2000 and movie_reviews into
# /root/nltk_data.
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [None]:

# 📦 Install required packages
!pip install nrclex --quiet

# 📚 Imports
import pandas as pd
from nrclex import NRCLex
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import numpy as np
from tqdm import tqdm

# 📂 Load data. The paths point below /content/drive, i.e. they are read from
# a mounted Google Drive rather than from uploaded files.
# NOTE(review): the Drive mount is presumably done in an earlier cell — confirm.
sentences = pd.read_csv("/content/drive/MyDrive/preprocessed_data.csv")
labels = pd.read_csv("/content/drive/MyDrive/training-english/labels.tsv", sep="\t")

# 🔁 Merge on 'Text-ID' and 'Sentence-ID', drop rows without cleaned text, and
# renumber the remaining rows 0..n-1. Without the renumbering, `df` (and the
# label frame derived from it) keeps gaps in its index where rows were
# dropped, while the feature frame built from a plain list gets a fresh
# 0..n-1 index; any index-aligned operation between the two would then pair
# the wrong rows.
df = (
    pd.merge(sentences, labels, on=["Text-ID", "Sentence-ID"])
    .dropna(subset=["cleaned_text"])
    .reset_index(drop=True)
)

# 🧠 Emotion Categories (NRC-based); the order here fixes the feature column order
emotion_categories = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust', 'positive', 'negative']

# 🔍 Extract NRC Emotion Features
def extract_nrc_features(text):
    """Return the NRC affect scores of ``text`` as a fixed-order list.

    Builds an ``NRCLex`` object for ``text`` and reads its
    ``affect_frequencies`` mapping. The returned list has one entry per name
    in the notebook-level ``emotion_categories`` list, in that order; a
    category that is absent from the mapping contributes 0.
    """
    frequencies = NRCLex(text).affect_frequencies
    features = []
    for category in emotion_categories:
        features.append(frequencies.get(category, 0))
    return features
# 🧠 Score every sentence, showing a progress bar while doing so
tqdm.pandas()  # registers `progress_apply` on pandas objects
X_features = df["cleaned_text"].progress_apply(extract_nrc_features)
# One row per sentence, one column per emotion category.
X = pd.DataFrame(list(X_features), columns=emotion_categories)

# 🏷️ Labels: every column that is not an ID or text column, converted to 0/1
# indicators (1 where the value is at least 0.5, otherwise 0)
y = (
    df.drop(columns=["Text-ID", "Sentence-ID", "Text", "cleaned_text"])
    .ge(0.5)
    .astype(int)
)

# 🔀 Train/Test Split: 80% train / 20% test with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 🔁 Models: MultiOutputClassifier fits one independent binary classifier per
# label column.
#
# class_weight="balanced": in the recorded run the unweighted SVM scored
# exactly 0.0 macro F1 / precision / recall, i.e. it did not get a single
# positive label right on the test split. Balanced weights scale each class
# inversely to its frequency so the rare positive class is not ignored.
#
# dual=False: solves the primal problem, which scikit-learn recommends when
# n_samples > n_features (tens of thousands of rows vs. 10 features here); the
# primal solver is also non-random, so results are reproducible.
svm = MultiOutputClassifier(LinearSVC(class_weight="balanced", dual=False))
# NOTE(review): BernoulliNB thresholds every feature at its `binarize` value
# (0.0 by default per the scikit-learn docs), so each NRC frequency is reduced
# to "present / absent" — confirm this is intended.
nb = MultiOutputClassifier(BernoulliNB())

# 🏋️ Train
svm.fit(X_train, y_train)
nb.fit(X_train, y_train)

# 🔎 Predict
svm_preds = svm.predict(X_test)
nb_preds = nb.predict(X_test)

# 📊 Evaluation
def evaluate_model(y_true, y_pred, model_name):
    """Print accuracy plus macro-averaged F1, precision and recall.

    Args:
        y_true: ground-truth labels.
        y_pred: predictions to compare against ``y_true``.
        model_name: text used in the printed header line.

    Returns:
        None; the four scores are written to standard output.

    Note:
        When the inputs are multilabel indicator matrices (one column per
        label, as in the calls made in this notebook), scikit-learn's
        ``accuracy_score`` is subset accuracy: a row only counts as correct
        when every label matches. It can therefore be high while the macro
        scores are 0 if most rows have no positive label at all.
    """
    print(f"\n📍 {model_name} Evaluation")
    print("Accuracy:", accuracy_score(y_true, y_pred))

    # zero_division=0: a score whose denominator is zero is reported as 0.
    macro_metrics = (
        ("Macro F1-score:", f1_score),
        ("Macro Precision:", precision_score),
        ("Macro Recall:", recall_score),
    )
    for caption, metric in macro_metrics:
        print(caption, metric(y_true, y_pred, average="macro", zero_division=0))

# 🧾 Report both models on the held-out split
evaluate_model(
    y_true=y_test,
    y_pred=svm_preds,
    model_name="SVM (NRC Features)",
)
evaluate_model(
    y_true=y_test,
    y_pred=nb_preds,
    model_name="Naive Bayes (NRC Features)",
)

100%|██████████| 44652/44652 [00:07<00:00, 5711.71it/s]



📍 SVM (NRC Features) Evaluation
Accuracy: 0.48605979173664765
Macro F1-score: 0.0
Macro Precision: 0.0
Macro Recall: 0.0

📍 Naive Bayes (NRC Features) Evaluation
Accuracy: 0.46377785242414066
Macro F1-score: 0.004159592529711375
Macro Precision: 0.0034709924204859386
Macro Recall: 0.005189028910303929


## BERT EMBEDDINGS + NRC

In [None]:

# 📦 Install required packages
!pip install nrclex --quiet

# 📚 Imports
import pandas as pd
import numpy as np
from nrclex import NRCLex
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from tqdm import tqdm

# 📂 Load the sentence table, the label table and the saved BERT embeddings
sentences = pd.read_csv("/content/drive/MyDrive/preprocessed_data.csv")
labels = pd.read_csv("/content/drive/MyDrive/training-english/labels.tsv", sep="\t")
# Expected shape is (n_samples, 768); nothing here verifies it.
bert_embeddings = np.load("/content/drive/MyDrive/training-english/bert_embeddings.npy")

# 🔁 Join on the two ID columns, drop rows lacking cleaned text, renumber rows
df = (
    sentences
    .merge(labels, on=["Text-ID", "Sentence-ID"])
    .dropna(subset=["cleaned_text"])
    .reset_index(drop=True)
)

# ✅ NRC emotion categories, in the order used for the feature columns
emotion_categories = [
    'anger', 'anticipation', 'disgust', 'fear', 'joy',
    'sadness', 'surprise', 'trust', 'positive', 'negative',
]

def extract_nrc_features(text):
    """Look up the NRC affect scores for ``text``.

    An ``NRCLex`` object is created for ``text`` and its
    ``affect_frequencies`` mapping is queried once per entry of the
    notebook-level ``emotion_categories`` list. The scores are returned as a
    list in that same order, with 0 standing in for any category the mapping
    does not contain.
    """
    affect_scores = NRCLex(text).affect_frequencies
    return [affect_scores.get(category, 0) for category in emotion_categories]

# Score every sentence (with a progress bar) and stack the per-sentence
# score lists into a 2-D array: one row per sentence, one column per category.
tqdm.pandas()  # registers `progress_apply` on pandas objects
nrc_features = df["cleaned_text"].progress_apply(extract_nrc_features)
nrc_matrix = np.array(list(nrc_features))

# ✅ Combine NRC + BERT features
def _align_embeddings(embeddings, source_rows, kept_rows, key_columns):
    """Return the rows of ``embeddings`` that correspond to ``kept_rows``.

    Slicing the embeddings with ``[:len(kept_rows)]`` only removes rows from
    the END of the array. If the merge or the ``dropna`` removed a sentence
    from the middle of the table, every later embedding would be paired with
    the wrong sentence. This helper selects rows by key instead.

    Args:
        embeddings: 2-D array with one row per embedded sentence.
        source_rows: the sentence table as loaded from disk.
        kept_rows: the merged / filtered table the features are built from.
        key_columns: columns that identify a sentence in both tables.

    Returns:
        An array with exactly ``len(kept_rows)`` rows, in ``kept_rows`` order.

    Raises:
        ValueError: if the embeddings match neither table in length, if the
            keys of ``source_rows`` are not unique, or if a kept row cannot
            be found in ``source_rows``.
    """
    if len(embeddings) == len(source_rows):
        # Assumes the embeddings were saved one per row of the sentence file,
        # in file order — TODO confirm against the script that produced them.
        source_keys = pd.MultiIndex.from_frame(source_rows[key_columns])
        if not source_keys.is_unique:
            raise ValueError(
                f"{key_columns} do not uniquely identify the sentence rows; "
                "cannot align embeddings by key."
            )
        positions = source_keys.get_indexer(
            pd.MultiIndex.from_frame(kept_rows[key_columns])
        )
        if (positions < 0).any():
            raise ValueError("Some merged rows are missing from the sentence table.")
        return embeddings[positions]

    if len(embeddings) == len(kept_rows):
        # Assumes the embeddings were computed on the already merged and
        # filtered rows, in the same order — TODO confirm.
        return embeddings

    raise ValueError(
        f"Cannot align BERT embeddings: {len(embeddings)} embedding rows vs "
        f"{len(source_rows)} sentence rows and {len(kept_rows)} merged rows."
    )


bert_embeddings = _align_embeddings(
    bert_embeddings, sentences, df, ["Text-ID", "Sentence-ID"]
)
# BERT dimensions first, then the NRC columns.
combined_features = np.hstack((bert_embeddings, nrc_matrix))

# ✅ Labels: keep every column that is not an ID or text column, then turn
# the values into 0/1 indicators (1 where the value is at least 0.5)
y = df.drop(columns=["Text-ID", "Sentence-ID", "Text", "cleaned_text"])
y_binary = y.ge(0.5).astype(int)

# 🔀 Train-test split: 80% train / 20% test with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    combined_features,
    y_binary,
    test_size=0.2,
    random_state=42,
)

# 🔁 Models: MultiOutputClassifier fits one independent binary classifier per
# label column.
#
# class_weight="balanced": in the recorded run the unweighted SVM scored
# exactly 0.0 macro F1 / precision / recall, i.e. it did not get a single
# positive label right on the test split. Balanced weights scale each class
# inversely to its frequency so the rare positive class is not ignored.
#
# dual=False: solves the primal problem, which scikit-learn recommends when
# n_samples > n_features; the primal solver is also non-random, so results
# are reproducible.
svm = MultiOutputClassifier(LinearSVC(class_weight="balanced", dual=False))
# NOTE(review): BernoulliNB thresholds every feature at its `binarize` value
# (0.0 by default per the scikit-learn docs), so only the sign of each BERT
# dimension reaches the model. A Gaussian Naive Bayes may suit these
# continuous features better — confirm the intent.
nb = MultiOutputClassifier(BernoulliNB())

# 🏋️ Train
svm.fit(X_train, y_train)
nb.fit(X_train, y_train)

# 🔎 Predict
svm_preds = svm.predict(X_test)
nb_preds = nb.predict(X_test)

# 📊 Evaluation
def evaluate_model(y_true, y_pred, name):
    """Print accuracy plus macro-averaged F1, precision and recall.

    Args:
        y_true: ground-truth labels.
        y_pred: predictions to compare against ``y_true``.
        name: text used in the printed header line.

    Returns:
        None; the four scores are written to standard output.

    Note:
        When the inputs are multilabel indicator matrices (one column per
        label, as in the calls made in this notebook), scikit-learn's
        ``accuracy_score`` is subset accuracy: a row only counts as correct
        when every label matches.
    """
    print(f"\n📍 {name} Evaluation")
    print("Accuracy:", accuracy_score(y_true, y_pred))

    # zero_division=0: a score whose denominator is zero is reported as 0.
    macro_metrics = (
        ("Macro F1-score:", f1_score),
        ("Macro Precision:", precision_score),
        ("Macro Recall:", recall_score),
    )
    for caption, metric in macro_metrics:
        print(caption, metric(y_true, y_pred, average="macro", zero_division=0))

# Report both models on the held-out split.
evaluate_model(
    y_true=y_test,
    y_pred=svm_preds,
    name="SVM (BERT + NRC)",
)
evaluate_model(
    y_true=y_test,
    y_pred=nb_preds,
    name="Naive Bayes (BERT + NRC)",
)

100%|██████████| 44652/44652 [00:13<00:00, 3426.39it/s]



📍 SVM (BERT + NRC) Evaluation
Accuracy: 0.485052065838092
Macro F1-score: 0.0
Macro Precision: 0.0
Macro Recall: 0.0

📍 Naive Bayes (BERT + NRC) Evaluation
Accuracy: 0.10323591982980629
Macro F1-score: 0.050253609535145925
Macro Precision: 0.030605058252982403
Macro Recall: 0.20231624716387833
