In [1]:

import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, accuracy_score

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Load dataset
df = pd.read_csv('dataset.csv')

# Check the first few rows and column names
# print(df.head())
# print(df.columns)


[nltk_data] Downloading package punkt to C:\Users\Nabeel
[nltk_data]     Ali\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to C:\Users\Nabeel
[nltk_data]     Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Nabeel
[nltk_data]     Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:



# import nltk
# import re
# from nltk.corpus import stopwords
# from nltk.stem.porter import PorterStemmer
# from nltk.stem import WordNetLemmatizer
# nltk.download('punkt_tab')
# paragraph = """This involved swimming a pretty large lake that was over my head. 
# It was one of my most shameful experiences. 
# After all, I had vegetables coming out my ears all for the benefit of the young prince."""

# ps = PorterStemmer()
# wordnet = WordNetLemmatizer()
# sentences = nltk.sent_tokenize(paragraph)
# corpus = []

# for i in range(len(sentences)):
#     review = re.sub('[^a-zA-Z]',' ' ,sentences[i])
#     review = review.lower()
#     review = review.split()
#     review = [wordnet.lemmatize(word) for word in review if not word in set (stopwords.words('english'))]
#     review = ' '.join(review)
#     corpus.append(review)

# print(corpus)
# # ['involv swim pretti larg lake head', 'one shame experi', 'veget come ear benefit young princ']

# # from sklearn.feature_extraction.text import CountVectorizer
# # cv = CountVectorizer()
# # x = cv.fit_transform(corpus).toarray()

# # print(x)

# from sklearn.feature_extraction.text import TfidfVectorizer
# cv1 = TfidfVectorizer()
# y = cv1.fit_transform(corpus).toarray()


# print(y)





# Extract text and emotion columns
texts = df['text']  # assuming your text column is named 'text'
emotions = df[['anger', 'fear', 'joy', 'sadness', 'surprise']]  # multi-label target

# Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
corpus = []

for sentence in texts:
    review = re.sub('[^a-zA-Z]', ' ', str(sentence))
    review = review.lower().split()
    review = [lemmatizer.lemmatize(word) for word in review if word not in stop_words]
    corpus.append(' '.join(review))

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus).toarray()
y = emotions.values  # multi-label targets

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Multinomial Naive Bayes with multi-label wrapper
nb = MultinomialNB()
multi_target_nb = MultiOutputClassifier(nb)
multi_target_nb.fit(X_train, y_train)

# print(X_train)

# Predict
y_pred = multi_target_nb.predict(X_test)


# Evaluate
print("Accuracy (per label):")
for i, emotion in enumerate(emotions.columns):
    print(f"{emotion}: {accuracy_score(y_test[:, i], y_pred[:, i]):.4f}")

print("\nClassification Report (overall):")
print(classification_report(y_test, y_pred, target_names=emotions.columns))






Accuracy (per label):
anger: 0.8700
fear: 0.6462
joy: 0.7960
sadness: 0.7058
surprise: 0.6895

Classification Report (overall):
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        72
        fear       0.64      0.92      0.76       330
         joy       0.75      0.03      0.05       115
     sadness       0.61      0.07      0.12       167
    surprise       0.82      0.05      0.09       179

   micro avg       0.65      0.38      0.48       863
   macro avg       0.56      0.21      0.20       863
weighted avg       0.63      0.38      0.34       863
 samples avg       0.55      0.35      0.41       863



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
new_text = "shameful experience"

# Preprocess
review = re.sub('[^a-zA-Z]', ' ', new_text)
review = review.lower().split()
review = [lemmatizer.lemmatize(word) for word in review if word not in stop_words]
processed_text = ' '.join(review)

# Vectorize using the same TF-IDF vectorizer
X_new = vectorizer.transform([processed_text]).toarray()

# Predict
predicted_labels = multi_target_nb.predict(X_new)[0]

# Print result
print("\nPredicted emotions:")
for emotion, value in zip(emotions.columns, predicted_labels):
    print(f"{emotion}: {'Yes' if value == 1 else 'No'}")







# import pandas as pd
# import math
# from collections import defaultdict, Counter

# class NaiveBayesClassifier:
#     def __init__(self):
#         self.class_priors = {}
#         self.word_counts = {}
#         self.class_word_totals = {}
#         self.vocabulary = set()
#         self.classes = set()

#     def tokenize(self, text):
#         return text.lower().split()

#     def train(self, csv_file):
#         df = pd.read_csv(csv_file)
#         class_counts = Counter()
#         word_counts = defaultdict(Counter)

#         for _, row in df.iterrows():
#             label = row['class']
#             tokens = self.tokenize(row['sentence'])
#             class_counts[label] += 1
#             word_counts[label].update(tokens)
#             self.vocabulary.update(tokens)
#             self.classes.add(label)

#         self.class_priors = {cls: count / sum(class_counts.values()) for cls, count in class_counts.items()}
#         self.word_counts = word_counts
#         self.class_word_totals = {cls: sum(word_counts[cls].values()) for cls in self.classes}
#         self.vocabulary.add('<UNK>')  # unknown word token

#     def predict(self, sentence):
#         tokens = self.tokenize(sentence)
#         log_probs = {}

#         for cls in self.classes:
#             log_prob = math.log(self.class_priors[cls])
#             total_words = self.class_word_totals[cls]
#             vocab_size = len(self.vocabulary)

#             for token in tokens:
#                 count = self.word_counts[cls].get(token, 0)
#                 prob = (count + 1) / (total_words + vocab_size)  # add-1 smoothing
#                 log_prob += math.log(prob)

#             log_probs[cls] = log_prob

#         return max(log_probs, key=log_probs.get)

# # Example usage:
# # Create and train classifier
# nb = NaiveBayesClassifier()
# nb.train('sample_sentences.csv')  # CSV should have columns: 'sentence', 'class'

# # Predict new sentence
# print(nb.predict("Dead"))



Predicted emotions:
anger: No
fear: Yes
joy: No
sadness: Yes
surprise: No


In [6]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertPreTrainedModel
from torch.optim import AdamW
from transformers import BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch.nn as nn
import numpy as np

# Load data
df = pd.read_csv('dataset.csv')
texts = df['text'].tolist()
labels = df[['anger', 'fear', 'joy', 'sadness', 'surprise']].values

In [7]:


# Train/test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.FloatTensor(self.labels[idx])
        return item

train_dataset = EmotionDataset(train_texts, train_labels, tokenizer)
test_dataset = EmotionDataset(test_texts, test_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Define BERT model for multi-label classification
class BertForMultiLabel(BertPreTrainedModel):
    def __init__(self, config, num_labels=5):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.sigmoid = nn.Sigmoid()
        self.init_weights()
    
    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = self.dropout(outputs.pooler_output)
        logits = self.classifier(pooled_output)
        probs = self.sigmoid(logits)
        loss = None
        if labels is not None:
            loss_fn = nn.BCELoss()
            loss = loss_fn(probs, labels)
        return {'loss': loss, 'logits': probs}

from transformers import BertConfig
config = BertConfig.from_pretrained('bert-base-uncased')
model = BertForMultiLabel.from_pretrained('bert-base-uncased', config=config)

# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
for epoch in range(3):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} Loss: {loss.item():.4f}")

# Evaluation
model.eval()
all_preds = []
all_true = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = outputs['logits'].cpu().numpy()
        preds = (probs >= 0.5).astype(int)
        all_preds.append(preds)
        all_true.append(labels)

y_pred = np.vstack(all_preds)
y_true = np.vstack(all_true)

print(classification_report(y_true, y_pred, target_names=['anger', 'fear', 'joy', 'sadness', 'surprise']))


Some weights of BertForMultiLabel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 Loss: 0.3369
Epoch 2 Loss: 0.2695
Epoch 3 Loss: 0.2345
              precision    recall  f1-score   support

       anger       0.83      0.28      0.42        72
        fear       0.81      0.80      0.81       330
         joy       0.62      0.73      0.67       115
     sadness       0.77      0.54      0.63       167
    surprise       0.80      0.70      0.75       179

   micro avg       0.77      0.68      0.72       863
   macro avg       0.77      0.61      0.65       863
weighted avg       0.78      0.68      0.71       863
 samples avg       0.66      0.63      0.62       863



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# New input
new_text = "Oh no I need to get to the meeting very quickly or else manager will shout at me"

# Tokenize
encoding = tokenizer(
    new_text,
    truncation=True,
    padding='max_length',
    max_length=128,
    return_tensors='pt'
)

# Move to device
input_ids = encoding['input_ids'].to(device)
attention_mask = encoding['attention_mask'].to(device)

# Model prediction
model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    probs = outputs['logits'].cpu().numpy()[0]
    preds = (probs >= 0.5).astype(int)

# Print predicted emotions
emotion_labels = ['anger', 'fear', 'joy', 'sadness', 'surprise']
predicted_emotions = [emotion_labels[i] for i, val in enumerate(preds) if val == 1]

print("Predicted emotions:", predicted_emotions)


Predicted emotions: ['fear']
