In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
# For example, using 'sentence-transformers/all-MiniLM-L6-v2' for embeddings
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

In [45]:
test_db = pd.read_csv('archive/test.csv')
train_db = pd.read_csv('archive/training.csv')
validation_db = pd.read_csv('archive/validation.csv')

label_mapping = {
    'sadness':0,
    'joy':1,
    'love':2,
    'anger':3,
    'fear':4,
    'shocked':5
}

In [4]:
def tokenize_tweets(tweets):
    return tokenizer(tweets, padding=True, truncation=True, return_tensors="pt")

def get_embeddings(tokenized_tweets):
    with torch.no_grad():
        outputs = model(**tokenized_tweets)
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

In [5]:
train_text_list = list(train_db['text'])
test_text_list = list(test_db['text'])
y_train = train_db['label']
y_test = test_db['label']

In [6]:
tokens = tokenize_tweets(train_text_list)

In [7]:
tokens

{'input_ids': tensor([[  101,  1045,  2134,  ...,     0,     0,     0],
        [  101,  1045,  2064,  ...,     0,     0,     0],
        [  101, 10047,  9775,  ...,     0,     0,     0],
        ...,
        [  101,  1045,  2514,  ...,     0,     0,     0],
        [  101,  1045,  2514,  ...,     0,     0,     0],
        [  101,  1045,  2113,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [8]:
#example 
sentence =['my name is Rahul Jain','and my age is 40']
example_token = tokenize_tweets(sentence)
embedding = get_embeddings(example_token)
example_token

{'input_ids': tensor([[  101,  2026,  2171,  2003, 10958, 21886, 17136,   102],
        [  101,  1998,  2026,  2287,  2003,  2871,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0]])}

In [9]:
#embeddings = get_embeddings(tokens)
embeddings = torch.load('embeddings.pt')

In [10]:
torch.save(embeddings,'embeddings.pt')

In [11]:
type(embeddings)

torch.Tensor

In [12]:
print(embeddings.shape)
from torch import nn

torch.Size([16000, 384])


In [13]:
class EmotionClassifier(nn.Module):
    def __init__(self, input_dim, num_labels):
        super(EmotionClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, num_labels)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.relu(self.fc4(x))
        x = self.fc5(x)  # No ReLU after the last layer, assuming it's a classification task
        return x

In [14]:
def calculate_accuracy(logits, labels):
    # No need for softmax since we're interested in the index of the maximum logit
    # which corresponds to the predicted class
    _, predicted_classes = torch.max(logits, 1)
    correct_predictions = torch.eq(predicted_classes, labels).float()  # Convert to float to perform average
    accuracy = correct_predictions.sum() / len(labels)
    return accuracy.item()  # Returns the accuracy as a Python float


In [15]:
num_labels = len(label_mapping)

In [16]:
y_train.max()

5

In [17]:
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

# Convert to PyTorch datasets
train_dataset = TensorDataset(embeddings, torch.tensor(y_train, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Initialize the classifier
input_dim = embeddings.size(1)  # The size of the embeddings
classifier = EmotionClassifier(input_dim, num_labels)
# Define loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=0.001)

# Training loop
for epoch in range(20):  # Number of epochs
    total_accuracy=0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        y_logits= classifier(inputs)
        accuracy = calculate_accuracy(y_logits, labels)
        total_accuracy += accuracy
        loss = loss_fn(y_logits, labels)
        loss.backward()
        optimizer.step()
    average_accuracy = total_accuracy / len(train_loader)
    print(f'Epoch {epoch+1}, Loss: {loss.item()}, Accuracy: {average_accuracy}')


Epoch 1, Loss: 1.167739987373352, Accuracy: 0.54775
Epoch 2, Loss: 0.7808763980865479, Accuracy: 0.6363125
Epoch 3, Loss: 0.3064796030521393, Accuracy: 0.675875
Epoch 4, Loss: 0.4607236683368683, Accuracy: 0.7176875
Epoch 5, Loss: 0.46791404485702515, Accuracy: 0.752375
Epoch 6, Loss: 0.39606937766075134, Accuracy: 0.7968125
Epoch 7, Loss: 0.13932672142982483, Accuracy: 0.8241875
Epoch 8, Loss: 0.1260775625705719, Accuracy: 0.8530625
Epoch 9, Loss: 0.442171186208725, Accuracy: 0.8786875
Epoch 10, Loss: 0.1712743192911148, Accuracy: 0.8938125
Epoch 11, Loss: 0.3484903573989868, Accuracy: 0.904625
Epoch 12, Loss: 0.15029533207416534, Accuracy: 0.9221875
Epoch 13, Loss: 0.3567889630794525, Accuracy: 0.9303125
Epoch 14, Loss: 0.16044016182422638, Accuracy: 0.939
Epoch 15, Loss: 0.3745824098587036, Accuracy: 0.945625
Epoch 16, Loss: 0.09722066670656204, Accuracy: 0.9525
Epoch 17, Loss: 0.017740000039339066, Accuracy: 0.9553125
Epoch 18, Loss: 0.1835901290178299, Accuracy: 0.95525
Epoch 19, 

In [84]:
def predict(tweet):
    token = tokenize_tweets(tweet)
    embeddings = get_embeddings(token)
    logits = classifier(embeddings)
    _, predicted_class = torch.max(logits, 1)
    for key,val in label_mapping.items():
        if val==predicted_class.item():
            print("the predicted class is: ",key)

In [87]:
predict(['i wnat to die'])

the predicted class is:  sadness
