##Reading files

In [2]:
import os
import numpy as np
import pandas as pd


dataset = pd.read_excel("PERC_mendelly.xlsx")



##Exploring data

In [3]:

df = pd.DataFrame(dataset, columns=["Poem", "Emotion"])

df = df.dropna()
df.head

<bound method NDFrame.head of                                                   Poem Emotion
0    A Tree\nA tree beside the sandy river-beach \n...     sad
1    Sri Krishna\n\nO immense Light and thou, O spi...    love
2    Who\n\n\nIn the blue of the sky, in the green ...   peace
3    Revelation\n\n\nSomeone leaping from the rocks...     sad
4    The Silver Call\n\n\nThere is a godhead of unr...     joy
..                                                 ...     ...
711  Daughter Taken By Mothers Lies\n\nHave you any...     sad
712  Involuntary Acceptance\n\nEven though\nWe’re f...     sad
713  Victim Of Poverty\n\nPoverty stricken youth ju...     sad
714  Rain\n\nI sit and watch\nas the rain falls \nf...     sad
715  The Face Of Sadness\n\nIts happened again.\n\n...     sad

[716 rows x 2 columns]>

In [4]:
print(df.Emotion.unique())

['sad' 'love' 'peace' 'joy' 'courage' 'surprise' 'hate' 'anger' 'fear']


##Converting the labels to integers

In [5]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df['Emotion'])
df['Emotion']=le.transform(df['Emotion'])

In [6]:
df

Unnamed: 0,Poem,Emotion
0,A Tree\nA tree beside the sandy river-beach \n...,7
1,"Sri Krishna\n\nO immense Light and thou, O spi...",5
2,"Who\n\n\nIn the blue of the sky, in the green ...",6
3,Revelation\n\n\nSomeone leaping from the rocks...,7
4,The Silver Call\n\n\nThere is a godhead of unr...,4
...,...,...
711,Daughter Taken By Mothers Lies\n\nHave you any...,7
712,Involuntary Acceptance\n\nEven though\nWe’re f...,7
713,Victim Of Poverty\n\nPoverty stricken youth ju...,7
714,Rain\n\nI sit and watch\nas the rain falls \nf...,7


In [7]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##Loading the pre-trained BERT model for fine tuning

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)


ModuleNotFoundError: No module named 'transformers'

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

In [None]:
# Define a custom dataset class for annotated poems
class PoemDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.texts = data['Poem'].tolist()
        self.labels = data['Emotion'].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
        return {'input_ids': encoding['input_ids'][0], 'attention_mask': encoding['attention_mask'][0], 'label': label}

In [None]:
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(df, test_size=0.2, random_state=72)


train_dataset = PoemDataset(train_data, tokenizer)
val_dataset = PoemDataset(val_data, tokenizer)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)

In [None]:
train_data

In [None]:
val_data

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

##Defining and Training the model and fine-tuning

In [None]:
!pip install accelerate -U
!pip install transformers[torch]
from transformers import Trainer, TrainingArguments

batch_size = 16
logging_steps = len(train_data) // batch_size
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=10,
                                  learning_rate=0.0001,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False)

In [None]:
from transformers import Trainer
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train();

In [None]:
results = trainer.evaluate()
results


In [None]:
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

##Testing the model on a random poem

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("/content/model")
model = AutoModelForSequenceClassification.from_pretrained("/content/model")

# Set the model to evaluation mode
model.eval()

# Read text from a file
with open('/content/LustPoemsiMInLustPoembyEffieYalenaSteyn.txt', 'r') as f:
    text = f.read()

# Tokenize the text and truncate or pad it to the desired length
inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')

# Convert tokenized input to tensors
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Pass the input tensors through the model and get the predicted output probabilities
outputs = model(input_ids, attention_mask=attention_mask)
probs = torch.softmax(outputs.logits, dim=-1)

print(outputs)
print(probs)
# Extract the highest probability class index
preds = torch.argmax(probs, dim=-1)
print(preds)
print(preds.item())




##Testing another sample

In [None]:
# Read text from a file
with open('/content/AlonePoemsIAmMuchTooAloneInThisWorldYetNotAlonePoembyRainerMariaRilke.txt', 'r') as f:
    text = f.read()

# Tokenize the text and truncate or pad it to the desired length
inputs1 = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')

# Convert tokenized input to tensors
input_ids1 = inputs1['input_ids']
attention_mask1 = inputs1['attention_mask']

# Pass the input tensors through the model and get the predicted output probabilities
outputs1 = model(input_ids1, attention_mask=attention_mask1)
probs1 = torch.softmax(outputs1.logits, dim=-1)

print(outputs1)
print(probs1)
# Extract the highest probability class index
preds1 = torch.argmax(probs1, dim=-1)
print(preds1)
print(preds1.item())