In [1]:
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2Model
from fastai.text.all import *
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Load your data
df = pd.read_csv("/kaggle/input/poem-classification-dataset/data.csv")

topic_to_id = {
    "religion": 0,
    "love": 1,
    "nature": 2,
    "relationships": 3,
    "arts&sciences": 4
}

# Apply the mapping to create the topic_id column
df['topic_id'] = df['topic'].map(topic_to_id)

In [3]:
num_classes = len(df['topic_id'].unique())
num_classes

5

In [4]:
# Split data into training and validation sets
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)

# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2Model.from_pretrained('gpt2')

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [5]:
# Tokenize input data
def tokenize_data(df):
    return tokenizer(df['poem'].tolist(), padding=True, truncation=True, return_tensors='pt')

train_tokens = tokenize_data(train_df)
valid_tokens = tokenize_data(valid_df)

# Define datasets and dataloaders
train_ds = TensorDataset(train_tokens['input_ids'], train_tokens['attention_mask'], torch.tensor(train_df['topic_id'].tolist()))
valid_ds = TensorDataset(valid_tokens['input_ids'], valid_tokens['attention_mask'], torch.tensor(valid_df['topic_id'].tolist()))


In [6]:
bs = 4
train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=bs * 2)

In [7]:
print("Length of training dataset:", len(train_ds))
print("Length of validation dataset:", len(valid_ds))

Length of training dataset: 8051
Length of validation dataset: 2013


In [8]:
# Define the model architecture
class GPT2Classifier(nn.Module):
    def __init__(self, gpt2_model, num_classes):
        super().__init__()
        self.gpt2_model = gpt2_model
        self.classifier = nn.Linear(gpt2_model.config.n_embd, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt2_model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        cls_token = last_hidden_state[:, 0, :]  # Extract the CLS token
        logits = self.classifier(cls_token)
        return logits

In [9]:
# Initialize the classifier model
classifier_model = GPT2Classifier(model, num_classes)

# Define learner
learn = Learner(dls=DataLoaders(train_dl, valid_dl), model=classifier_model, loss_func=nn.CrossEntropyLoss(), metrics=[accuracy], cbs=GradientAccumulation(4))

# Apply gradient clipping to avoid memory problems
learn = learn.add_cb(GradientClip(max_norm=1.0))

In [10]:
# Train the model
learn.fit_one_cycle(3, 1e-4)

epoch,train_loss,valid_loss,accuracy,time
0,1.595003,1.608666,0.255837,21:29
1,1.593406,1.585689,0.262792,21:28
2,1.557283,1.586029,0.276701,21:28


In [11]:
learn.validate(dl=valid_dl)

(#2) [1.5860291719436646,0.2767014503479004]

In [20]:
# Tokenize new data
new_poems = ["A beautiful sunset painted the sky with hues of orange and pink.",
             "In love's embrace, hearts entwine, Two souls, one rhythm, divine. A whispered vow, a tender kiss, In love's eternal, timeless bliss.",
             "In sacred halls, voices rise, Prayers lifted to the skies. Faith's beacon, guiding light, In hearts devout, forever bright.",
             "In tender touch, two hearts align, Bound by love, a bond divine. Through highs and lows, hand in hand, Together strong, in love's grand stand.",
            "In the dance of atoms, secrets unfurl, Science and art, in a cosmic swirl. Brushstrokes of knowledge, colors of thought, Creation's canvas, where wonders are wrought."]

new_tokens = tokenizer(new_poems, padding=True, truncation=True, return_tensors='pt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Make predictions
with torch.no_grad():
    logits = classifier_model(input_ids=new_tokens['input_ids'].to(device), 
                              attention_mask=new_tokens['attention_mask'].to(device))
    predictions = torch.argmax(logits, dim=1)

# Map predictions back to topic labels
id_to_topic = {v: k for k, v in topic_to_id.items()}
predicted_topics = [id_to_topic[p.item()] for p in predictions]

print("==== Outputs ====")
for poem, topic in zip(new_poems, predicted_topics):
    print(f"Poem: {poem}")
    print(f"Predicted Topic: {topic}")
    print()

==== Outputs ====
Poem: A beautiful sunset painted the sky with hues of orange and pink.
Predicted Topic: nature

Poem: In love's embrace, hearts entwine, Two souls, one rhythm, divine. A whispered vow, a tender kiss, In love's eternal, timeless bliss.
Predicted Topic: nature

Poem: In sacred halls, voices rise, Prayers lifted to the skies. Faith's beacon, guiding light, In hearts devout, forever bright.
Predicted Topic: nature

Poem: In tender touch, two hearts align, Bound by love, a bond divine. Through highs and lows, hand in hand, Together strong, in love's grand stand.
Predicted Topic: nature

Poem: In the dance of atoms, secrets unfurl, Science and art, in a cosmic swirl. Brushstrokes of knowledge, colors of thought, Creation's canvas, where wonders are wrought.
Predicted Topic: nature

