In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


In [2]:
import torch
from datasets import load_dataset

In [3]:
import pandas as pd
import numpy as np
df = pd.read_csv("book_clean.csv")

In [4]:
category_mapping = {'Fiction' : "Fiction",
 'Juvenile Fiction': "Fiction",
 'Biography & Autobiography': "Nonfiction",
 'History': "Nonfiction",
 'Literary Criticism': "Nonfiction",
 'Philosophy': "Nonfiction",
 'Religion': "Nonfiction",
 'Comics & Graphic Novels': "Fiction",
 'Drama': "Fiction",
 'Juvenile Nonfiction': "Nonfiction",
 'Science': "Nonfiction",
 'Poetry': "Fiction"}

df["simple_categories"] = df["categories"].map(category_mapping)

In [5]:
df = df[df["simple_categories"].isin(["Fiction", "Nonfiction"])].dropna(subset=["description"])

In [6]:
le = LabelEncoder()
df["label"] = le.fit_transform(df["simple_categories"])

In [7]:
train_texts, val_texts, train_labels, val_labels = train_test_split(df["description"], df["label"], test_size=0.2, random_state=42)


In [8]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [9]:
def tokenize(texts):
    return tokenizer(texts.tolist(), padding="max_length", truncation=True, max_length=128, return_tensors="pt")

train_encodings = tokenize(train_texts)
val_encodings = tokenize(val_texts)


In [19]:
from torch.utils.data import Dataset,Subset
import random

class BookDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

num_train_samples = 748
num_val_samples = 186

train_dataset_full = BookDataset(train_encodings, train_labels.tolist())
val_dataset_full = BookDataset(val_encodings, val_labels.tolist())

# Random indices
train_indices = random.sample(range(len(train_dataset_full)), num_train_samples)
val_indices = random.sample(range(len(val_dataset_full)), num_val_samples)

# Subset datasets
train_dataset = Subset(train_dataset_full, train_indices)
val_dataset = Subset(val_dataset_full, val_indices)

In [20]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(le.classes_))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,                        
    per_device_train_batch_size=2,             
    per_device_eval_batch_size=2,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)


In [27]:
def compute_metrics(p):
    logits, labels = p
    preds = torch.argmax(torch.tensor(logits), axis=1).numpy()
    return {"accuracy": accuracy_score(labels, preds)}
    

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    
)

  trainer = Trainer(


In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7585,0.483407,0.913978


TrainOutput(global_step=374, training_loss=0.3119231922250679, metrics={'train_runtime': 1635.6322, 'train_samples_per_second': 0.457, 'train_steps_per_second': 0.229, 'total_flos': 49201767352320.0, 'train_loss': 0.3119231922250679, 'epoch': 1.0})

In [30]:
# Save the trained model and tokenizer
model.save_pretrained("bert-book-classifier")
tokenizer.save_pretrained("bert-book-classifier")


('bert-book-classifier\\tokenizer_config.json',
 'bert-book-classifier\\special_tokens_map.json',
 'bert-book-classifier\\vocab.txt',
 'bert-book-classifier\\added_tokens.json')

In [32]:
test_results = trainer.predict(val_dataset)

test_preds = test_results.predictions
test_labels = test_results.label_ids

pred_labels = np.argmax(test_preds, axis=1)

from sklearn.metrics import accuracy_score, classification_report

print("Test Accuracy:", accuracy_score(test_labels, pred_labels))
print("Classification Report:\n", classification_report(test_labels, pred_labels))


  arr = np.array(obj)
  arr = np.array(obj)


Test Accuracy: 0.9139784946236559
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.98      0.94       135
           1       0.93      0.75      0.83        51

    accuracy                           0.91       186
   macro avg       0.92      0.86      0.88       186
weighted avg       0.91      0.91      0.91       186



Predicted category: Fiction
