In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments




In [2]:
import tensorflow as tf

In [3]:
labels = tf.constant([0, 1, 2])  # Example labels
logits = tf.random.normal([3, 3])  # Example logits

loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(labels = labels, logits = logits)

In [4]:
import pandas as pd

In [5]:
data = pd.read_csv('Datasets/Book recommendation chatbot/Train.csv')

  data = pd.read_csv('Datasets/Book recommendation chatbot/Train.csv')


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031161 entries, 0 to 1031160
Data columns (total 9 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   User-ID              1031161 non-null  int64  
 1   ISBN                 1031161 non-null  int64  
 2   Book-Rating          1031161 non-null  float64
 3   Book-Title           1031161 non-null  object 
 4   Book-Author          1031161 non-null  object 
 5   Year-Of-Publication  1031161 non-null  object 
 6   Publisher            1031161 non-null  object 
 7   Location             1031161 non-null  object 
 8   age                  1031161 non-null  float64
dtypes: float64(2), int64(2), object(5)
memory usage: 70.8+ MB


In [7]:
data.drop('Location', axis = 1, inplace = True) 

In [8]:
def safe_convert_year(df, column):
    df[column] = pd.to_numeric(df[column], errors='coerce')
    df[column].fillna(0, inplace=True)
    df[column] = df[column].astype(int)

safe_convert_year(data, 'Year-Of-Publication')


In [9]:
data.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,age
0,1,60,0.0,Dangerous Interloper (Harlequin Collector's Ed...,Penny Jordan,1997,Harlequin,32.0
1,37,60,5.0,Dangerous Interloper (Harlequin Collector's Ed...,Penny Jordan,1997,Harlequin,34.0
2,37,23,9.0,Loving God,Charles Colson,1995,Billy Graham Evangelistic Association,34.0
3,37,55,8.0,The 85 Ways to Tie a Tie: The Science and Aest...,Thomas Fink,2000,Broadway Books,34.0
4,37,130,9.0,The Sacred Journey : A Memoir of Early Days,Frederick Buechner,1991,HarperSanFrancisco,34.0


In [10]:
data = pd.read_csv('Datasets/Book recommendation chatbot/Train.csv', low_memory = False)

In [11]:
def normalize_rating(rating):
    normalized = int(rating // 2)
    return min(max(normalized, 0), 4)  # Ensure normalized rating is within [0, 4]

In [12]:
data['Book-Rating'] = data['Book-Rating'].apply(normalize_rating)

In [13]:
assert data['Book-Rating'].between(0, 4).all(), "All labels should be in the range [0, 4]"

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train_data, val_data = train_test_split(data, test_size = 0.2, random_state = 42)

In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [17]:
def tokenize_data(data):
    return tokenizer(
        data['Book-Title'].tolist(),
        padding = True,
        truncation = True,
        return_tensors = "pt"
    )

In [18]:
train_encodings = tokenize_data(train_data)
val_encodings = tokenize_data(val_data)

Fine-Tuning Model



In [19]:
import torch
from torch.utils.data import Dataset, DataLoader

In [20]:
class CustomBookDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype = torch.long)
        return item

    def __len__(self):
        return len(self.labels)


In [21]:
train_labels = train_data['Book-Rating'].tolist()
val_labels = val_data['Book-Rating'].tolist()

In [22]:
train_dataset = CustomBookDataset(train_encodings, train_labels)
val_dataset = CustomBookDataset(val_encodings, val_labels)

In [23]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 5,
    ignore_mismatched_sizes = True
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([1, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
training_args = TrainingArguments(
    output_dir = './Results_5.1',
    num_train_epochs = 3,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    warmup_steps = 500,
    weight_decay = 0.01,
    logging_dir = './Logs_5.1',
    logging_steps = 10,
    eval_strategy="steps",
    eval_steps = 50,
    save_steps = 100
)

In [25]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset
)

In [None]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mpalak-coder[0m ([33mpalak-coder14[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


In [None]:
output_dir = './trained_model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)