In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
import numpy as np
from tqdm.auto import tqdm

In [2]:
# Using twitter model to predict the popularity of the books
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset, random_split, WeightedRandomSampler
import torch.nn.functional as F
from torchmetrics import Accuracy
from torch import nn, optim
import lightning as pl
from sklearn.utils.class_weight import compute_class_weight
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint

In [3]:
batch_size = 32
lr = 1e-5
max_length = 512
model_name = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
# model_name = 'MoritzLaurer/deberta-v3-large-zeroshot-v2.0'
# model_name = 'dslim/bert-base-NER'
# model_name = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'

In [4]:
# Import the books.csv file
books = pd.read_csv('books.csv')
books.head()

Unnamed: 0,title,price,review/helpfulness,review/summary,review/text,description,authors,categories,popularity
0,We Band of Angels: The Untold Story of America...,10.88,2/3,A Great Book about women in WWII,I have alway been a fan of fiction books set i...,"In the fall of 1941, the Philippines was a gar...",'Elizabeth Norman','History',Unpopular
1,Prayer That Brings Revival: Interceding for Go...,9.35,0/0,Very helpful book for church prayer groups and...,Very helpful book to give you a better prayer ...,"In Prayer That Brings Revival, best-selling au...",'Yong-gi Cho','Religion',Unpopular
2,The Mystical Journey from Jesus to Christ,24.95,17/19,Universal Spiritual Awakening Guide With Some ...,The message of this book is to find yourself a...,THE MYSTICAL JOURNEY FROM JESUS TO CHRIST Disc...,'Muata Ashby',"'Body, Mind & Spirit'",Unpopular
3,Death Row,7.99,0/1,Ben Kincaid tries to stop an execution.,The hero of William Bernhardt's Ben Kincaid no...,"Upon receiving his execution date, one of the ...",'Lynden Harris','Social Science',Unpopular
4,Sound and Form in Modern Poetry: Second Editio...,32.5,18/20,good introduction to modern prosody,There's a lot in this book which the reader wi...,An updated and expanded version of a classic a...,"'Harvey Seymour Gross', 'Robert McDowell'",'Poetry',Unpopular


In [5]:
# Check the info of the books dataframe
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15719 entries, 0 to 15718
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               15719 non-null  object 
 1   price               15719 non-null  float64
 2   review/helpfulness  15719 non-null  object 
 3   review/summary      15718 non-null  object 
 4   review/text         15719 non-null  object 
 5   description         15719 non-null  object 
 6   authors             15719 non-null  object 
 7   categories          15719 non-null  object 
 8   popularity          15719 non-null  object 
dtypes: float64(1), object(8)
memory usage: 1.1+ MB


In [6]:
# Check null
books.isnull().sum()

title                 0
price                 0
review/helpfulness    0
review/summary        1
review/text           0
description           0
authors               0
categories            0
popularity            0
dtype: int64

In [7]:
# Drop null
books = books.dropna()

# Univariate analysis
# Check the distribution of the price
plt.hist(books['price'])
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.title('Distribution of Price')
plt.show()

# Profile report
profile = ProfileReport(books, title='Pandas Profiling Report', explorative=True)
profile

In [8]:
# Check duplicates
books.duplicated().sum()

3294

In [9]:
# Drop duplicates and reset index
books = books.drop_duplicates(ignore_index=True)

In [10]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12424 entries, 0 to 12423
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               12424 non-null  object 
 1   price               12424 non-null  float64
 2   review/helpfulness  12424 non-null  object 
 3   review/summary      12424 non-null  object 
 4   review/text         12424 non-null  object 
 5   description         12424 non-null  object 
 6   authors             12424 non-null  object 
 7   categories          12424 non-null  object 
 8   popularity          12424 non-null  object 
dtypes: float64(1), object(8)
memory usage: 873.7+ KB


In [11]:
# Preprocess the data further by incorporating price and review/helpfulness
books['review_helpfulness_ratio'] = books['review/helpfulness'].apply(lambda x: int(x.split('/')[0]) / (int(x.split('/')[1]) + 1))

# Keep the records with review/helpfulness ratio greater than 0.5 or total reviews are zero
books = books[books['review_helpfulness_ratio'] >= 0.5].reset_index(drop=True)
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6662 entries, 0 to 6661
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     6662 non-null   object 
 1   price                     6662 non-null   float64
 2   review/helpfulness        6662 non-null   object 
 3   review/summary            6662 non-null   object 
 4   review/text               6662 non-null   object 
 5   description               6662 non-null   object 
 6   authors                   6662 non-null   object 
 7   categories                6662 non-null   object 
 8   popularity                6662 non-null   object 
 9   review_helpfulness_ratio  6662 non-null   float64
dtypes: float64(2), object(8)
memory usage: 520.6+ KB


In [12]:
# We just need the texts columns and the popularity column
df = books[['title', 'review/text', 'description', 'authors', 'categories', 'popularity']]

In [13]:
# Map the popularity to 0 and 1
df['popularity'] = df['popularity'].map({'Unpopular': 0, 'Popular': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['popularity'] = df['popularity'].map({'Unpopular': 0, 'Popular': 1})


In [14]:
books['popularity'].value_counts()

popularity
Unpopular    3636
Popular      3026
Name: count, dtype: int64

# Define the lightning data module
class BookDataModule(pl.LightningDataModule):
    def __init__(self, df, max_length=512, batch_size=32, model_name=model_name):
        super().__init__()
        self.df = df
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_length = max_length
        self.batch_size = batch_size

    def setup(self, stage=None):
        train_size = int(0.8 * len(self.df))
        val_size = len(self.df) - train_size

        self.train_dataset, self.val_dataset = random_split(self.df, [train_size, val_size])

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=self.collate_fn)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, collate_fn=self.collate_fn)
    
    def collate_fn(self, batch):
        texts = [item['description'] for item in batch]
        labels = [item['popularity'] for item in batch]

        encoding = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=self.max_length)
        encoding['labels'] = torch.tensor(labels)

        return encoding

In [15]:
class BookDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # # We will concatenate the 'title' and 'review/summary' and 'description' columns
        # text = row['review/text']
        # Use special tokens to separate different parts of the text 'title', 'review/text', 'description', 'authors', 'categories'
        text = (f"{self.tokenizer.sep_token}title{self.tokenizer.sep_token} " + row['title'] + 
                f" {self.tokenizer.sep_token}review/text{self.tokenizer.sep_token} " + row['review/text'] + 
                f" {self.tokenizer.sep_token}description{self.tokenizer.sep_token} " + row['description'] + 
                f" {self.tokenizer.sep_token}authors{self.tokenizer.sep_token} " + row['authors'] + 
                f" {self.tokenizer.sep_token}categories{self.tokenizer.sep_token} " + row['categories'])
        # label = row.get('popularity', 0)  # Default to 0 if 'popularity' is missing
        label = row['popularity']
        
        encoding = self.tokenizer(text, 
                                  truncation=True, 
                                  max_length=self.max_length, 
                                  padding='max_length',
                                  return_tensors='pt')
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

class BookDataModule(pl.LightningDataModule):
    def __init__(self, df, max_length=512, batch_size=32, model_name=model_name):
        super().__init__()
        self.df = df
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_length = max_length
        self.batch_size = batch_size

    def setup(self, stage=None):
        dataset = BookDataset(self.df, self.tokenizer, self.max_length)
        train_size = int(0.8 * len(dataset))
        val_size = len(dataset) - train_size
        self.train_dataset, self.val_dataset = random_split(dataset, [train_size, val_size])

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=self.collate_fn)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, collate_fn=self.collate_fn)
    
    def collate_fn(self, batch):
        input_ids = torch.stack([item['input_ids'] for item in batch])
        attention_mask = torch.stack([item['attention_mask'] for item in batch])
        labels = torch.stack([item['labels'] for item in batch])
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

In [16]:
# Initialize the data module
data_module = BookDataModule(df, max_length=max_length, batch_size=batch_size, model_name=model_name)



class ImbalancedBookDataModule(pl.LightningDataModule):
    def __init__(self, df, max_length=512, batch_size=32, model_name=None):
        super().__init__()
        self.df = df
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_length = max_length
        self.batch_size = batch_size
        self.class_weights = None  # Initialize class_weights attribute

    def setup(self, stage=None):
        self.dataset = BookDataset(self.df, self.tokenizer, self.max_length)
        train_size = int(0.8 * len(self.dataset))
        val_size = len(self.dataset) - train_size
        self.train_dataset, self.val_dataset = torch.utils.data.random_split(self.dataset, [train_size, val_size])

        # Compute class weights
        labels = self.df['popularity'].values
        unique_labels = np.unique(labels)
        class_weights = compute_class_weight('balanced', classes=unique_labels, y=labels)
        self.class_weights = torch.tensor(class_weights, dtype=torch.float)

        # Create weighted sampler for training data
        train_labels = [self.df['popularity'].iloc[i] for i in self.train_dataset.indices]
        weights = [class_weights[unique_labels.tolist().index(label)] for label in train_labels]
        self.sampler = WeightedRandomSampler(weights, len(weights), replacement=True)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, sampler=self.sampler)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def get_class_weights(self):
        if self.class_weights is None:
            raise ValueError("setup() must be called before accessing class_weights")
        return self.class_weights

# Instantiate the data module, call setup() and check the class weights
data_module = ImbalancedBookDataModule(df, max_length=max_length, batch_size=batch_size, model_name=model_name)
data_module.setup()

In [17]:
# Define the lightning model
class BookModel(pl.LightningModule):
    def __init__(self, model_name=model_name, lr=2e-5):
        super().__init__()
        self.save_hyperparameters()
        self.model = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.model.config.hidden_size, 1)
        self.lr = lr
        self.accuracy = Accuracy(task='binary')
        self.loss = nn.BCEWithLogitsLoss()

        # We freeze the model weights and only train the classifier
        for param in self.model.parameters():
            param.requires_grad = False

        # Unfreeze the top 3 layer of the model
        for param in self.model.encoder.layer[-3:].parameters():
            param.requires_grad = True

        # # Fully connected layer after CLS token
        # self.fc = nn.Sequential(
        #     nn.Linear(self.model.config.hidden_size, 512),
        #     nn.ReLU(),
        #     nn.Dropout(0.2),
        #     nn.Linear(512, 1)
        # )

    # # def forward(self, input_ids, attention_mask):
    #     outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
    #     cls_output = outputs.last_hidden_state[:, 0, :]
    #     logits = self.fc(cls_output)

    #     return logits

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_output)

        return logits
    
    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        logits = self(input_ids, attention_mask).squeeze()
        loss = self.loss(logits, labels)
        acc = self.accuracy(logits, labels)

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        logits = self(input_ids, attention_mask).squeeze()
        loss = self.loss(logits, labels)
        acc = self.accuracy(logits, labels)

        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('val_acc', acc, on_step=True, on_epoch=True, prog_bar=True)
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer

In [18]:
# Initialize the model
model = BookModel(model_name=model_name, lr=lr)

class ImbalancedBookClassifier(pl.LightningModule):
    def __init__(self, model, class_weights):
        super().__init__()
        self.class_weights = class_weights
        self.save_hyperparameters()
        self.model = AutoModel.from_pretrained(model_name)
        self.lr = lr
        self.accuracy = Accuracy(task='binary')
        self.loss = nn.BCEWithLogitsLoss()

        # We freeze the model weights and only train the classifier
        for param in self.model.parameters():
            param.requires_grad = False

        # Fully connected layer after CLS token
        self.fc = nn.Sequential(
            nn.Linear(self.model.config.hidden_size, 1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 1)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        logits = self.fc(cls_output)
        return logits

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        logits = self(input_ids, attention_mask).squeeze()
        loss = self.loss(logits, labels)
        acc = self.accuracy(logits, labels)

        # # Use weighted loss
        # loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights.to(self.device))
        # loss = loss_fct(logits.view(-1, 2), labels.view(-1))
        # acc = self.accuracy(logits, labels)

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        logits = self(input_ids, attention_mask).squeeze()
        loss = self.loss(logits, labels)
        acc = self.accuracy(logits, labels)

        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('val_acc', acc, on_step=True, on_epoch=True, prog_bar=True)
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer

# Instantiate the model
model = ImbalancedBookClassifier(model_name, data_module.get_class_weights())

In [19]:
# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min', verbose=True)

In [20]:
# Model checkpoint callback
checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min',
                                      dirpath='checkpoints',
                                      filename='book_model-{epoch:02d}-{val_loss:.2f}',
                                      save_top_k=3)

In [21]:
# Define the trainer
trainer = pl.Trainer(max_epochs=100, devices=-1,
                     callbacks=[early_stopping, checkpoint_callback],
                     precision='16-mixed')

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [22]:
# Train the model
trainer.fit(model, data_module)

You are using a CUDA device ('NVIDIA GeForce RTX 3070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
f:\IT\qkenv39\lib\site-packages\lightning\pytorch\callbacks\model_checkpoint.py:654: Checkpoint directory F:\IT\DataScience\Datacamp_Competition\books\checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type              | Params | Mode 
---------------------------------------------------------
0 | model      | RobertaModel      | 124 M  | eval 
1 | classifier | Linear            | 769    | train
2 | accuracy   | BinaryAccuracy    | 0      | train
3 | loss       | BCEWithLogitsLoss | 0      | train
---------------------------------------------------------
21.3 M    Trainable params
10

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

f:\IT\qkenv39\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
f:\IT\qkenv39\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.468


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.024 >= min_delta = 0.0. New best score: 0.444


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.005 >= min_delta = 0.0. New best score: 0.439


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.009 >= min_delta = 0.0. New best score: 0.430


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_loss did not improve in the last 3 records. Best score: 0.430. Signaling Trainer to stop.


# Load epoch 15 checkpoint
model = BookModel.load_from_checkpoint('checkpoints/book_model-epoch=42-val_loss=0.48.ckpt')

# Validate the model
trainer.validate(model, data_module.val_dataloader())

In [23]:
# Load the best model
best_model = BookModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

In [24]:
# Validate the model
trainer.validate(best_model, data_module.val_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

[{'val_loss_epoch': 0.43004903197288513, 'val_acc_epoch': 0.7891973257064819}]

# Save the model
torch.save(model.state_dict(), 'book_model.pth')