In [1]:
import os
import json
import tomllib
import pandas
import numpy
import torch

from copy import deepcopy
from itertools import chain
from pandas import DataFrame, Series
from pprint import pprint
from tqdm import tqdm
from time import time, ctime
from joblib import Parallel, delayed
from book_impact.preprocess import parallel_preprocess


# Load the experiment configuration once. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
with open("config.toml", "rb") as config_file:
    CONFIG = tomllib.load(config_file)
CONFIG_PREPROCESS = CONFIG["preprocess"]
CONFIG_MODEL = CONFIG["model"]

# Per-run experiment record. It starts as a deep copy of the configuration so that
# adding results to it never mutates CONFIG itself.
tracker: dict = deepcopy(CONFIG)

In [2]:
# Read the raw table, run the project preprocessing on it, and record how long
# the preprocessing call itself takes (reading the CSV is not part of the timing).
data = pandas.read_csv("books_task.csv")

preprocess_started = time()
x, y = parallel_preprocess(data, CONFIG_PREPROCESS)
tracker["time_to_preprocess"] = time() - preprocess_started

# Show the sizes of both returned sequences side by side.
(len(x), len(y))

  return bound(*args, **kwds)
parallel processing: chunks-2; processes-2: 100%|██████████| 2/2 [00:00<00:00, 34.96it/s]


(138724, 138724)

In [3]:
# Spot-check a single preprocessed example: show the input together with its target.
idx = 3
(x[idx], y[idx])

('Title - Whispers of the Wicked Saints; description - Julia Thomas finds her life spinning out of control after the death of her husband, Richard. Julia turns to her minister for comfort when she finds herself falling for him with a passion that is forbidden by the church. Heath Sparks is a man of God who is busy taking care of his quadriplegic wife who was seriously injured in a sever car accident. In an innocent effort to reach out to a lonely member of his church, Heath finds himself as the man and not the minister as Heath and Julia surrender their bodies to each other and face the wrath of God. Julia finds herself in over her head as she faces a deadly disease, the loss of her home and whispers about her wicked affair. Julia leaves the states offering her body as a living sacrifice in hopes of finding a cure while her heart remains thousands of miles away hoping to one day reunite with the man who holds it hostage.Whispers of the Wicked Saints is a once in a lifetime romance that

### Data loader, tokenization, and model-specific preprocessing

In [4]:
import os
import datasets
from book_impact.model import BertRegressor
from transformers import AutoTokenizer
import torch.optim as optim
from torchmetrics import MeanAbsolutePercentageError

# Set TOKENIZERS_PARALLELISM to "false" for this kernel process.
# NOTE(review): presumably this avoids the tokenizers fork warning when the later
# `Dataset.map(..., num_proc=...)` call starts worker processes — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Wrap the preprocessed inputs and targets in a Hugging Face Dataset, then carve
# out a held-out split whose size comes from the preprocess configuration.
train_data = datasets.Dataset.from_dict({"x": x, "y": y})

# An optional `seed` entry in the preprocess config makes the shuffle behind the
# split reproducible across runs. When the key is absent, seed=None keeps the
# previous unseeded behaviour.
data_dict = train_data.train_test_split(
    test_size=CONFIG_PREPROCESS["test_split"],
    seed=CONFIG_PREPROCESS.get("seed"),
)
data_dict

DatasetDict({
    train: Dataset({
        features: ['x', 'y'],
        num_rows: 124851
    })
    test: Dataset({
        features: ['x', 'y'],
        num_rows: 13873
    })
})

In [6]:
# Load the pretrained tokenizer for the "distilbert-base-uncased" checkpoint.
# NOTE(review): presumably this matches the backbone used inside BertRegressor —
# confirm against book_impact.model.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_text(record, max_length=300):
    """
    Tokenize the text stored under the ``"x"`` key of a record (or batch of records).

    Each sequence is truncated and padded to exactly ``max_length`` tokens so
    that every example ends up with the same fixed length.

    Args:
        record: Mapping with an ``"x"`` entry holding the text(s) to encode. When
            called through ``Dataset.map(batched=True)`` this is a batch of rows.
        max_length (int): Fixed sequence length in tokens. Defaults to 300, the
            value that was previously hard-coded here.

    Returns:
        The result of calling the module-level ``tokenizer`` with these settings.
    """
    return tokenizer(
        record["x"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Tokenize both splits in worker processes and expose them in torch format,
# timing the whole step for the experiment record.
tick = time()
for split_name in ("train", "test"):
    tokenized_split = data_dict[split_name].map(
        tokenize_text,
        batched=True,
        num_proc=CONFIG_PREPROCESS["workers"],
    )
    data_dict[split_name] = tokenized_split.with_format("torch")
tracker["tokenization_time"] = time() - tick
data_dict

Map (num_proc=8): 100%|██████████| 124851/124851 [00:14<00:00, 8668.59 examples/s] 
Map (num_proc=8): 100%|██████████| 13873/13873 [00:07<00:00, 1746.03 examples/s]


DatasetDict({
    train: Dataset({
        features: ['x', 'y', 'input_ids', 'attention_mask'],
        num_rows: 124851
    })
    test: Dataset({
        features: ['x', 'y', 'input_ids', 'attention_mask'],
        num_rows: 13873
    })
})

In [7]:
model = BertRegressor()
# print() returns None, so assigning its result would log null for the architecture.
# Store the text representation explicitly and print that same text instead.
# The key's spelling is left unchanged so logs stay comparable with earlier runs.
tracker["model_architechture"] = str(model)
print(tracker["model_architechture"])

# loss function, Optimizer
def loss_fn(output, target):
    """
    Calculates Mean Absolute Percentage Error (MAPE) between model output and target values.

    Uses the stateless functional form of the metric, so no metric module has to
    be constructed and moved to a device on every call. The computation runs on
    whichever device the input tensors already live on.

    Args:
        output (torch.Tensor): Model predictions tensor; flattened to 1-D before comparison
        target (torch.Tensor): Ground truth target values tensor

    Returns:
        torch.Tensor: MAPE loss value for the given predictions and targets
    """
    # Function-scope import keeps this cell self-contained; torchmetrics is already
    # a dependency of this notebook.
    from torchmetrics.functional import mean_absolute_percentage_error

    return mean_absolute_percentage_error(output.reshape(-1), target)

# Adam over every parameter the model exposes; the learning rate is read from
# the "model" section of the loaded configuration.
optimizer = optim.Adam(model.parameters(), lr=CONFIG_MODEL["lr"])

BertRegressor(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [8]:
from book_impact.trainer import BertRegressorTraining

# Hand the tokenized splits, model, optimizer, loss and model config to the
# project trainer, then prepare its data loader(s) and start training.
trainer = BertRegressorTraining(
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    data_dict=data_dict,
    config=CONFIG_MODEL,
)
trainer.prepare_data_loader()
trainer.train_model()

Epoch 1/10 : 100%|██████████| 976/976 [01:56<00:00,  8.39it/s]


train_mape: 3430.125244140625 | val_mape: 0.35510003566741943


Epoch 2/10 : 100%|██████████| 976/976 [01:57<00:00,  8.30it/s]


train_mape: 3470.838623046875 | val_mape: 0.3550058901309967


Epoch 3/10 : 100%|██████████| 976/976 [01:55<00:00,  8.47it/s]


train_mape: 3470.058349609375 | val_mape: 0.35490643978118896


Epoch 4/10 : 100%|██████████| 976/976 [01:54<00:00,  8.52it/s]


train_mape: 3446.673828125 | val_mape: 0.35478267073631287


Epoch 5/10 : 100%|██████████| 976/976 [01:57<00:00,  8.34it/s]


train_mape: 3457.511474609375 | val_mape: 0.3548888564109802


Epoch 6/10 : 100%|██████████| 976/976 [01:56<00:00,  8.35it/s]


train_mape: 3440.36376953125 | val_mape: 0.35486936569213867


Epoch 7/10 : 100%|██████████| 976/976 [01:51<00:00,  8.72it/s]


train_mape: 3427.102294921875 | val_mape: 0.3547574579715729


Epoch 8/10 : 100%|██████████| 976/976 [01:49<00:00,  8.92it/s]


train_mape: 3445.083984375 | val_mape: 0.35478177666664124


Epoch 9/10 : 100%|██████████| 976/976 [01:49<00:00,  8.89it/s]


train_mape: 3460.54833984375 | val_mape: 0.35483747720718384


Epoch 10/10 : 100%|██████████| 976/976 [01:49<00:00,  8.90it/s]


train_mape: 3458.347900390625 | val_mape: 0.3547438979148865


In [9]:
# Fetch the two histories from the trainer and unwrap each entry with .item()
# before storing it in the tracker; the first is recorded as the training
# history and the second as the validation history.
group1, group2 = trainer.get_train_val_history()
tracker["train_history_MAPE"] = [entry.item() for entry in group1]
tracker["val_history_MAPE"] = [entry.item() for entry in group2]
tracker["remarks"] = ""

In [10]:
# logging experiment
# Write the tracker to a JSON file named after the current local time.
# A fresh checkout may not have the output directory yet, so create it first
# rather than failing with FileNotFoundError after a full training run.
os.makedirs("experiments", exist_ok=True)
# NOTE(review): ctime() output contains spaces and colons, which are not valid in
# file names on every platform. The naming is kept so new logs line up with
# existing ones.
current_time = ctime()
with open(f"experiments/{current_time}.json", "w") as f:
    json.dump(tracker, f, indent=4)