In [1]:
import os
import json
import tomllib
import pandas
import numpy
import torch

from copy import deepcopy
from itertools import chain
from pandas import DataFrame, Series
from pprint import pprint
from tqdm import tqdm
from time import time, ctime
from joblib import Parallel, delayed
from model import BertRegressor
from transformers import AutoTokenizer

CONFIG = tomllib.load(open("config.toml", "rb"))
CONFIG_PREPROCESS = CONFIG["preprocess"]
CONFIG_MODEL = CONFIG["model"]

tracker: dict = deepcopy(CONFIG)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# pre-processors
def clean_authors(author_list: str) -> str:
    return author_list.replace("'", "").replace("[", "").replace("]", "")

def extract_year(date_str: str) -> str:
    return date_str.split("-")[0]

def date_cleaner(string: str) -> str:
    numbers =  ''.join(filter(str.isdigit, string))
    if len(numbers) == 4:
        return numbers
    else:
        return numpy.nan

def category_cleaner(category_string: str) -> str:
    return category_string.replace("[", "").replace("]", "").replace("'", "").lower()

def stringify_single(record: dict) -> str:
    string = ""
    for key, value in record.items():
        string += f"{key} - {value}; "
    return string

def stringify_df(df: DataFrame) -> list:
    records = df.to_dict("records")[:5]
    return [stringify_single(record) for record in records]


def preprocess_pipe(df: DataFrame, config: dict) -> list| list:
    #* Basic preprocessing
    df = df.drop(config["drop_columns"], axis=1)
    df["description"] = df["description"].fillna(config["missing_string"])
    df["authors"] = df["authors"].fillna(config["missing_string"])
    df["publishedDate"] = df["publishedDate"].fillna(config["missing_string"])
    df["authors"] = df["authors"].map(clean_authors)
    df["publishedDate"] = df["publishedDate"].map(extract_year).map(date_cleaner).dropna()
    df["categories"] = df["categories"].map(category_cleaner)

    #* model specific preprocessing
    x_df =  df.drop(columns = ["Impact"], axis=1)
    y: list = df["Impact"].copy().to_list()
    x: list = x_df.to_dict("records")
    string_x = stringify_df(x_df)
    return string_x, y

def parallel_preprocess(df: DataFrame, workers: int=CONFIG_PREPROCESS["workers"], chunks: int= CONFIG_PREPROCESS["chunks"], description: str = "processing", config: dict=CONFIG_PREPROCESS) -> (list, list):
    chunked_df: list = numpy.array_split(df, chunks)

    taskq =tqdm([delayed(preprocess_pipe)(chunk, config) for chunk in chunked_df], total=len(chunked_df), desc=description)
    with Parallel(n_jobs=workers, verbose=0) as parallel:
        chunk_xy = parallel(taskq) #[(x, y), (x, y)]

    x, y = [], []
    for chunk in chunk_xy:
        chunk_x, chunk_y = chunk
        x.extend(chunk_x)
        y.extend(chunk_y)
    return x, y

In [3]:
data = pandas.read_csv("books_task.csv")
tick = time()
x, y = parallel_preprocess(data)
tock = time()
tracker["time_to_preprocess"] = tock - tick

  return bound(*args, **kwds)
processing: 100%|██████████| 80/80 [00:00<00:00, 126.20it/s]


In [4]:
# quick inspection of the data
idx = 3
x[idx], y[idx]

('Title - Whispers of the Wicked Saints; description - Julia Thomas finds her life spinning out of control after the death of her husband, Richard. Julia turns to her minister for comfort when she finds herself falling for him with a passion that is forbidden by the church. Heath Sparks is a man of God who is busy taking care of his quadriplegic wife who was seriously injured in a sever car accident. In an innocent effort to reach out to a lonely member of his church, Heath finds himself as the man and not the minister as Heath and Julia surrender their bodies to each other and face the wrath of God. Julia finds herself in over her head as she faces a deadly disease, the loss of her home and whispers about her wicked affair. Julia leaves the states offering her body as a living sacrifice in hopes of finding a cure while her heart remains thousands of miles away hoping to one day reunite with the man who holds it hostage.Whispers of the Wicked Saints is a once in a lifetime romance that

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
x_inputs = tokenizer(x[:10], padding=True, truncation=True, return_tensors="pt", max_length=CONFIG_MODEL["max_len"], add_special_tokens=True).to(CONFIG_MODEL["device"])
y_inputs = torch.tensor(y[:10], dtype=torch.float32).to(CONFIG_MODEL["device"])

In [6]:
model = BertRegressor().to(CONFIG_MODEL["device"])
outs = model(ids=x_inputs["input_ids"], mask=x_inputs["attention_mask"])

In [7]:
outs

tensor([[[-0.0448],
         [-0.0602],
         [-0.0464],
         ...,
         [-0.0396],
         [-0.0376],
         [-0.0470]],

        [[-0.0379],
         [-0.0468],
         [-0.0321],
         ...,
         [-0.0327],
         [-0.0373],
         [-0.0214]],

        [[-0.0363],
         [-0.0599],
         [-0.0612],
         ...,
         [-0.0483],
         [-0.0386],
         [-0.0403]],

        ...,

        [[-0.0625],
         [-0.0759],
         [-0.0539],
         ...,
         [-0.0649],
         [-0.0607],
         [-0.0634]],

        [[-0.0558],
         [-0.0521],
         [-0.0492],
         ...,
         [-0.0472],
         [-0.0374],
         [-0.0464]],

        [[-0.0422],
         [-0.0645],
         [-0.0516],
         ...,
         [-0.0374],
         [-0.0449],
         [-0.0320]]], device='cuda:0', grad_fn=<ViewBackward0>)

In [8]:
# logging experiment
current_time = ctime()
with open(f"experiments/{current_time}.json", "w") as f:
    json.dump(tracker, f, indent=4)