In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the book table from the CSV export; every later cell reads from or adds columns to `books`.
books = pd.read_csv("../data/extended_books_google.csv")

In [16]:
# Show the column labels of the loaded table.
books.columns

Index(['ISBN', 'title', 'subtitle', 'authors', 'publisher', 'publishedDate',
       'description', 'pageCount', 'maturityRating', 'language', 'categories',
       'ratingsCount', 'averageRating', 'textSnippet', 'categories_list',
       'category_embeddings'],
      dtype='object')

In [5]:
# Books without category information get the placeholder label "unknown",
# so every row holds a string that can be split below
books['categories'] = books['categories'].fillna("unknown")


def _split_categories(raw):
    """Split a comma-separated category string into a list of trimmed names."""
    return [piece.strip() for piece in raw.split(",")]


# One list of category names per book
books['categories_list'] = books['categories'].map(_split_categories)
books['categories_list']

0                  [unknown]
1                  [Medical]
2                  [Fiction]
3        [Adventure stories]
4                  [Fiction]
                ...         
16594              [Fiction]
16595     [Juvenile Fiction]
16596          [Bookbinders]
16597              [Fiction]
16598              [Fiction]
Name: categories_list, Length: 16599, dtype: object

In [None]:
import gensim.downloader as api
# Pre-trained GloVe word vectors, requested by name through gensim's downloader module.
glove_embeddings = api.load("glove-wiki-gigaword-100")  # 100-dimensional vectors

def embed_categories(categories_list, embeddings=None):
    """Embed a book's category names as one averaged word-vector.

    Each category name is lowercased and split on whitespace. The vectors of
    the words found in the vocabulary are averaged to give the category
    vector, and the category vectors are averaged to give the book vector.

    Words missing from the vocabulary are skipped, so they neither pull the
    average towards zero nor give extra weight to a neighbouring word.
    Categories without any known word (including empty strings) are likewise
    left out of the book-level average.

    Parameters
    ----------
    categories_list : iterable of str
        Category names for a single book.
    embeddings : optional
        Word-vector lookup supporting ``word in embeddings``,
        ``embeddings[word]`` and a ``vector_size`` attribute. Defaults to the
        notebook-level ``glove_embeddings``.

    Returns
    -------
    numpy.ndarray
        Vector of length ``vector_size``; all zeros when no word of any
        category is in the vocabulary.
    """
    vectors = glove_embeddings if embeddings is None else embeddings
    # Take the dimensionality from the vectors instead of hard-coding 100
    dim = vectors.vector_size

    category_vectors = []
    for category in categories_list:
        known = [vectors[word] for word in category.lower().split() if word in vectors]
        if known:
            # Average only over the words that actually have an embedding
            category_vectors.append(np.mean(known, axis=0))

    if not category_vectors:
        return np.zeros(dim)
    # Average embeddings across all categories for a book
    return np.mean(category_vectors, axis=0)

# Compute one embedding vector per book from its list of category names
books['category_embeddings'] = books['categories_list'].apply(embed_categories)


In [None]:
# Preview the per-book category embedding column.
books['category_embeddings']

0        [-0.026542, -0.049169, 0.56758, 0.23964, 0.137...
1        [0.068163, 0.094016, -0.4367, 0.15944, -0.3249...
2        [-0.23013, 0.28106, 0.29434, -0.080361, 1.0595...
3        [-0.43716, 0.426305, 0.31327, -0.231497, 0.268...
4        [-0.23013, 0.28106, 0.29434, -0.080361, 1.0595...
                               ...                        
16594    [-0.23013, 0.28106, 0.29434, -0.080361, 1.0595...
16595    [0.369055, 0.1764065, 0.23267001, -0.2975305, ...
16596    [0.22282, -0.017365, -0.37452, 0.093967, 0.190...
16597    [-0.23013, 0.28106, 0.29434, -0.080361, 1.0595...
16598    [-0.23013, 0.28106, 0.29434, -0.080361, 1.0595...
Name: category_embeddings, Length: 16599, dtype: object

In [None]:
# Optional: save the category embeddings as a single stacked NumPy array (uncomment to run)
# np.save("../data/category_embeddings.npy", np.stack(books['category_embeddings']))

In [4]:
def combine_text(row):
    """Join a book's title, subtitle and description into one string.

    Missing values (``None``, ``NaN``, ``pd.NA``) and blank strings are
    skipped, so placeholder text such as ``'nan'`` or ``'None'`` never leaks
    into the combined text.

    Parameters
    ----------
    row : pandas.Series or dict
        Any mapping exposing ``.get``; the ``'title'``, ``'subtitle'`` and
        ``'description'`` entries are read when present.

    Returns
    -------
    str
        The non-empty fields, stripped and separated by single spaces; an
        empty string when none of them holds text.
    """
    parts = []
    for field in ('title', 'subtitle', 'description'):
        value = row.get(field, '')
        # str(NaN) is the truthy string 'nan', so missing cells must be
        # filtered out before casting to str
        if value is None or pd.isna(value):
            continue
        text = str(value).strip()
        if text:
            parts.append(text)
    return ' '.join(parts)

# Build the text to embed for every book: combine_text is called once per row (axis=1).
books['combined_text'] = books.apply(combine_text, axis=1)

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch

# Checkpoint name passed to both the tokenizer loader and the model loader,
# so the two are always loaded from the same pretrained checkpoint
model_name = 'BAAI/bge-small-en-v1.5'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # Ensure the model is in evaluation mode


tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [11]:
from tqdm import tqdm  # For progress bar

def get_batch_embeddings(batch_texts):
    """Embed a batch of texts with attention-masked mean pooling.

    Parameters
    ----------
    batch_texts : str or sequence of str
        Text(s) to encode together; passed unchanged to ``tokenizer``.

    Returns
    -------
    torch.Tensor
        One row per input text. Each row is the mean of the model's last
        hidden states over that text's real (non-padding) tokens.
    """
    # Pad only up to the longest text in the batch instead of always to 512
    # tokens; texts longer than 512 tokens are still truncated.
    inputs = tokenizer(
        batch_texts,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512
    )
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Mean pooling for dense embedding. Padding positions are zeroed out via
    # the attention mask so they do not dilute the average.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # clamp guards against division by zero should a row have no real tokens
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    embeddings = summed / token_counts
    return embeddings


In [12]:
from torch.utils.data import DataLoader
from tqdm import tqdm

# Number of texts sent through the model per forward pass
batch_size = 32  # Adjust based on available GPU/CPU memory

# Serve the combined texts in their original order, batch_size at a time
dataloader = DataLoader(books['combined_text'].tolist(), batch_size=batch_size, shuffle=False)

# Embed batch by batch, collecting the per-batch tensors
all_embeddings = []
for batch_texts in tqdm(dataloader, desc='Generating embeddings'):
    all_embeddings.append(get_batch_embeddings(batch_texts))

# Join the per-batch results into one tensor with a row per book
text_embeddings_tensor = torch.cat(all_embeddings)

Generating embeddings: 100%|██████████| 519/519 [26:36<00:00,  3.08s/it]


In [13]:
# Store the ISBNs next to the vectors so each embedding row can be mapped
# back to its book when the file is loaded later
embedding_payload = {
    'ISBNs': books['ISBN'].values,
    'embeddings': text_embeddings_tensor,
}
torch.save(embedding_payload, 'book_text_embeddings_bge.pt')
