In [18]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

def read_and_prepare_data(file_path):
    """Load a CSV and add a ``combined`` text column built from descriptive fields.

    The candidate fields are ``title``, ``make``, ``model``, ``features`` and
    ``accessories``; only those actually present in the file are used, in that
    order. Each row's ``combined`` value is a space-joined sequence of
    ``"<column>: <value>"`` fragments, where a missing value contributes an
    empty string after the colon.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame with the extra ``combined`` column.

    Raises:
        ValueError: If the file contains none of the candidate columns.
    """
    frame = pd.read_csv(file_path)

    wanted = ('title', 'make', 'model', 'features', 'accessories')
    present = [name for name in wanted if name in frame.columns]
    if len(present) == 0:
        raise ValueError("None of the specified columns exist in the CSV file.")

    def describe_row(row):
        # Build "col: value" for every available field; NaN/None become ''.
        fragments = []
        for name in present:
            value = row[name]
            text = str(value) if pd.notna(value) else ''
            fragments.append(f"{name}: {text}")
        return ' '.join(fragments)

    # Row-wise apply is kept so each row is seen as a Series over `present`.
    frame['combined'] = frame[present].apply(describe_row, axis=1)
    return frame

# Load the cased BERT-large tokenizer and encoder once at module level;
# generate_bert_vectors reads both of these globals.
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
# eval() returns the module itself, so `model` is the loaded network already
# switched to inference mode.
model = BertModel.from_pretrained('bert-large-cased').eval()

def generate_bert_vectors(data, column_name, batch_size=10):
    """Embed a text column with BERT and return one vector per row.

    Rows are passed through the module-level ``tokenizer`` and ``model`` in
    consecutive slices of ``batch_size``. For every row the last-layer hidden
    state at sequence position 0 (the ``[CLS]`` position for BERT tokenizers)
    is kept as that row's embedding.

    Args:
        data: Sliceable table (e.g. a pandas DataFrame) whose ``column_name``
            column holds the strings to embed.
        column_name: Name of the text column to embed.
        batch_size: Number of rows per forward pass; must be at least 1.

    Returns:
        A 2-D NumPy array with one row per input record, in input order.
        When ``data`` is empty the result has shape
        ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}.")

    total_records = len(data)
    if total_records == 0:
        # np.vstack([]) would raise; return a well-shaped empty matrix instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    total_batches = (total_records + batch_size - 1) // batch_size
    # Report roughly every 1000 records. Clamp to 1 so batch sizes above 1000
    # do not produce a zero divisor in the modulo below.
    report_every = max(1, 1000 // batch_size)

    vectors = []
    records_processed = 0
    for batch_index in range(total_batches):
        batch = data[batch_index * batch_size:(batch_index + 1) * batch_size]
        encoded_input = tokenizer(list(batch[column_name]), padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():  # inference only, no autograd bookkeeping
            outputs = model(**encoded_input)
        # Keep only the hidden state at the first token position of each sequence.
        batch_vectors = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        vectors.append(batch_vectors)
        records_processed += len(batch)
        if (batch_index + 1) % report_every == 0 or batch_index == total_batches - 1:
            print(f"Processed {batch_index + 1} batches of {batch_size} records each, total records processed: {records_processed}.")
    return np.vstack(vectors)

# Read and process the data
data = read_and_prepare_data('data/train.csv')
# data = data.head(20)  # Limit the data for debugging purposes

# Create the output folder before the expensive embedding pass: np.save does
# not create missing parent directories, and failing at save time would throw
# away the whole run.
os.makedirs('data/processed', exist_ok=True)

# Generate BERT vectors and save to a .npy file
bert_vectors = generate_bert_vectors(data, 'combined', batch_size=32)
# Every input row must have produced exactly one embedding row, otherwise the
# index-based split below would be misaligned with `data`.
assert len(bert_vectors) == len(data), "embedding count does not match row count"
np.save('data/processed/train_full_vectors.npy', bert_vectors)

# Split data into train and validation sets after processing.
# The split is done on row positions so the same indices can be used to select
# matching rows from `data`; random_state keeps it reproducible.
train_indices, valid_indices = train_test_split(np.arange(len(data)), test_size=0.2, random_state=42)
train_vectors = bert_vectors[train_indices]
valid_vectors = bert_vectors[valid_indices]

# Save train and validation vectors
np.save('data/processed/train_vectors.npy', train_vectors)
np.save('data/processed/valid_vectors.npy', valid_vectors)

# Print to confirm
print("Train full vectors shape:", bert_vectors.shape)
print("Train vectors shape:", train_vectors.shape)
print("Validation vectors shape:", valid_vectors.shape)


Processed 31 batches of 32 records each, total records processed: 992.
Processed 62 batches of 32 records each, total records processed: 1984.
Processed 93 batches of 32 records each, total records processed: 2976.
Processed 124 batches of 32 records each, total records processed: 3968.
Processed 155 batches of 32 records each, total records processed: 4960.
Processed 186 batches of 32 records each, total records processed: 5952.
Processed 217 batches of 32 records each, total records processed: 6944.
Processed 248 batches of 32 records each, total records processed: 7936.
Processed 279 batches of 32 records each, total records processed: 8928.
Processed 310 batches of 32 records each, total records processed: 9920.
Processed 341 batches of 32 records each, total records processed: 10912.
Processed 372 batches of 32 records each, total records processed: 11904.
Processed 403 batches of 32 records each, total records processed: 12896.
Processed 434 batches of 32 records each, total reco

In [19]:
# Read and process the data
data = read_and_prepare_data('data/test.csv')
# data = data.head(20)  # Limit the data for debugging purposes

# Create the output folder before the expensive embedding pass: np.save does
# not create missing parent directories.
os.makedirs('data/processed', exist_ok=True)

test_vectors = generate_bert_vectors(data, 'combined', batch_size=32)

# Save the vectors to a .npy file
np.save('data/processed/test_vectors.npy', test_vectors)

# These are the test-set embeddings, so label them as such.
print("Test vectors shape:", test_vectors.shape)

Processed 31 batches of 32 records each, total records processed: 992.
Processed 62 batches of 32 records each, total records processed: 1984.
Processed 93 batches of 32 records each, total records processed: 2976.
Processed 124 batches of 32 records each, total records processed: 3968.
Processed 155 batches of 32 records each, total records processed: 4960.
Processed 186 batches of 32 records each, total records processed: 5952.
Processed 217 batches of 32 records each, total records processed: 6944.
Processed 248 batches of 32 records each, total records processed: 7936.
Processed 279 batches of 32 records each, total records processed: 8928.
Processed 310 batches of 32 records each, total records processed: 9920.
Processed 313 batches of 32 records each, total records processed: 10000.
Validation vectors shape: (10000, 1024)
