In [1]:
import json
import os
import sys

import json
from tqdm import tqdm

def load_jsonl_to_memory(filepath, fraction=10):
    """Load an evenly spaced sample of a JSON Lines file into a list.

    The file is read twice: a first pass counts the lines, a second pass
    parses every ``fraction``-th line (line indices 0, ``fraction``,
    ``2 * fraction``, ...). Exactly ``total_lines // fraction`` records are
    returned, so a file with fewer than ``fraction`` lines yields an empty
    list instead of failing.

    Args:
        filepath: Path to a UTF-8 encoded file with one JSON document per line.
        fraction: Sampling stride; ``1`` loads every line. Must be >= 1.

    Returns:
        A list with the parsed JSON value of each sampled line, in file order.

    Raises:
        ValueError: If ``fraction`` is smaller than 1.
        json.JSONDecodeError: If a sampled line is not valid JSON.
    """
    if fraction < 1:
        raise ValueError(f"fraction must be a positive integer, got {fraction!r}")

    # First pass: count lines so the sample size and the progress total are known.
    with open(filepath, 'r', encoding='utf-8') as file:
        total_lines = sum(1 for _ in file)

    lines_to_process = total_lines // fraction
    if lines_to_process == 0:
        # Fewer lines than the stride (or an empty file): nothing to sample.
        return []

    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for index, line in enumerate(tqdm(file, total=total_lines, desc="Processing")):
            if index % fraction != 0:
                continue  # Not on the sampling stride
            data.append(json.loads(line))
            if len(data) >= lines_to_process:
                break  # Sample is complete; skip the rest of the file

    return data

# Load the preprocessed JSONL dump with the loader's default sampling stride.
jsonl_path = '/mnt/datassd/processed_file.jsonl'
data = load_jsonl_to_memory(filepath=jsonl_path)

Processing:  84%|████████▍ | 44727499/53209647 [00:16<00:03, 2737385.50it/s]

In [None]:
def deep_getsizeof(root):
    """Approximate the memory footprint of ``root`` and its contents in bytes.

    ``sys.getsizeof`` alone is shallow: for a list it reports only the list
    object and its pointer array, not the elements. This walks dicts, lists,
    tuples and sets iteratively and sums the shallow size of every object
    reached. Containers are counted once (which also guards against cycles);
    leaf objects shared between containers are counted at each occurrence, so
    the result is an upper-bound style estimate.
    """
    container_types = (dict, list, tuple, set, frozenset)
    seen_containers = set()
    pending = [root]
    total_bytes = 0
    while pending:
        current = pending.pop()
        is_container = isinstance(current, container_types)
        if is_container:
            if id(current) in seen_containers:
                continue
            seen_containers.add(id(current))
        total_bytes += sys.getsizeof(current)
        if isinstance(current, dict):
            pending.extend(current.keys())
            pending.extend(current.values())
        elif is_container:
            pending.extend(current)
    return total_bytes

# Size of the loaded records in GiB (1024**3 bytes), including their contents.
# This visits every loaded object, so it takes a while on a large sample.
deep_getsizeof(data) / 1024**3

In [None]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import numpy as np

# Tokenizer for the "bert-base-uncased" checkpoint, shared by the helpers below
pretrained_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint)

# Tokenize a sequence of texts chunk by chunk
def batch_encode(tokenizer, texts, batch_size=32, max_length=512):
    """Run ``tokenizer`` over ``texts`` in consecutive chunks of ``batch_size``.

    Every chunk is tokenized with truncation enabled, padding set to
    ``'max_length'``, the given ``max_length`` and ``return_tensors="np"``.

    Args:
        tokenizer: Callable taking a slice of texts plus the keyword options above.
        texts: Sliceable sequence of texts.
        batch_size: Number of texts per tokenizer call (the last chunk may be shorter).
        max_length: Forwarded to the tokenizer as ``max_length``.

    Returns:
        A list with one tokenizer result per chunk, in input order.
    """
    chunk_starts = range(0, len(texts), batch_size)
    return [
        tokenizer(
            texts[start:start + batch_size],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors="np",
        )
        for start in chunk_starts
    ]

# Data generator
def data_generator(data, batch_size=32, max_length=512):
    """Yield one shuffled pass over ``data`` as ``(input_ids, view_counts)`` batches.

    The records are visited in a random order drawn from NumPy's global random
    state. The order is applied through an index permutation, so the caller's
    sequence is left untouched. Titles are tokenized with the module-level
    ``tokenizer``, padded/truncated to ``max_length``.

    Args:
        data: Indexable sequence of mappings with ``'title'`` and ``'view_count'`` keys.
        batch_size: Records per yielded batch (the last batch may be smaller).
        max_length: Padding/truncation length forwarded to the tokenizer.

    Yields:
        Tuples ``(input_ids, view_counts)``: the tokenizer's ``'input_ids'`` for
        the batch and a NumPy array of the matching view counts.
    """
    # Shuffle positions rather than the sequence itself to avoid mutating the argument
    order = np.random.permutation(len(data))
    titles = [data[position]['title'] for position in order]
    view_counts = [data[position]['view_count'] for position in order]
    for start in range(0, len(titles), batch_size):
        batch_titles = titles[start:start + batch_size]
        batch_view_counts = view_counts[start:start + batch_size]
        tokens = tokenizer(batch_titles, max_length=max_length, padding='max_length', truncation=True, return_tensors="np")
        yield tokens['input_ids'], np.array(batch_view_counts)

# Split data into training and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


def repeat_batches(records, batch_size):
    """Yield batches from ``data_generator`` indefinitely.

    A single ``data_generator`` is exhausted after one pass over ``records``.
    This starts a fresh (reshuffled) pass whenever the previous one ends, and
    yields nothing at all when ``records`` is empty.
    """
    while len(records) > 0:
        yield from data_generator(records, batch_size=batch_size)


# Training runs for several epochs with a fixed steps_per_epoch, which draws more
# batches than one pass provides; a single-pass generator would run dry after the
# first epoch, so hand Keras a generator that keeps repeating.
train_generator = repeat_batches(train_data, batch_size=8)


In [None]:
# Select the Keras backend before anything imports Keras: the variable is only
# read at import time, and "pytorch" is not a recognised backend name (valid
# names include "tensorflow", "jax" and "torch"). Everything below uses
# TensorFlow classes (TFAutoModel, tensorflow.keras), so the backend is TensorFlow.
os.environ["KERAS_BACKEND"] = "tensorflow"

from transformers import TFAutoModel
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam


# Load pre-trained BERT model
bert = TFAutoModel.from_pretrained("bert-base-uncased")

# Define input layer: variable-length sequences of int32 token ids
input_ids = Input(shape=(None,), dtype='int32')

# Get BERT's output; element 0 is presumably the per-token hidden states
# (it is indexed as a rank-3 tensor below) - TODO confirm against the model docs
bert_output = bert(input_ids)[0]

# Custom head for regression, fed with the vector at sequence position 0
x = Dense(512, activation='relu')(bert_output[:, 0, :])
output = Dense(1, activation='linear')(x)

# Compile model
model = Model(inputs=input_ids, outputs=output)
model.compile(optimizer=Adam(learning_rate=1e-5), loss='mean_squared_error')

model.summary()


In [None]:
# Whole batches available per split; integer division drops a trailing partial batch.
# The divisor has to match the batch_size the generator was created with (8).
steps_batch_size = 8
train_steps, val_steps = (
    len(split) // steps_batch_size for split in (train_data, test_data)
)

# Fit on batches drawn from the generator, train_steps batches per epoch
model.fit(train_generator, epochs=3, steps_per_epoch=train_steps)


In [None]:
# Evaluate on the held-out split. X_test / y_test are never defined in this
# notebook, so feed test_data through the same batching/tokenizing generator
# used for training instead.
eval_batch_size = 8
# Ceiling division: count the final partial batch too, so every record is scored
eval_steps = -(-len(test_data) // eval_batch_size)
model.evaluate(data_generator(test_data, batch_size=eval_batch_size), steps=eval_steps)
