In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm


In [None]:
# Notebook shell command (not Python): force-reinstalls ipykernel, updating its
# dependencies, into the conda environment located at the -p prefix path.
# NOTE(review): the prefix path contains a space ("Bert Agent") and is not
# quoted, so it may be split into two arguments -- confirm the install targets
# the intended environment and quote/escape the path if it does not.
conda install -p /home/admin/projects/active/Bert Agent/.conda ipykernel --update-deps --force-reinstall

In [None]:

# Path of the text file that holds the raw training corpus
data_file = "concatenate.txt"

# The tokenizer and the classification model are loaded from the same
# pretrained checkpoint so their vocabularies agree
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint)

# Function to preprocess each line of the dataset
def preprocess_line(line):
    """Return the text stored in the last ``+++$+++``-separated field of a line.

    Args:
        line: One raw line of the dataset, with or without a trailing newline.

    Returns:
        The text following the final ``+++$+++`` marker with surrounding
        whitespace removed. If the line contains no marker, the whole line
        (stripped) is returned. An empty last field yields ``""``.
    """
    # Partition on the bare marker and strip afterwards. Stripping the whole
    # line first removes the space that follows the final marker when the
    # last field is empty, so a " +++$+++ " delimiter no longer matches and
    # the previous field (plus the marker) would be returned as content.
    _, _, content = line.rpartition("+++$+++")
    return content.strip()

# Function to tokenize and preprocess the text
def preprocess_text(text, max_length=512):
    """Convert raw text into a 1-D tensor of BERT token ids.

    The text is tokenized with the module-level ``tokenizer``, wrapped in
    ``[CLS]`` / ``[SEP]`` markers and mapped to vocabulary ids.

    Args:
        text: The string to encode.
        max_length: Upper bound on the length of the returned sequence,
            special tokens included. The default of 512 is the
            position-embedding limit of ``bert-base-uncased``; longer inputs
            would fail inside the model, so they are truncated here.

    Returns:
        A 1-D ``torch.long`` tensor of token ids with at most ``max_length``
        elements.

    Raises:
        ValueError: If ``max_length`` leaves no room for the two special
            tokens.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to hold [CLS] and [SEP]")

    # Reserve two slots for the [CLS] and [SEP] markers added below.
    tokens = tokenizer.tokenize(text)[:max_length - 2]
    tokens = ['[CLS]', *tokens, '[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Explicit dtype: embedding lookups require integer (long) indices.
    return torch.tensor(input_ids, dtype=torch.long)

# Read the corpus and turn every line into a 1-D tensor of token ids.
# The encoding is stated explicitly so decoding does not depend on the
# platform default; undecodable bytes are replaced instead of aborting the run.
with open(data_file, 'r', encoding='utf-8', errors='replace') as f:
    input_sequences = [preprocess_text(preprocess_line(line)) for line in f]

# Fail early with a clear message instead of an opaque error further down.
if not input_sequences:
    raise ValueError(f"No lines found in {data_file!r}; nothing to train on")

# Right-pad every sequence with zeros up to the longest one and stack them
# into a single (num_examples, max_length) tensor.
input_sequences = torch.nn.utils.rnn.pad_sequence(
    input_sequences, batch_first=True, padding_value=0
)
max_length = input_sequences.shape[1]

# Dummy labels for example: every line is assigned class 0
labels = torch.zeros(input_sequences.shape[0], dtype=torch.long)

# Create DataLoader yielding shuffled (input_ids, labels) mini-batches
dataset = TensorDataset(input_sequences, labels)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Fine-tune BERT
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

model.train()
for epoch in range(3):  # Adjust number of epochs as needed
    # Unpack into batch-local names so the module-level `labels` tensor is
    # not overwritten by the last mini-batch.
    for inputs, batch_labels in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        # The sequences were right-padded with zeros; mask those positions so
        # self-attention ignores the padding instead of treating it as input.
        attention_mask = (inputs != 0).long()
        # Element 0 of the model output holds the classification logits.
        outputs = model(inputs, attention_mask=attention_mask)[0]
        loss = loss_fn(outputs, batch_labels)
        loss.backward()
        optimizer.step()