In [None]:
!pip install -q torch
!pip install -q transformers
!pip install -q pandas


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m96.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m81.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import torch
import time
import pandas as pd  # For CSV output
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, BertForQuestionAnswering
import json
from torch.optim import AdamW

# Mount Google Drive
# Colab-only: makes the user's Drive available under /content/drive so that
# artifacts written below outlive the Colab session.
from google.colab import drive
drive.mount('/content/drive')

# Directory and File Paths
# model_dir: Drive folder that receives every artifact this notebook writes.
model_dir = "/content/drive/My Drive/trained_models"
os.makedirs(model_dir, exist_ok=True)
# model_file: despite the name, this path is later passed to save_pretrained()
# for both the model and the tokenizer, so it is used as a directory.
model_file = os.path.join(model_dir, "trained_model_bert_csv")

Mounted at /content/drive


In [None]:
# Define the Model
# NOTE(review): `model`, `device` and `optimizer` are all assigned again in the
# training cell further down (which loads
# 'bert-large-uncased-whole-word-masking-finetuned-squad'), and nothing in
# between uses them. This bert-base instance therefore looks redundant (an
# extra download and device transfer) — confirm before removing.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Dataset Class
class QA_Dataset(Dataset):
    """Dataset view over a tokenizer encoding.

    ``encodings`` must offer ``.items()`` yielding (field name, per-example
    sequence) pairs and an ``input_ids`` attribute whose length is the number
    of examples.
    """

    def __init__(self, encodings):
        # Stored by reference, not copied: fields added to ``encodings`` after
        # construction are picked up by later ``__getitem__`` calls.
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the length of ``input_ids``."""
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict mapping each field name to a tensor."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample

# Data Loading and Preprocessing Functions
def load_data(file_path):
    """Read a SQuAD-format JSON file into parallel context/question/answer lists.

    If ``file_path`` does not exist yet, the SQuAD v2.0 train or dev split is
    downloaded to exactly that path first. The split is chosen from the file
    name only (not from the directories leading to it): a name containing
    ``'train'`` selects the train split, otherwise a name containing ``'dev'``
    selects the dev split.

    One (context, question, answer) triple is produced per entry of a
    question's ``answers`` list, so a question with several answers is
    repeated and a question whose ``answers`` list is empty contributes
    nothing.

    Args:
        file_path: Path of the JSON file to read (and to download to when it
            is missing).

    Returns:
        ``(texts, queries, answers)`` — three lists of equal length. Each
        answer is the dict read from the file with an added ``'answer_end'``
        key equal to ``answer_start + len(text)``, i.e. the exclusive end
        character offset. ``None`` is returned when the file is missing and
        its name identifies neither split, so nothing could be downloaded.

    Raises:
        OSError: If the download fails or the file cannot be read
            (``urllib.error.URLError`` is an ``OSError`` subclass).
        ValueError: If the file content is not valid JSON.
        KeyError: If the JSON lacks the expected SQuAD keys.
    """
    # Local import keeps this function self-contained; plain Python replaces
    # the IPython-only `!wget` so the function also works outside a notebook.
    import urllib.request

    squad_urls = {
        'train': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json',
        'dev': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json',
    }

    # Check if the data is already downloaded, if not, download it
    if not os.path.exists(file_path):
        print(f"{file_path} not found. Downloading now...")
        # Look at the file name only, so a directory such as "trained_models"
        # cannot be mistaken for the train split.
        file_name = os.path.basename(file_path)
        if 'train' in file_name:
            url = squad_urls['train']
        elif 'dev' in file_name:
            url = squad_urls['dev']
        else:
            print("Invalid file path. Cannot download data.")
            return None

        target_dir = os.path.dirname(file_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        # Download next to the target and rename on success, so an interrupted
        # transfer never leaves a truncated file that a later call would treat
        # as already downloaded.
        partial_path = f"{file_path}.part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, file_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # Load the data
    with open(file_path, 'rb') as f:
        squad_data = json.load(f)

    texts, queries, answers = [], [], []
    for group in squad_data['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    # The file stores only the start offset; derive the
                    # exclusive end offset from the answer text length.
                    answer['answer_end'] = answer['answer_start'] + len(answer['text'])
                    texts.append(context)
                    queries.append(question)
                    answers.append(answer)
    return texts, queries, answers

def add_token_positions(encodings, answers, tokenizer):
    """Attach token-level answer spans to ``encodings`` in place.

    For example ``i`` the start token is looked up from
    ``answers[i]['answer_start']`` and the end token from
    ``answers[i]['answer_end'] - 1`` (the last character of the answer) via
    ``encodings.char_to_token``. Whenever a lookup yields ``None`` the
    position is replaced by ``tokenizer.model_max_length``.

    The resulting lists are stored on ``encodings`` under the keys
    ``'start_positions'`` and ``'end_positions'``; nothing is returned.
    """
    token_starts = []
    token_ends = []
    for example_idx, answer in enumerate(answers):
        first_tok = encodings.char_to_token(example_idx, answer['answer_start'])
        last_tok = encodings.char_to_token(example_idx, answer['answer_end'] - 1)
        # No token for that character: substitute the tokenizer's maximum length.
        token_starts.append(tokenizer.model_max_length if first_tok is None else first_tok)
        token_ends.append(tokenizer.model_max_length if last_tok is None else last_tok)
    encodings.update({'start_positions': token_starts, 'end_positions': token_ends})


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Initialize tokenizer
# The tokenizer comes from the same checkpoint as the model loaded below.
tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Load and process data
# load_data downloads the file when it is missing and returns parallel lists;
# it returns None (making these unpackings fail) if it cannot obtain the file.
train_texts, train_queries, train_answers = load_data('squad/train-v2.0.json')
val_texts, val_queries, val_answers = load_data('squad/dev-v2.0.json')

# Tokenize data
# Contexts are passed as the first sequence and questions as the second.
train_encodings = tokenizer(train_texts, train_queries, truncation=True, padding=True)
print(train_encodings.keys())
val_encodings = tokenizer(val_texts, val_queries, truncation=True, padding=True)

# Prepare datasets and dataloaders
train_dataset = QA_Dataset(train_encodings)
val_dataset = QA_Dataset(val_encodings)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
# The datasets above hold references to the encodings, and add_token_positions
# updates the encodings in place, so 'start_positions' / 'end_positions' are
# present by the time the loaders are iterated even though they are added
# after the datasets were built.
add_token_positions(train_encodings, train_answers, tokenizer)
add_token_positions(val_encodings, val_answers, tokenizer)

# Initialize model and optimizer
# This rebinds `model` and `optimizer` from the earlier "Define the Model" cell.
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
optimizer = AdamW(model.parameters(), lr=5e-5)

# Check for GPU availability
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


# Training and Evaluation Loop
# Each epoch runs one full pass over train_loader with gradient updates,
# followed by one full pass over val_loader without gradients.
# train_losses / val_losses collect one mean-per-batch loss value per epoch.
train_losses = []
val_losses = []
print_every = 500  # log the current batch loss every `print_every` batches
epochs = 3
for epoch in range(epochs):
    epoch_time = time.time()  # start timestamp; covers training AND evaluation
    model.train()
    loss_of_epoch = 0
    print("Begin Training Here")
    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # Passing start/end positions makes the model output carry a loss.
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        loss_of_epoch += loss.item()
        if (batch_idx + 1) % print_every == 0:
            # The printed value is the loss of this single batch (rounded to one
            # decimal), not a running average.
            print(f"Batch {batch_idx + 1} / {len(train_loader)}\nLoss: {round(loss.item(), 1)}\n")
    # Mean of the per-batch losses over the whole training pass.
    loss_of_epoch /= len(train_loader)
    train_losses.append(loss_of_epoch)
    model.eval()
    loss_of_epoch = 0  # reused as the accumulator for the validation pass
    print("Begin Evaluation Here")
    for batch_idx, batch in enumerate(val_loader):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            loss_of_epoch += loss.item()
        if (batch_idx + 1) % print_every == 0:
            print(f"Batch {batch_idx + 1} / {len(val_loader)}\nLoss: {round(loss.item(), 1)}\n")
    # Mean of the per-batch losses over the whole validation pass.
    loss_of_epoch /= len(val_loader)
    val_losses.append(loss_of_epoch)
    # "Time" is the wall-clock duration of the epoch in seconds.
    print(f"\n------- Epoch {epoch + 1} -------\nTraining Loss: {train_losses[-1]}\nValidation Loss: {val_losses[-1]}\nTime: {(time.time() - epoch_time)}\n-----------------------\n\n")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

squad/train-v2.0.json not found. Downloading now...
--2023-10-22 06:06:04--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.111.153, 185.199.109.153, 185.199.108.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘squad/train-v2.0.json’


2023-10-22 06:06:07 (330 MB/s) - ‘squad/train-v2.0.json’ saved [42123633/42123633]

squad/dev-v2.0.json not found. Downloading now...
--2023-10-22 06:06:08--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.111.153, 185.199.109.153, 185.199.108.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘squ

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Begin Training Here
Batch 1000 / 10853
Loss: 1.3

Batch 2000 / 10853
Loss: 0.6

Batch 3000 / 10853
Loss: 0.8

Batch 4000 / 10853
Loss: 1.4

Batch 5000 / 10853
Loss: 1.8

Batch 6000 / 10853
Loss: 0.3

Batch 7000 / 10853
Loss: 1.8

Batch 8000 / 10853
Loss: 1.2

Batch 9000 / 10853
Loss: 1.5

Batch 10000 / 10853
Loss: 1.3

Begin Evaluation Here
Batch 1000 / 2538
Loss: 1.3

Batch 2000 / 2538
Loss: 0.7


------- Epoch 1 -------
Training Loss: 1.0283940129787332
Validation Loss: 1.2759204969768558
Time: 6876.9543471336365
-----------------------


Begin Training Here
Batch 1000 / 10853
Loss: 0.3

Batch 2000 / 10853
Loss: 0.5

Batch 3000 / 10853
Loss: 1.4

Batch 4000 / 10853
Loss: 0.3

Batch 5000 / 10853
Loss: 0.3

Batch 6000 / 10853
Loss: 0.3

Batch 7000 / 10853
Loss: 1.3

Batch 8000 / 10853
Loss: 1.6

Batch 9000 / 10853
Loss: 1.5

Batch 10000 / 10853
Loss: 1.4

Begin Evaluation Here
Batch 1000 / 2538
Loss: 1.1

Batch 2000 / 2538
Loss: 0.6


------- Epoch 2 -------
Training Loss: 0.9148554868

In [None]:
def output_to_csv(texts, queries, answers, output_file):
    """Write contexts, questions and answer spans to ``output_file`` as CSV.

    The file has one row per example and the columns ``Context``,
    ``Question``, ``Answer_Start``, ``Answer_End`` and ``Answer_Text`` (in
    that order, no index column). ``Answer_End`` is ``-1`` for any answer
    dict that has no ``'answer_end'`` key.
    """
    span_starts, span_ends, span_texts = [], [], []
    for answer in answers:
        span_starts.append(answer['answer_start'])
        span_ends.append(answer.get('answer_end', -1))
        span_texts.append(answer['text'])

    columns = {
        'Context': texts,
        'Question': queries,
        'Answer_Start': span_starts,
        'Answer_End': span_ends,
        'Answer_Text': span_texts,
    }
    pd.DataFrame(columns).to_csv(output_file, index=False)

# Save the context, questions, and answers to CSV
# Only the training split is exported; the validation lists are not written.
output_file = os.path.join(model_dir, "squad_data.csv")
output_to_csv(train_texts, train_queries, train_answers, output_file)

# Save the trained model and tokenizer
# Both are written to the same location (model_file, under model_dir on Drive).
model.save_pretrained(model_file)
tokenizer.save_pretrained(model_file)

['squad_data.csv', 'trained_model_bert_csv']
