In [7]:
!pip install transformers scikit-learn gradio
import json
import time  
import torch
from pathlib import Path
from transformers import AutoTokenizer, AdamW, BertForQuestionAnswering
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.cuda.amp import GradScaler, autocast  # Automatic Mixed Precision
import gradio as gr


# Load data function to reuse for both train and validation datasets
def load_squad_data(file_path):
    with open(file_path, 'rb') as f:
        squad_dict = json.load(f)

    contexts, questions, answers = [], [], []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    return contexts, questions, answers

# Load train and validation data
train_texts, train_queries, train_answers = load_squad_data(Path(r"D:\Downloads\archive (2)\train-v1.1.json"))
val_texts, val_queries, val_answers = load_squad_data(Path(r"D:\Downloads\archive (2)\dev-v1.1.json"))

# Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(train_texts, train_queries, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, val_queries, truncation=True, padding=True)

# Function to calculate and add token positions
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []

    for i in range(len(answers)):
        start_idx = answers[i]['answer_start']
        end_idx = start_idx + len(answers[i]['text'])

        start_positions.append(encodings.char_to_token(i, start_idx))
        end_positions.append(encodings.char_to_token(i, end_idx - 1))

        # Handle truncation cases where token positions are not found
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Apply token positions function to both train and validation encodings
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

# Custom Dataset Class
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create Dataset and DataLoader with pin_memory and larger batch size
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, pin_memory=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model setup
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(device)

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5)

# Automatic Mixed Precision setup
scaler = GradScaler()

# Hyperparameters
epochs = 3
print_every = 500
grad_accumulation_steps = 2  # Simulate larger batch sizes

# Training loop
train_losses = []
val_losses = []

whole_train_eval_time = time.time()

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    print(f"\nEpoch {epoch + 1}/{epochs}")

    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        optimizer.zero_grad()

        # Automatic Mixed Precision (AMP)
        with autocast():
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss / grad_accumulation_steps  # Normalize loss for gradient accumulation

        scaler.scale(loss).backward()

        # Update model parameters after accumulating gradients
        if (batch_idx + 1) % grad_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()

        total_train_loss += loss.item() * grad_accumulation_steps

        if (batch_idx + 1) % print_every == 0:
            print(f"Batch {batch_idx + 1}/{len(train_loader)} - Loss: {loss.item():.4f}")

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f"Training Loss: {avg_train_loss:.4f}")

    # Validation phase
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            with autocast():
                outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
                loss = outputs.loss

            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    print(f"Validation Loss: {avg_val_loss:.4f}")

total_time = time.time() - whole_train_eval_time
print(f"Total Training Time: {total_time:.2f} seconds")

# Save the model
torch.save(model.state_dict(), "finetunedmodel.pt")

# TF-IDF Vectorizer for context retrieval
vectorizer = TfidfVectorizer().fit(train_texts)

# Function to retrieve the most relevant context
def retrieve_relevant_context(question, contexts):
    question_vector = vectorizer.transform([question])
    context_vectors = vectorizer.transform(contexts)
    similarity_scores = (context_vectors * question_vector.T).toarray()
    most_relevant_context_idx = similarity_scores.argmax()
    return contexts[most_relevant_context_idx]

# Function to perform inference
def answer_question(context, question):
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt").to(device)
    
    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
    
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    start_idx = torch.argmax(start_scores)
    end_idx = torch.argmax(end_scores) + 1

    answer_tokens = inputs['input_ids'][0][start_idx:end_idx]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_tokens))

    return answer

# Example usage
context = retrieve_relevant_context("What did Albert Einstein develop?", train_texts)
question = "What did Albert Einstein develop?"

# Perform inference
answer = answer_question(context, question)
print(f"Question: {question}")
print(f"Answer: {answer}")




Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()
  with autocast():



Epoch 1/3


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Batch 500/5475 - Loss: 1.0645
Batch 1000/5475 - Loss: 0.7119
Batch 1500/5475 - Loss: 0.7037
Batch 2000/5475 - Loss: 0.5603
Batch 2500/5475 - Loss: 0.6453
Batch 3000/5475 - Loss: 0.8732
Batch 3500/5475 - Loss: 0.3840
Batch 4000/5475 - Loss: 0.6124
Batch 4500/5475 - Loss: 0.4306
Batch 5000/5475 - Loss: 0.4597
Training Loss: 1.5553


  with autocast():


Validation Loss: 1.1927

Epoch 2/3
Batch 500/5475 - Loss: 0.7118
Batch 1000/5475 - Loss: 0.3461
Batch 1500/5475 - Loss: 0.5781
Batch 2000/5475 - Loss: 0.4724
Batch 2500/5475 - Loss: 0.9673
Batch 3000/5475 - Loss: 0.5110
Batch 3500/5475 - Loss: 0.7702
Batch 4000/5475 - Loss: 1.0179
Batch 4500/5475 - Loss: 0.5730
Batch 5000/5475 - Loss: 0.4679
Training Loss: 1.0039
Validation Loss: 1.1516

Epoch 3/3
Batch 500/5475 - Loss: 0.3241
Batch 1000/5475 - Loss: 0.3788
Batch 1500/5475 - Loss: 0.3265
Batch 2000/5475 - Loss: 0.3812
Batch 2500/5475 - Loss: 0.2204
Batch 3000/5475 - Loss: 0.8653
Batch 3500/5475 - Loss: 0.5740
Batch 4000/5475 - Loss: 0.6528
Batch 4500/5475 - Loss: 0.1769
Batch 5000/5475 - Loss: 0.2374
Training Loss: 0.8235
Validation Loss: 1.1465
Total Training Time: 4328.62 seconds
Question: What did Albert Einstein develop?
Answer: einsteinhaus, from 1903 to 1905, the year in which the annus mirabilis papers


In [11]:
# Save the model
model.save_pretrained("./my-finetuned-bert-model")

# Save the tokenizer
tokenizer.save_pretrained("./my-finetuned-bert-model")

('./my-finetuned-bert-model\\tokenizer_config.json',
 './my-finetuned-bert-model\\special_tokens_map.json',
 './my-finetuned-bert-model\\vocab.txt',
 './my-finetuned-bert-model\\added_tokens.json',
 './my-finetuned-bert-model\\tokenizer.json')

In [12]:
import torch
from transformers import AutoTokenizer, BertForQuestionAnswering

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('./my-finetuned-bert-model')
model = BertForQuestionAnswering.from_pretrained('./my-finetuned-bert-model')

# Move model to the appropriate device (GPU/CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # Set the model to evaluation mode

# Function to perform inference
def answer_question(context, question):
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    start_idx = torch.argmax(start_scores)
    end_idx = torch.argmax(end_scores) + 1  # End index is inclusive

    answer_tokens = inputs['input_ids'][0][start_idx:end_idx]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_tokens))

    return answer

In [13]:
import json
import torch
from pathlib import Path
from transformers import AutoTokenizer, BertForQuestionAnswering
from sklearn.feature_extraction.text import TfidfVectorizer
import gradio as gr

# Load data function to reuse for context retrieval
def load_squad_data(file_path):
    with open(file_path, 'rb') as f:
        squad_dict = json.load(f)

    contexts, questions, answers = [], [], []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    return contexts, questions, answers

# Load train data for context retrieval
train_texts, train_queries, train_answers = load_squad_data(Path(r"D:\Downloads\archive (2)\train-v1.1.json"))

# Load the fine-tuned model and tokenizer from the saved directory
model = BertForQuestionAnswering.from_pretrained('./my-finetuned-bert-model')
tokenizer = AutoTokenizer.from_pretrained('./my-finetuned-bert-model')

# Set the model to evaluation mode and move it to the appropriate device
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# TF-IDF Vectorizer for context retrieval
vectorizer = TfidfVectorizer().fit(train_texts)

# Function to retrieve the most relevant context
def retrieve_relevant_context(question, contexts):
    question_vector = vectorizer.transform([question])
    context_vectors = vectorizer.transform(contexts)
    similarity_scores = (context_vectors * question_vector.T).toarray()
    most_relevant_context_idx = similarity_scores.argmax()
    return contexts[most_relevant_context_idx]

# Function to answer the user's question based on the retrieved context
def answer_question(question):
    # Retrieve the most relevant context based on the question
    context = retrieve_relevant_context(question, train_texts)
    
    # Tokenize input
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt").to(device)
    
    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the most likely beginning and end of the answer
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    start_idx = torch.argmax(start_scores)
    end_idx = torch.argmax(end_scores) + 1  # End index is inclusive

    # Convert token indices back to words
    answer_tokens = inputs['input_ids'][0][start_idx:end_idx]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_tokens))

    return answer

# Gradio interface function
def qa_chatbot(question):
    answer = answer_question(question)
    return answer

# Launch the Gradio chatbot interface
interface = gr.Interface(
    fn=qa_chatbot,
    inputs="text",
    outputs="text",
    title="BERT Q&A Chatbot",
    description="Ask any question, and the model will automatically retrieve the most relevant context and answer your question."
)

# Run the Gradio interface
interface.launch()


* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


