In [1]:
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from torch.utils.data import Dataset as TorchDataset

2024-09-13 19:37:55.027921: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-13 19:37:55.051720: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-13 19:37:55.060019: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-13 19:37:55.079655: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
 # Load your custom dataset from CSV
df = pd.read_csv("/home/dnn/Storage8TB/pythonCodeArea/srijita/Text/qna_output.csv")
dataset = Dataset.from_pandas(df)

In [2]:
# Custom Dataset class to handle labels
class CustomDataset(TorchDataset):
    def __init__(self, input_ids, attention_mask, start_positions, end_positions):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.start_positions = start_positions
        self.end_positions = end_positions

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'start_positions': self.start_positions[idx],
            'end_positions': self.end_positions[idx]
        }

    def __len__(self):
        return len(self.input_ids)

In [3]:
# Custom Data Collator
class CustomDataCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features):
        batch = {key: torch.stack([f[key] for f in features]) for key in features[0].keys()}
        return batch

# Load models and tokenizers
def load_models_and_tokenizers():
    qa_model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
    qa_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")

    if torch.cuda.is_available():
        qa_model = qa_model.to("cuda")

    return qa_model, qa_tokenizer

In [8]:
# Preprocess the dataset
def preprocess_function(examples, qa_tokenizer):
    inputs = examples['text']
    answers = examples['answer']
    
    model_inputs = qa_tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    
    start_positions = []
    end_positions = []

    for i in range(len(inputs)):
        context = inputs[i]
        answer = answers[i]
        
        start, end = find_answer_positions(context, answer, qa_tokenizer)
        start_positions.append(start)
        end_positions.append(end)

    return {
        'input_ids': model_inputs['input_ids'],
        'attention_mask': model_inputs['attention_mask'],
        'start_positions': start_positions,
        'end_positions': end_positions
    }

In [9]:
# Find start and end positions for the answers in the context
def find_answer_positions(context, answer, tokenizer):
    tokenized_context = tokenizer(context, add_special_tokens=False)['input_ids']
    tokenized_answer = tokenizer(answer, add_special_tokens=False)['input_ids']
    for i in range(len(tokenized_context) - len(tokenized_answer) + 1):
        if tokenized_context[i:i+len(tokenized_answer)] == tokenized_answer:
            return i, i + len(tokenized_answer) - 1
    return 0, 0  # Default to 0,0 if answer is not found

In [10]:
# Train the model with user-defined parameters
def train_model(model, tokenizer, train_dataset, valid_dataset, epochs, batch_size, learning_rate):
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
        data_collator=CustomDataCollator(tokenizer),
    )

    trainer.train()

In [None]:
# Function to generate answers based on context and question
def generate_answer(context, question, qa_model, qa_tokenizer):
    inputs = qa_tokenizer.encode_plus(question, context, return_tensors='pt', max_length=512, truncation=True)
    if torch.cuda.is_available():
        inputs = {k: v.to('cuda') for k, v in inputs.items()}

    outputs = qa_model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    return qa_tokenizer.convert_tokens_to_string(qa_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

In [11]:
# Main execution
if __name__ == "__main__":
    # Load models and tokenizers
    qa_model, qa_tokenizer = load_models_and_tokenizers()

  return torch._C._cuda_getDeviceCount() > 0


In [13]:

train_test_split = dataset.train_test_split(test_size=0.2)

# Preprocess dataset
train_dataset = train_test_split['train'].map(lambda x: preprocess_function(x, qa_tokenizer), batched=True)
valid_dataset = train_test_split['test'].map(lambda x: preprocess_function(x, qa_tokenizer), batched=True)


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

In [14]:
  # Convert to PyTorch Dataset
train_torch_dataset = CustomDataset(
    input_ids=torch.tensor(train_dataset['input_ids']),
    attention_mask=torch.tensor(train_dataset['attention_mask']),
    start_positions=torch.tensor(train_dataset['start_positions']),
    end_positions=torch.tensor(train_dataset['end_positions'])
)

valid_torch_dataset = CustomDataset(
    input_ids=torch.tensor(valid_dataset['input_ids']),
    attention_mask=torch.tensor(valid_dataset['attention_mask']),
    start_positions=torch.tensor(valid_dataset['start_positions']),
    end_positions=torch.tensor(valid_dataset['end_positions'])
)


In [15]:

epochs = 50
batch_size = 8
learning_rate = 2e-5

# Train the model
train_model(qa_model, qa_tokenizer, train_torch_dataset, valid_torch_dataset, epochs, batch_size, learning_rate)




Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msrijitaghoshhajra9[0m ([33msrijitaghoshhajra9-college-board[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,2.248546
2,No log,2.417482
3,No log,2.317642
4,1.873700,1.961863
5,1.873700,1.848046
6,1.873700,1.88665
7,0.623400,2.165314
8,0.623400,2.307266
9,0.623400,2.334587
10,0.201700,2.186985


In [17]:
# Example input text
example_text = "I love to read. Reading is my hobby. I read books belonging to all genres. But it is adventure and mystery stories that keep me the most engaged. I enjoy reading Sherlock Holmes stories by Arthur Conan Doyle. "

# Generate questions and answers
question = "What is my hobby?"
answer = generate_answer(example_text, question, qa_model, qa_tokenizer)

# Print results
print(f"\nInput Text: {example_text}")
print(f"Generated Question: {question}")
print(f"Generated Answer: {answer}")


Input Text: I love to read. Reading is my hobby. I read books belonging to all genres. But it is adventure and mystery stories that keep me the most engaged. I enjoy reading Sherlock Holmes stories by Arthur Conan Doyle. 
Generated Question: What is my hobby?
Generated Answer: Reading
