# AI-Powered Question Generation from Text

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartTokenizer, BartForConditionalGeneration
from datasets import Dataset
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [None]:
# Load dataset from Google Drive
data_path = '/content/drive/MyDrive/train.csv'
df = pd.read_csv(data_path)

In [None]:
df = df.sample(n=10000, random_state=42).reset_index(drop=True)

In [None]:
df.isnull().sum()

Unnamed: 0,0
id,0
title,0
context,0
question,0
answers,0


In [None]:
df.duplicated().sum()

np.int64(0)

In [None]:
df.head()

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def preprocess_data(row):
    context = row['context']
    question = row['question']
    input_text = f"generate question: {context}"
    return {'input_text': input_text, 'target_text': question}

In [None]:
# Apply preprocessing
processed_data = df.apply(preprocess_data, axis=1)
processed_df = pd.DataFrame(processed_data.tolist())

In [None]:
# Split into train and test sets
train_df, test_df = train_test_split(processed_df, test_size=0.2, random_state=42)

In [None]:
# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Tokenization for T5
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')
def tokenize_t5(examples):
    inputs = t5_tokenizer(examples['input_text'], max_length=512, truncation=True, padding='max_length')
    targets = t5_tokenizer(examples['target_text'], max_length=128, truncation=True, padding='max_length')
    inputs['labels'] = targets['input_ids']
    return inputs

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Tokenization for BART
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
def tokenize_bart(examples):
    inputs = bart_tokenizer(examples['input_text'], max_length=512, truncation=True, padding='max_length')
    targets = bart_tokenizer(examples['target_text'], max_length=128, truncation=True, padding='max_length')
    inputs['labels'] = targets['input_ids']
    return inputs

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [None]:
# Tokenize datasets
t5_train_dataset = train_dataset.map(tokenize_t5, batched=True)
t5_test_dataset = test_dataset.map(tokenize_t5, batched=True)
bart_train_dataset = train_dataset.map(tokenize_bart, batched=True)
bart_test_dataset = test_dataset.map(tokenize_bart, batched=True)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Set format for PyTorch
t5_train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
t5_test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
bart_train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
bart_test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# Load models
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [None]:
import torch
import os
from torch.utils.data import DataLoader

# Set environment variable to avoid memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
def train_model(model, train_dataset, test_dataset, epochs=5, batch_size=8, learning_rate=0.0001):  # Reduced batch size to 8
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    train_losses = []
    eval_losses = []
    train_accuracies = []
    eval_accuracies = []

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        total_train_correct = 0
        total_train_samples = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            # Compute training accuracy
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            total_train_correct += (predictions == labels).sum().item()
            total_train_samples += labels.numel()

            # Clear memory after each batch
            del input_ids, attention_mask, labels, outputs, logits, predictions
            torch.cuda.empty_cache()

        avg_train_loss = total_train_loss / len(train_loader)
        train_accuracy = total_train_correct / total_train_samples
        train_losses.append(avg_train_loss)
        train_accuracies.append(train_accuracy)
        print(f"Epoch {epoch + 1}/{epochs} - Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

        # Evaluation
        model.eval()
        total_eval_loss = 0
        total_eval_correct = 0
        total_eval_samples = 0

        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                total_eval_loss += outputs.loss.item()
                # Compute validation accuracy
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)
                total_eval_correct += (predictions == labels).sum().item()
                total_eval_samples += labels.numel()

                # Clear memory after each batch
                del input_ids, attention_mask, labels, outputs, logits, predictions
                torch.cuda.empty_cache()

        avg_eval_loss = total_eval_loss / len(test_loader)
        eval_accuracy = total_eval_correct / total_eval_samples
        eval_losses.append(avg_eval_loss)
        eval_accuracies.append(eval_accuracy)
        print(f"Epoch {epoch + 1}/{epochs} - Evaluation Loss: {avg_eval_loss:.4f}, Validation Accuracy: {eval_accuracy:.4f}")

    return model, train_losses, eval_losses, train_accuracies, eval_accuracies

In [None]:
# Define smoothing function
smoothie = SmoothingFunction().method1

def compute_metrics(model, tokenizer, test_dataset, device='cuda'):
    model.eval()
    bleu_scores = []
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    test_loader = DataLoader(test_dataset, batch_size=8)

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128, num_beams=4)

            pred_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            ref_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

            # Compute BLEU with smoothing
            for pred, ref in zip(pred_texts, ref_texts):
                bleu = sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie)
                bleu_scores.append(bleu)

            # Compute ROUGE
            scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
            for pred, ref in zip(pred_texts, ref_texts):
                scores = scorer.score(ref, pred)
                rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
                rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
                rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

    avg_bleu = np.mean(bleu_scores)
    avg_rouge1 = np.mean(rouge_scores['rouge1'])
    avg_rouge2 = np.mean(rouge_scores['rouge2'])
    avg_rougeL = np.mean(rouge_scores['rougeL'])

    return {'BLEU': avg_bleu, 'ROUGE-1': avg_rouge1, 'ROUGE-2': avg_rouge2, 'ROUGE-L': avg_rougeL}

In [None]:
# Training the T5 model
print("Training T5 model...")
t5_model, t5_train_losses, t5_eval_losses, t5_train_accs, t5_eval_accs = train_model(t5_model, t5_train_dataset, t5_test_dataset)

Training T5 model...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/5 - Training Loss: 0.4589, Training Accuracy: 0.9318
Epoch 1/5 - Evaluation Loss: 0.2567, Validation Accuracy: 0.9515
Epoch 2/5 - Training Loss: 0.2671, Training Accuracy: 0.9496
Epoch 2/5 - Evaluation Loss: 0.2468, Validation Accuracy: 0.9525
Epoch 3/5 - Training Loss: 0.2525, Training Accuracy: 0.9514
Epoch 3/5 - Evaluation Loss: 0.2423, Validation Accuracy: 0.9530
Epoch 4/5 - Training Loss: 0.2413, Training Accuracy: 0.9528
Epoch 4/5 - Evaluation Loss: 0.2395, Validation Accuracy: 0.9533
Epoch 5/5 - Training Loss: 0.2305, Training Accuracy: 0.9540
Epoch 5/5 - Evaluation Loss: 0.2390, Validation Accuracy: 0.9534


In [None]:
# Training the BART model
print("Training BART model...")
bart_model, bart_train_losses, bart_eval_losses, bart_train_accs, bart_eval_accs = train_model(bart_model, bart_train_dataset, bart_test_dataset)

Training BART model...
Epoch 1/5 - Training Loss: 0.4448, Training Accuracy: 0.9354
Epoch 1/5 - Evaluation Loss: 0.2349, Validation Accuracy: 0.9542
Epoch 2/5 - Training Loss: 0.2112, Training Accuracy: 0.9564
Epoch 2/5 - Evaluation Loss: 0.2428, Validation Accuracy: 0.9538
Epoch 3/5 - Training Loss: 0.1718, Training Accuracy: 0.9615
Epoch 3/5 - Evaluation Loss: 0.2559, Validation Accuracy: 0.9522


In [None]:
# Plotting accuracies
plt.figure(figsize=(12, 6))
plt.plot(t5_train_accs, label='T5 Training Accuracy', color='blue', linestyle='--')
plt.plot(t5_eval_accs, label='T5 Validation Accuracy', color='blue')
plt.plot(bart_train_accs, label='BART Training Accuracy', color='orange', linestyle='--')
plt.plot(bart_eval_accs, label='BART Validation Accuracy', color='orange')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy Over Epochs')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

In [None]:
# 1. Training Loss Comparison (Line Plot)
plt.figure(figsize=(10, 6))
plt.plot(t5_train_losses, label='T5 Training Loss')
plt.plot(bart_train_losses, label='BART Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Comparison')
plt.legend()
plt.show()

In [None]:
# 2. Evaluation Loss Comparison (Line Plot)
plt.figure(figsize=(10, 6))
plt.plot(t5_eval_losses, label='T5 Evaluation Loss')
plt.plot(bart_eval_losses, label='BART Evaluation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Evaluation Loss Comparison')
plt.legend()
plt.show()