In [None]:
!pip install transformers datasets torch sentencepiece


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupt

In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset, load_dataset # Instead of load_metric, use load_dataset
from google.colab import files

In [None]:
# Step 2: Upload a CSV through the Colab file picker and read it with pandas
uploaded = files.upload()
# `uploaded` is indexed by file name; the first uploaded file is the one that is read.
file_name = list(uploaded)[0]
df = pd.read_csv(file_name)

In [None]:
# Keep only the two columns used for fine-tuning and give them the names the
# preprocessing step reads ("input_text" / "target_text").
# Rows with a missing value are dropped BEFORE the string cast: astype(str) would
# otherwise turn NaN into the literal text "nan" and those rows would be trained on.
# The chain builds a new frame instead of calling rename(inplace=True) on a filtered one.
df = (
    df[["context", "generated_questions"]]
    .dropna()
    .rename(columns={"context": "input_text", "generated_questions": "target_text"})
    .astype(str)
    .reset_index(drop=True)  # keep a plain 0..n-1 index after rows were dropped
)

In [None]:
# Step 3: Add T5 prefixes and convert to HuggingFace Dataset
df['input_text'] = "generate question: " + df['input_text']
dataset = Dataset.from_pandas(df)
# Split dataset into training and validation sets.
# The fixed seed makes the 90/10 split identical on every run, so validation
# numbers from different runs are measured on the same examples.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
valid_dataset = train_test_split['test']


In [None]:
# Step 4: Load Tokenizer and Model
# Both objects are created from the same checkpoint name.
# Change to "t5-base" or "t5-large" for better performance.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(
    model_name
)
model = T5ForConditionalGeneration.from_pretrained(
    model_name
)


In [None]:
# Step 5: Preprocessing Function
def preprocess_function(examples):
    """Tokenize one batch of examples into padded inputs plus loss-masked labels.

    Args:
        examples: Batch mapping with "input_text" and "target_text" sequences;
            every entry is converted with ``str`` before tokenization.

    Returns:
        The tokenizer output for the inputs (padded/truncated to 512 tokens) with an
        added "labels" entry built from the targets (padded/truncated to 128 tokens),
        where every padding id has been replaced by -100.
    """
    source_texts = list(map(str, examples['input_text']))
    target_texts = list(map(str, examples['target_text']))

    # Same truncation/padding policy for both sides; only the length limit differs.
    encode_options = {"truncation": True, "padding": "max_length"}
    model_inputs = tokenizer(source_texts, max_length=512, **encode_options)
    target_ids = tokenizer(target_texts, max_length=128, **encode_options).input_ids

    # Padding positions are rewritten to -100 for the loss calculation.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in row]
        for row in target_ids
    ]
    return model_inputs

# Tokenize datasets (training split first, then validation), one batch at a time
tokenized_train, tokenized_valid = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, valid_dataset)
)

In [None]:
# Step 6: Define Training Arguments (Optimized for ROUGE Score)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,  # Lower LR for better generalization
    per_device_train_batch_size=16,  # Increased batch size
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
    push_to_hub=False,
    # remove_unused_columns is intentionally left at its default (True).
    # The tokenized datasets still carry the raw "input_text"/"target_text" string
    # columns; with remove_unused_columns=False those strings reach the padding
    # collator, which cannot convert them to tensors, and training fails.
)


In [None]:
# Step 7: Set up the Trainer
# Bundles the model, its tokenizer, the training arguments and both tokenized splits.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    eval_dataset=tokenized_valid,
    train_dataset=tokenized_train,
)

In [None]:
# Step 8: Fine-tune the Model
# NOTE(review): never paste credentials into the notebook. If experiment tracking asks
# for an API key, enter it at the interactive prompt or provide it through an environment
# variable / Colab secret. Any key that was ever committed here should be treated as
# leaked and rotated.
trainer.train()

In [None]:
# Step 9: Save the Model
# Model weights and tokenizer files are written to the same directory.
save_dir = "t5_question_generation_model_v2"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


In [None]:
# Step 10: Define Function to Generate Questions (Customizable Count)
def generate_questions(text, num_questions=1):
    """Sample candidate questions for a single passage.

    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: Source passage. The "generate question: " task prefix is added here,
            so callers pass the raw text.
        num_questions: Number of sampled sequences to return.

    Returns:
        A list with ``num_questions`` decoded strings. Sampling is random, so
        repeated calls can return different questions.
    """
    # Do not rely on whatever mode the model was left in: generation must run
    # with dropout disabled.
    model.eval()

    # Same 512-token limit as the training inputs; longer passages are truncated.
    encoded = tokenizer(
        "generate question: " + text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )

    output_ids = model.generate(
        input_ids=encoded.input_ids.to(model.device),
        attention_mask=encoded.attention_mask.to(model.device),
        max_length=128,
        num_return_sequences=num_questions,  # Generates multiple questions
        do_sample=True,  # sampling is what lets the returned sequences differ
        top_k=50,
        top_p=0.95,
    )

    return [tokenizer.decode(seq, skip_special_tokens=True) for seq in output_ids]
# Example Usage: sample three questions for one sentence and print the list
sample_text = "The Eiffel Tower is a famous landmark in Paris, France."
sample_questions = generate_questions(sample_text, num_questions=3)
print(sample_questions)


In [None]:
# Step 11: Evaluate Model Performance (ROUGE and BLEU)
# `load_metric` is not imported anywhere in this notebook, so the metrics are loaded
# through the `evaluate` package instead (requires: pip install evaluate rouge-score).
import numpy as np
import evaluate

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")


def _rouge_f1(score):
    """Return an F-measure from either a plain float or an aggregate score with `.mid`."""
    return score.mid.fmeasure if hasattr(score, "mid") else float(score)


def _logits_to_token_ids(logits, labels):
    """Reduce raw model outputs to arg-max token ids before they are accumulated.

    Keeping ids instead of full vocabulary-sized logits keeps evaluation memory small.
    `labels` is unused but part of the hook signature.
    """
    if isinstance(logits, (tuple, list)):
        logits = logits[0]  # assumes the first element holds the LM logits - TODO confirm
    return logits.argmax(dim=-1)


def compute_metrics(eval_pred):
    """Decode predictions/labels and score them with ROUGE and BLEU.

    Args:
        eval_pred: Pair ``(predictions, labels)``. Predictions may be token ids,
            raw logits (3-D, reduced with arg-max), or a tuple whose first element
            is one of those. Labels use -100 at ignored positions.

    Returns:
        Dict with "rouge1", "rouge2", "rougeL" (F-measures) and "bleu".

    NOTE(review): when called from a plain ``Trainer`` the predictions are the
    arg-max of teacher-forced logits, not free-running generation, so the scores
    are likely optimistic compared with ``model.generate`` output.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, (tuple, list)):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 is a loss-masking marker, not a vocabulary id, so it must be mapped back
    # to the pad id before decoding. Predictions at masked label positions are
    # blanked as well because nothing supervises them.
    pad_id = tokenizer.pad_token_id
    if predictions.shape == labels.shape:
        predictions = np.where(labels == -100, pad_id, predictions)
    predictions = np.where(predictions < 0, pad_id, predictions)
    labels = np.where(labels == -100, pad_id, labels)

    decoded_preds = [p.strip() for p in tokenizer.batch_decode(predictions, skip_special_tokens=True)]
    decoded_labels = [l.strip() for l in tokenizer.batch_decode(labels, skip_special_tokens=True)]

    # Compute ROUGE scores
    rouge_scores = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # Compute BLEU score. The `evaluate` BLEU metric takes raw strings and one list of
    # references per prediction; skip it when every prediction decoded to "".
    if any(decoded_preds):
        bleu_value = bleu.compute(
            predictions=decoded_preds,
            references=[[label] for label in decoded_labels],
        )["bleu"]
    else:
        bleu_value = 0.0

    return {
        "rouge1": _rouge_f1(rouge_scores["rouge1"]),
        "rouge2": _rouge_f1(rouge_scores["rouge2"]),
        "rougeL": _rouge_f1(rouge_scores["rougeL"]),
        "bleu": bleu_value,
    }

# Evaluate Model on Validation Set
# The trainer was built without a metrics function, so attach it (and the logits
# reducer) here; otherwise evaluate() reports the loss only.
trainer.compute_metrics = compute_metrics
trainer.preprocess_logits_for_metrics = _logits_to_token_ids
trainer.evaluate()

In [None]:
# Step 1: Install necessary libraries
!pip install -q transformers datasets huggingface_hub pandas

# Step 2: Import necessary libraries
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset

# Step 3: Load your dataset
from google.colab import files
uploaded = files.upload()

# Read the first uploaded file into a DataFrame
file_name = list(uploaded)[0]
df = pd.read_csv(file_name)

# Keep the two required columns, drop rows with missing values, rename them to the
# names the preprocessing step reads, and force string dtype to avoid tensor errors.
# The chain builds a new frame instead of calling rename(inplace=True) on a filtered one.
df = (
    df[['context', 'generated_questions']]
    .dropna()
    .rename(columns={"context": "input_text", "generated_questions": "target_text"})
    .astype(str)
    .reset_index(drop=True)  # plain 0..n-1 index after rows were dropped
)

# Step 4: Prepare the dataset for HuggingFace
df['input_text'] = "generate question: " + df['input_text']

# Convert DataFrame to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

# Split dataset. The fixed seed makes the 90/10 split identical on every run, so the
# validation scores computed later are measured on the same examples each time.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
valid_dataset = train_test_split['test']

# Step 5: Load the T5 tokenizer and model from the same checkpoint name
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(
    model_name
)
model = T5ForConditionalGeneration.from_pretrained(
    model_name
)

# Preprocessing function
def preprocess_function(examples):
    """Tokenize one batch of examples into padded tensors plus loss-masked labels.

    Args:
        examples: Batch mapping with "input_text" and "target_text" sequences;
            every entry is converted with ``str`` before tokenization.

    Returns:
        The tokenizer output for the inputs (padded/truncated to 512 tokens, as
        PyTorch tensors) with an added "labels" tensor built from the targets
        (padded/truncated to 128 tokens) in which padding ids are replaced by -100.
    """
    inputs = [str(i) for i in examples['input_text']]
    targets = [str(t) for t in examples['target_text']]

    # Tokenize inputs and targets
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length", return_tensors="pt").input_ids

    # Replace padding token ID in labels with -100 for loss calculation.
    # One vectorized assignment on the tensor replaces the per-element Python loop,
    # which produced nested lists mixing 0-d tensors and plain ints.
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    return model_inputs

# Apply preprocessing with batched=True (training split first, then validation)
tokenized_train, tokenized_valid = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, valid_dataset)
)


Saving final_year_project.csv to final_year_project (2).csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/899 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Step 6: Define training arguments
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    # Schedule and optimisation settings
    evaluation_strategy="epoch",
    num_train_epochs=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Nothing is pushed to the Hub from the trainer itself
    push_to_hub=False,
)

# Step 7: Set up the Trainer with the model, tokenizer, arguments and both splits
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    eval_dataset=tokenized_valid,
    train_dataset=tokenized_train,
)

# Step 8: Fine-tune the model
trainer.train()

# Step 9: Save the trained model and its tokenizer into one directory
save_dir = "t5_question_generation_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print("Model training complete and saved successfully!")


  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mtimoshr[0m ([33mkarunyainstitute[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,2.2875,1.883738
2,2.0884,1.752385
3,1.7981,1.678031
4,1.8083,1.645002
5,1.8266,1.614032
6,1.7421,1.599595


Epoch,Training Loss,Validation Loss
1,2.2875,1.883738
2,2.0884,1.752385
3,1.7981,1.678031
4,1.8083,1.645002
5,1.8266,1.614032
6,1.7421,1.599595
7,1.6548,1.590675
8,1.6625,1.587175


Model training complete and saved successfully!


In [None]:
!pip install -q rouge-score nltk sacrebleu
!pip install -q evaluate
import torch
import nltk
import evaluate
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Download NLTK tokenizer data
nltk.download('punkt')

# Reload the fine-tuned model and its tokenizer from the directory saved after training
model_path = "t5_question_generation_model"
tokenizer = T5Tokenizer.from_pretrained(
    model_path
)
model = T5ForConditionalGeneration.from_pretrained(
    model_path
)

# Evaluation data: prompts and reference questions taken from the validation split
valid_texts, valid_labels = (
    valid_dataset[column] for column in ('input_text', 'target_text')
)

# Define evaluation functions
def generate_questions(texts, model, tokenizer, max_length=128, batch_size=32):
    """Generate one question per input text, processing the texts in chunks.

    Args:
        texts: Sequence of already-prefixed input strings.
        model: Model exposing ``eval()``, ``device`` and ``generate(...)``.
        tokenizer: Tokenizer used to encode the texts and decode the outputs.
        max_length: Maximum length passed to ``model.generate``.
        batch_size: Number of texts tokenized and generated per step. Chunking keeps
            memory bounded instead of pushing the whole dataset through at once.

    Returns:
        List of decoded strings in the same order as ``texts`` (empty for empty input).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = list(texts)
    model.eval()

    generated_questions = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            # Pad only to the longest text of this chunk; truncate at 512 tokens.
            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
            outputs = model.generate(
                input_ids=inputs.input_ids.to(model.device),
                attention_mask=inputs.attention_mask.to(model.device),
                max_length=max_length
            )
            generated_questions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

    return generated_questions

# Generate predictions for every validation prompt
generated_questions = generate_questions(valid_texts, model, tokenizer)

# Load both metrics first, then score
rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")

# ROUGE: predictions against the reference questions
rouge_scores = rouge.compute(predictions=generated_questions, references=valid_labels)

# BLEU: each prediction's single reference is wrapped in its own list
wrapped_references = [[reference] for reference in valid_labels]
bleu_score = bleu.compute(predictions=generated_questions, references=wrapped_references)

# Print evaluation results
print("\n📊 **Model Evaluation Results**:")
print(f"🔹 ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"🔹 ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"🔹 ROUGE-L: {rouge_scores['rougeL']:.4f}")
print(f"🔹 BLEU Score: {bleu_score['score']:.4f}")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]


📊 **Model Evaluation Results**:
🔹 ROUGE-1: 0.3936
🔹 ROUGE-2: 0.2145
🔹 ROUGE-L: 0.3640
🔹 BLEU Score: 15.5995


In [None]:
def generate_questions(text, num_questions=1):
    """Sample candidate questions for a single passage.

    Uses the notebook-level ``model`` and ``tokenizer``. Note that this definition
    replaces any earlier ``generate_questions`` defined in the same session.

    Args:
        text: Source passage. The "generate question: " task prefix is added here,
            so callers pass the raw text.
        num_questions: Number of sampled sequences to return.

    Returns:
        A list with ``num_questions`` decoded strings. Sampling is random, so
        repeated calls can return different questions.
    """
    # Do not rely on whatever mode the model was left in: generation must run
    # with dropout disabled.
    model.eval()

    # Same 512-token limit as the training inputs; longer passages are truncated.
    encoded = tokenizer(
        "generate question: " + text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )

    output_ids = model.generate(
        input_ids=encoded.input_ids.to(model.device),
        attention_mask=encoded.attention_mask.to(model.device),
        max_length=128,
        num_return_sequences=num_questions,  # Generates multiple questions
        do_sample=True,  # Enables randomness for diverse questions
        top_k=50,  # Limits to top-k probable words
        top_p=0.95,  # Nucleus sampling
    )

    return [tokenizer.decode(seq, skip_special_tokens=True) for seq in output_ids]

# Example Usage: sample three questions for one sentence and print the list
sample_text = "As data collection increases, governments are enacting laws to protect user privacy, but challenges exist in enforcement and ethical data usage."
sample_questions = generate_questions(sample_text, num_questions=3)
print(sample_questions)

['Assess the importance of ethical data practices in protecting user privacy.', 'Evaluate the effectiveness of legislation in the enforcement of ethical data protection measures.', 'Evaluate the effectiveness of cybersecurity legislation in assessing user privacy.']


In [None]:
!pip install -q huggingface_hub
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `NLP_10` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `NLP_10`


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# NOTE: from here on `model_name` holds a local directory name, not a Hub checkpoint id;
# the upload cell below passes it as `folder_path`.
model_name = "t5_question_generation_model"

# Save model and tokenizer into that directory.
# The last call's return value is what the cell displays as its output.
model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)


('t5_question_generation_model/tokenizer_config.json',
 't5_question_generation_model/special_tokens_map.json',
 't5_question_generation_model/spiece.model',
 't5_question_generation_model/added_tokens.json')

In [None]:
from huggingface_hub import notebook_login, HfApi

# Login (if not already done)
notebook_login()

# Name of the model repository on the Hub. The owning account/namespace is the part
# before the slash in `repo_id` - change it there when uploading under another user.
repo_name = "nlp_10"
repo_id = f"Timosh-nlp/{repo_name}"

# Upload model
from huggingface_hub import create_repo, upload_folder

# upload_folder() pushes into an existing repository, so make sure it exists first;
# exist_ok=True turns this into a no-op when the repository is already there.
create_repo(repo_id, repo_type="model", exist_ok=True)
upload_folder(
    folder_path=model_name,
    repo_id=repo_id,
    repo_type="model"
)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Timosh-nlp/nlp_10/commit/a32d791c8c9a1a6244f3624d889ec0d28f22968a', commit_message='Upload folder using huggingface_hub', commit_description='', oid='a32d791c8c9a1a6244f3624d889ec0d28f22968a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Timosh-nlp/nlp_10', endpoint='https://huggingface.co', repo_type='model', repo_id='Timosh-nlp/nlp_10'), pr_revision=None, pr_num=None)