In [None]:
!pip install datasets transformers evaluate



In [None]:
import pandas as pd

# Loading CSV file, handling unterminated strings
data = pd.read_csv('/content/questionsdf.csv', quoting=3, on_bad_lines='skip')

# Preview the dataset
print(data.head())

                              Entity  \
0  Non-alcoholic Fatty Liver Disease   
1  Non-alcoholic Fatty Liver Disease   
2  Non-alcoholic Fatty Liver Disease   
3  Non-alcoholic Fatty Liver Disease   
4  Non-alcoholic Fatty Liver Disease   

                                            Question  \
0         What is Non-alcoholic Fatty Liver Disease?   
1         Explain Non-alcoholic Fatty Liver Disease.   
2  What does Non-alcoholic Fatty Liver Disease mean?   
3  Can you explain Non-alcoholic Fatty Liver Dise...   
4  Give a definition for Non-alcoholic Fatty Live...   

                                              Answer  
0  A term referring to fatty replacement of the h...  
1  A term referring to fatty replacement of the h...  
2  A term referring to fatty replacement of the h...  
3  A term referring to fatty replacement of the h...  
4  A term referring to fatty replacement of the h...  


In [None]:
from datasets import Dataset

# Convert the dataframe into Hugging Face Dataset
qa_dataset = Dataset.from_pandas(data)

# Preview dataset
print(qa_dataset)


Dataset({
    features: ['Entity', 'Question', 'Answer'],
    num_rows: 84320
})


In [None]:
from sklearn.model_selection import train_test_split

# Split into training and testing
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load tokenizer and model
model_name = "t5-small"  # Change to your desired model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [None]:
def preprocess_function(example):
    inputs = tokenizer(
        example["Question"], max_length=512, truncation=True, padding="max_length"
    )
    outputs = tokenizer(
        example["Answer"], max_length=512, truncation=True, padding="max_length"
    )
    inputs["labels"] = outputs["input_ids"]
    return inputs

# Apply preprocessing
train_tokenized = train_dataset.map(preprocess_function, batched=True)
test_tokenized = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/67456 [00:00<?, ? examples/s]

Map:   0%|          | 0/16864 [00:00<?, ? examples/s]

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
)




In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.0726,0.055206
2,0.0516,0.039331


Buffered data was truncated after reaching the output size limit.

In [None]:
from transformers import AutoModelForSeq2SeqLM

latest_checkpoint = "./results/checkpoint-41500"  # Path to the latest checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(latest_checkpoint)

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # Same as before
    per_device_train_batch_size=16,  # Adjust if needed
    num_train_epochs=3,  # Total number of epochs (not the remaining epochs)
    learning_rate=5e-5,  # Same learning rate
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",  # Optional logging directory
    logging_steps=500,
    save_total_limit=2,  # Keep only the last 2 checkpoints
)




## Here a code chunk is missing where I trained the last epoch using the checkpoint 41500.

In [None]:
import os

checkpoints = [d for d in os.listdir("./results") if d.startswith("checkpoint")]
checkpoints.sort(key=lambda x: int(x.split('-')[-1]))  # Sort by step number
print("Available checkpoints:", checkpoints)


Available checkpoints: ['checkpoint-41500', 'checkpoint-50592']


In [None]:
latest_checkpoint = "./results/checkpoint-50592"
print(f"Using latest checkpoint: {latest_checkpoint}")


Using latest checkpoint: ./results/checkpoint-50592


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained(latest_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(latest_checkpoint)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=16,  # Set batch size for prediction
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
)


In [None]:
import torch

# Force model and trainer to use CPU
trainer.model.to("cpu")

# Move dataset to CPU
def move_to_cpu(example):
    return {key: torch.tensor(value).to("cpu") if isinstance(value, list) else value for key, value in example.items()}

test_tokenized = test_tokenized.map(move_to_cpu)


Map:   0%|          | 0/16864 [00:00<?, ? examples/s]

## I had to move to CPU as GPU memory was full and the sessions were crashing continously

In [None]:
batch_size = 8
num_batches = len(test_tokenized) // batch_size + int(len(test_tokenized) % batch_size != 0)

predicted_answers = []
true_answers = []

for i in range(num_batches):
    print(f"Processing batch {i+1}/{num_batches}...")
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, len(test_tokenized))

    # Select batch
    batch_data = test_tokenized.select(range(start_idx, end_idx))
    input_ids = torch.tensor(batch_data["input_ids"]).to("cpu")
    attention_mask = torch.tensor(batch_data["attention_mask"]).to("cpu")

    # Generate predictions using the model
    with torch.no_grad():
        outputs = trainer.model.generate(input_ids=input_ids, attention_mask=attention_mask)

    # Decode predictions and append
    predicted_answers.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    true_answers.extend(test_dataset["Answer"][start_idx:end_idx])  # Adjust if test_dataset is a DataFrame


Processing batch 1/2108...




Processing batch 2/2108...
Processing batch 3/2108...
Processing batch 4/2108...
Processing batch 5/2108...
Processing batch 6/2108...
Processing batch 7/2108...
Processing batch 8/2108...
Processing batch 9/2108...
Processing batch 10/2108...
Processing batch 11/2108...
Processing batch 12/2108...
Processing batch 13/2108...
Processing batch 14/2108...
Processing batch 15/2108...
Processing batch 16/2108...
Processing batch 17/2108...
Processing batch 18/2108...
Processing batch 19/2108...
Processing batch 20/2108...
Processing batch 21/2108...
Processing batch 22/2108...
Processing batch 23/2108...
Processing batch 24/2108...
Processing batch 25/2108...
Processing batch 26/2108...
Processing batch 27/2108...
Processing batch 28/2108...
Processing batch 29/2108...
Processing batch 30/2108...
Processing batch 31/2108...
Processing batch 32/2108...
Processing batch 33/2108...
Processing batch 34/2108...
Processing batch 35/2108...
Processing batch 36/2108...
Processing batch 37/2108...


In [None]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/spiece.model',
 './fine_tuned_model/added_tokens.json')

In [None]:
!pip install streamlit transformers


Collecting streamlit
  Downloading streamlit-1.41.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading streamlit-1.41.0-py2.py3-none-any.whl (23.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.4/23.4 M

In [None]:
%%writefile app.py
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Load the fine-tuned QA model and tokenizer
model_path = "./fine_tuned_model"  # Replace with your model directory
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Initialize the QA pipeline
qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Streamlit app
st.title("Biomedical QA Model")
st.write("Enter a question about biomedical entities, and the model will answer based on its knowledge.")

# User input
question = st.text_input("Ask a question:")

# Generate the answer
if question:
    response = qa_pipeline(question, max_length=512, num_return_sequences=1)
    answer = response[0]["generated_text"]

    # Display the answer
    st.write("Answer:", answer)


Writing app.py


In [None]:
!streamlit run app.py &>/content/logs.txt &


In [None]:
!npm install -g localtunnel
!npx localtunnel --port 8501


[K[?25h
added 22 packages, and audited 23 packages in 2s

3 packages are looking for funding
  run `npm fund` for details

1 [33m[1mmoderate[22m[39m severity vulnerability

To address all issues (including breaking changes), run:
  npm audit fix --force

Run `npm audit` for details.
[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m 
[0m[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m New [31mmajor[39m version of npm available! [31m8.19.4[39m -> [32m10.9.2[39m
[0m[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m Changelog: [36mhttps://github.com/npm/cli/releases/tag/v10.9.2[39m
[0m[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m Run [32mnpm install -g npm@10.9.2[39m to update!
[0m[37;40mnpm[0m [0m[36;40mnotice[0m[35m[0m 
[0myour url is: https://blue-boxes-march.loca.lt
^C


Metrics to evaluate

In [None]:
def exact_match(predictions, true_answers):
    matches = [1 if pred.strip() == true.strip() else 0 for pred, true in zip(predictions, true_answers)]
    return sum(matches) / len(matches) * 100

# Compute EM
em_score = exact_match(predicted_answers, true_answers)
print(f"Exact Match Score: {em_score:.2f}%")


Exact Match Score: 29.61%


In [None]:
from sklearn.metrics import f1_score
from collections import Counter

def compute_f1(predictions, true_answers):
    def get_tokens(text):
        return Counter(text.split())

    f1_scores = []
    for pred, true in zip(predictions, true_answers):
        pred_tokens = get_tokens(pred)
        true_tokens = get_tokens(true)
        common = pred_tokens & true_tokens
        num_same = sum(common.values())

        if num_same == 0:
            f1_scores.append(0)
        else:
            precision = num_same / sum(pred_tokens.values())
            recall = num_same / sum(true_tokens.values())
            f1_scores.append(2 * precision * recall / (precision + recall))

    return sum(f1_scores) / len(f1_scores) * 100

# Compute F1
f1_score = compute_f1(predicted_answers, true_answers)
print(f"F1 Score: {f1_score:.2f}")


F1 Score: 52.57


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def compute_bleu(predictions, true_answers):
    smoothing_fn = SmoothingFunction().method4  # Smooth for short sentences
    bleu_scores = [sentence_bleu([true.split()], pred.split(), smoothing_function=smoothing_fn) for pred, true in zip(predictions, true_answers)]
    return sum(bleu_scores) / len(bleu_scores) * 100

# Compute BLEU
bleu_score = compute_bleu(predicted_answers, true_answers)
print(f"BLEU Score: {bleu_score:.2f}")


BLEU Score: 40.70


In [None]:
pip install rouge



In [None]:
from rouge import Rouge

def compute_rouge(predictions, true_answers):
    rouge = Rouge()
    scores = rouge.get_scores(predictions, true_answers, avg=True)
    return scores

# Compute ROUGE
rouge_scores = compute_rouge(predicted_answers, true_answers)
print(f"ROUGE Scores: {rouge_scores}")


ROUGE Scores: {'rouge-1': {'r': 0.5175842381847008, 'p': 0.5991390809865483, 'f': 0.5417793390207815}, 'rouge-2': {'r': 0.43484453710122956, 'p': 0.49677263944790756, 'f': 0.4537840860773102}, 'rouge-l': {'r': 0.5109337250976455, 'p': 0.5902916130668928, 'f': 0.5345991962416808}}
