In [None]:
import kagglehub

# Fetch the latest published version of the Amazon Q&A dataset via kagglehub
# and keep the value it returns (printed below as the path to the dataset files).
dataset_handle = "praneshmukhopadhyay/amazon-questionanswer-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)


Downloading from https://www.kaggle.com/api/v1/datasets/download/praneshmukhopadhyay/amazon-questionanswer-dataset?dataset_version_number=1...


100%|██████████| 426M/426M [00:04<00:00, 92.6MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/praneshmukhopadhyay/amazon-questionanswer-dataset/versions/1


In [None]:
import pandas as pd


# Read the three CSV files found under the downloaded dataset directory.
multi_questions = pd.read_csv(f"{path}/multi_questions.csv")
multi_answers = pd.read_csv(f"{path}/multi_answers.csv")
single_qna = pd.read_csv(f"{path}/single_qna.csv")

# Pair each question with its answers (inner join on QuestionID), then keep
# only the text and category columns under unified lowercase names.
merged_multi = (
    multi_questions
    .merge(multi_answers, on="QuestionID", how="inner")
    .loc[:, ["QuestionText", "AnswerText", "Category"]]
    .rename(columns={
        "QuestionText": "question",
        "AnswerText": "answer",
        "Category": "category",
    })
)

# Bring the single Q&A table to the same three-column layout.
single_qna = (
    single_qna
    .loc[:, ["Question", "Answer", "Category"]]
    .rename(columns={
        "Question": "question",
        "Answer": "answer",
        "Category": "category",
    })
)

# Stack both sources into one frame and discard rows with any missing value.
# The index is not reset after dropna, so it keeps gaps where rows were removed.
combined_dataset = pd.concat([merged_multi, single_qna], ignore_index=True).dropna()

print("Combined Dataset Sample:")
print(combined_dataset.head(3))


Combined Dataset Sample:
                         question  \
0  will they fit 2013 f350 dually   
1  will they fit 2013 f350 dually   
2  will they fit 2013 f350 dually   

                                              answer    category  
0  It's all custom mounting, where there's a will...  Automotive  
1  You will need to drill another hole in Mud fla...  Automotive  
2  It's been a while since I installed them, but ...  Automotive  


In [None]:
# Question/answer view of the combined data, without the category column
qa_data = combined_dataset.loc[:, ["question", "answer"]]


In [None]:
# Show the question column (pandas abbreviates the middle of long Series)
print(combined_dataset.loc[:, "question"])

0                             will they fit 2013 f350 dually
1                             will they fit 2013 f350 dually
2                             will they fit 2013 f350 dually
3                             will they fit 2013 f350 dually
4                             will they fit 2013 f350 dually
                                 ...                        
5416635    Does the adaptor cord for the iPhone 5 work wi...
5416636                         will it charge a kidle fire?
5416637             What are the dimensions of this product?
5416638    Does this have connector for 5C? I think 5C is...
5416639    so I just bought the Gembonics Battery Back up...
Name: question, Length: 5416319, dtype: object


In [None]:
# Build the (input_text, output_text) pairs used to fine-tune T5-small.
# NOTE(review): the "context" slot is filled with the answer itself, so every
# input already contains its own target text -- confirm this is intended.

# Limit to the first 1000 rows
limit = 1000
first_rows = combined_dataset.head(limit)
qa_pairs = list(zip(first_rows["question"], first_rows["answer"]))

t5_data = {
    "input_text": [
        f"question: {question} context: {answer}" for question, answer in qa_pairs
    ],
    "output_text": [answer for _, answer in qa_pairs],
}

# Tabular form of the pairs; it is written to CSV below
t5_dataset = pd.DataFrame(t5_data)
print("Prepared Dataset for T5-small (First 1000 rows):")
print(t5_dataset.head())


t5_dataset.to_csv("prepared_data.csv", index=False)


Prepared Dataset for T5-small (First 1000 rows):
                                          input_text  \
0  question: will they fit 2013 f350 dually conte...   
1  question: will they fit 2013 f350 dually conte...   
2  question: will they fit 2013 f350 dually conte...   
3  question: will they fit 2013 f350 dually conte...   
4  question: will they fit 2013 f350 dually conte...   

                                         output_text  
0  It's all custom mounting, where there's a will...  
1  You will need to drill another hole in Mud fla...  
2  It's been a while since I installed them, but ...  
3           1 pair rear flaps and mounting hardware.  
4  I didn't buy these for myself I bought them fo...  


In [None]:
! pip install transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m480.6/480.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer


# Load the prepared pairs back from disk and wrap them in a Hugging Face Dataset
df = pd.read_csv("prepared_data.csv")


dataset = Dataset.from_pandas(df)


tokenizer = AutoTokenizer.from_pretrained("t5-small")


def preprocess_function(examples):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Args:
        examples: Batch mapping with "input_text" and "output_text" lists
            (the function is applied with ``batched=True``).

    Returns:
        The tokenizer output for the inputs (padded/truncated to 512 tokens)
        with an added "labels" entry: the target token ids padded/truncated
        to 128 tokens, where every padding position is replaced by -100.
    """
    model_inputs = tokenizer(
        examples["input_text"], max_length=512, padding="max_length", truncation=True
    )
    labels = tokenizer(
        examples["output_text"], max_length=128, padding="max_length", truncation=True
    )
    # Label positions set to -100 are ignored by the model's loss. Leaving the
    # pad id in place would train (and score) the model on predicting padding.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs


tokenized_dataset = dataset.map(preprocess_function, batched=True)


# Fixed seed so the 80/20 train/validation split is the same on every run
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]


print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))
print("Sample tokenized input:", train_dataset[0]["input_ids"][:10])
print("Sample tokenized output:", train_dataset[0]["labels"][:10])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Train dataset size: 800
Validation dataset size: 200
Sample tokenized input: [822, 10, 84, 3, 52, 210, 52, 52, 1109, 152]
Sample tokenized output: [2841, 5, 572, 103, 25, 987, 58, 1, 0, 0]


In [None]:

from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer # Import necessary classes

# Start from the pretrained t5-small weights
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")


# Define training arguments
training_args = TrainingArguments(
    output_dir="./t5_finetuned_final",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    push_to_hub=False,
    # Turn off experiment-tracker integrations: with the default setting the
    # run stops at an interactive wandb API-key prompt, which blocks an
    # unattended "Restart & Run All".
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)


trainer.train()

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,2.1819,0.157058
2,0.1613,0.058339
3,0.0933,0.03131


TrainOutput(global_step=300, training_loss=0.8121694882710775, metrics={'train_runtime': 8409.439, 'train_samples_per_second': 0.285, 'train_steps_per_second': 0.036, 'total_flos': 324820323532800.0, 'train_loss': 0.8121694882710775, 'epoch': 3.0})

In [None]:
# Write the fine-tuned weights and the tokenizer files into the same folder
export_dir = "./t5_finetuned_final"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


('./t5_finetuned_final/tokenizer_config.json',
 './t5_finetuned_final/special_tokens_map.json',
 './t5_finetuned_final/spiece.model',
 './t5_finetuned_final/added_tokens.json',
 './t5_finetuned_final/tokenizer.json')

In [None]:
# Evaluate the model
# Calls trainer.evaluate() (the Trainer was constructed with val_dataset as its
# eval_dataset) and prints the metrics it returns.
results = trainer.evaluate()
print("Evaluation Results:", results)


Evaluation Results: {'eval_loss': 0.031310372054576874, 'eval_runtime': 174.1713, 'eval_samples_per_second': 1.148, 'eval_steps_per_second': 0.144, 'epoch': 3.0}


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Reload the model and tokenizer from the folder they were saved to
checkpoint_dir = "./t5_finetuned_final"
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

# Example question/context pair
question = "will they fit 2013 f350 dually"
context = "You will need to drill another hole in Mud flap & bed frame for added stability. the added bolt & nut makes it a tighter fit, you only get two existing holes on the bed frame with plastic mud flaps when you bought the truck. These Dee Zee flaps are much heavier than the plastic/rubber flaps originally provided on truck."

# Same prompt layout as the input_text column built for training
input_text = f"question: {question} context: {context}"
inputs = fine_tuned_tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    padding=True,
)

# Beam-search decoding settings
generation_kwargs = {
    "max_length": 150,
    "num_beams": 5,
    "no_repeat_ngram_size": 2,
    "early_stopping": True,
}
outputs = fine_tuned_model.generate(inputs.input_ids, **generation_kwargs)

# Decode the first returned sequence, dropping special tokens
answer = fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Answer:", answer)


Generated Answer: You will need to drill another hole in Mud flap & bed frame for added stability


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Load the saved model/tokenizer pair again for a second example
fine_tuned_model, fine_tuned_tokenizer = (
    AutoModelForSeq2SeqLM.from_pretrained("./t5_finetuned_final"),
    AutoTokenizer.from_pretrained("./t5_finetuned_final"),
)

# A question/context pair about a different kind of product
question = "Will this sofa fit in a space of 80 inches?"
context = "The dimensions of the sofa are 78 inches in width, 35 inches in depth, and 34 inches in height. It is designed to fit in compact living spaces."

# Assemble the prompt and convert it to PyTorch tensors
input_text = f"question: {question} context: {context}"
inputs = fine_tuned_tokenizer(
    input_text, truncation=True, padding=True, return_tensors="pt"
)

# Beam search (5 beams), no repeated bigrams, at most 150 tokens
outputs = fine_tuned_model.generate(
    inputs.input_ids,
    early_stopping=True,
    no_repeat_ngram_size=2,
    num_beams=5,
    max_length=150,
)

# Turn the first generated sequence back into text without special tokens
first_sequence = outputs[0]
answer = fine_tuned_tokenizer.decode(first_sequence, skip_special_tokens=True)

print("Generated Answer:", answer)


Generated Answer: It is designed to fit in compact living spaces.


In [None]:
# Peek at the first prepared example
print(df.iloc[:1])

                                          input_text  \
0  question: will they fit 2013 f350 dually conte...   

                                         output_text  
0  It's all custom mounting, where there's a will...  


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


model_save_path = "./t5_finetuned_model"

# Save the reloaded model first, then its tokenizer, into model_save_path
for artifact in (fine_tuned_model, fine_tuned_tokenizer):
    artifact.save_pretrained(model_save_path)

print("Model and tokenizer saved!")


Model and tokenizer saved!


In [None]:
import shutil

# Folder to compress.
# NOTE(review): this is also the Trainer output_dir, so any checkpoint
# sub-folders saved there would be zipped as well -- confirm that is wanted.
model_save_path = "./t5_finetuned_final"

# Creates t5_finetuned_final.zip from the folder's contents
shutil.make_archive(
    base_name="t5_finetuned_final",
    format="zip",
    root_dir=model_save_path,
)

print("Model zipped!")


Model zipped!


In [None]:
from google.colab import files


# Hand the zipped model to the browser as a download (google.colab helper)
archive_file = "t5_finetuned_final.zip"
files.download(archive_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>