In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("praneshmukhopadhyay/amazon-questionanswer-dataset")

print("Path to dataset files:", path)


Downloading from https://www.kaggle.com/api/v1/datasets/download/praneshmukhopadhyay/amazon-questionanswer-dataset?dataset_version_number=1...


100%|██████████| 426M/426M [00:07<00:00, 57.2MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/praneshmukhopadhyay/amazon-questionanswer-dataset/versions/1


In [2]:
import pandas as pd

# Load the datasets
multi_questions = pd.read_csv(f"{path}/multi_questions.csv")
multi_answers = pd.read_csv(f"{path}/multi_answers.csv")
single_qna = pd.read_csv(f"{path}/single_qna.csv")
# Merge multi_questions and multi_answers on QuestionID
merged_multi = pd.merge(multi_questions, multi_answers, on="QuestionID", how="inner")

# Keep only relevant columns
merged_multi = merged_multi[["QuestionText", "AnswerText", "Category"]]
single_qna = single_qna[["Question", "Answer", "Category"]]

# Standardize column names
merged_multi.columns = ["question", "answer", "category"]
single_qna.columns = ["question", "answer", "category"]

# Combine both datasets
combined_dataset = pd.concat([merged_multi, single_qna], ignore_index=True)

# Drop missing or invalid entries (optional)
combined_dataset = combined_dataset.dropna()

# Display a sample of the dataset
print("Combined Dataset Sample:")
print(combined_dataset.head(3))


Combined Dataset Sample:
                         question  \
0  will they fit 2013 f350 dually   
1  will they fit 2013 f350 dually   
2  will they fit 2013 f350 dually   

                                              answer    category  
0  It's all custom mounting, where there's a will...  Automotive  
1  You will need to drill another hole in Mud fla...  Automotive  
2  It's been a while since I installed them, but ...  Automotive  


In [3]:
qa_data = combined_dataset[["question", "answer"]]


In [4]:
print(combined_dataset["question"])

0                             will they fit 2013 f350 dually
1                             will they fit 2013 f350 dually
2                             will they fit 2013 f350 dually
3                             will they fit 2013 f350 dually
4                             will they fit 2013 f350 dually
                                 ...                        
5416635    Does the adaptor cord for the iPhone 5 work wi...
5416636                         will it charge a kidle fire?
5416637             What are the dimensions of this product?
5416638    Does this have connector for 5C? I think 5C is...
5416639    so I just bought the Gembonics Battery Back up...
Name: question, Length: 5416319, dtype: object


In [5]:
# Prepare dataset for T5-small
t5_data = {
    "input_text": [],
    "output_text": []
}

# Limit to the first 20000 rows
limit = 40000
count = 0

for index, row in combined_dataset.iterrows():
    if count >= limit:
        break
    question = row["question"]
    context = row["answer"]
    answer = row["answer"]

    input_text = f"question: {question} context: {context}"
    output_text = answer

    t5_data["input_text"].append(input_text)
    t5_data["output_text"].append(output_text)
    count += 1

# Convert to pandas DataFrame (for Hugging Face Trainer)
t5_dataset = pd.DataFrame(t5_data)
print("Prepared Dataset for T5-small (First 1000 rows):")
print(t5_dataset.head())

# Save to CSV if needed
t5_dataset.to_csv("prepared_data.csv", index=False)
print(len(t5_dataset))

Prepared Dataset for T5-small (First 1000 rows):
                                          input_text  \
0  question: will they fit 2013 f350 dually conte...   
1  question: will they fit 2013 f350 dually conte...   
2  question: will they fit 2013 f350 dually conte...   
3  question: will they fit 2013 f350 dually conte...   
4  question: will they fit 2013 f350 dually conte...   

                                         output_text  
0  It's all custom mounting, where there's a will...  
1  You will need to drill another hole in Mud fla...  
2  It's been a while since I installed them, but ...  
3           1 pair rear flaps and mounting hardware.  
4  I didn't buy these for myself I bought them fo...  
40000


In [6]:
! pip install transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [7]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Load the dataset
df = pd.read_csv("prepared_data.csv")

# Convert the dataframe to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Load the tokenizer for T5-small
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Tokenize the dataset
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["input_text"], max_length=512, padding="max_length", truncation=True
    )
    labels = tokenizer(
        examples["output_text"], max_length=128, padding="max_length", truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Split into train and validation sets
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

# Inspect dataset
print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))
print("Sample tokenized input:", train_dataset[0]["input_ids"][:10])
print("Sample tokenized output:", train_dataset[0]["labels"][:10])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Train dataset size: 32000
Validation dataset size: 8000
Sample tokenized input: [822, 10, 156, 27, 278, 31, 17, 43, 3, 10891]
Sample tokenized output: [2163, 6, 182, 1, 0, 0, 0, 0, 0, 0]


In [8]:
# Load the pre-trained T5-small model
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer # Import necessary classes

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")




# Define training arguments
training_args = TrainingArguments(
    output_dir="./t5_finetuned_final",  # Fine-tuned model will be saved here
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",  # Directory for logs
    logging_steps=100,  # Log every 100 steps
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    push_to_hub=False  # Set to True if you want to push the model to Hugging Face Hub
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0039,0.001661
2,0.002,0.001347
3,0.0012,0.001329


TrainOutput(global_step=12000, training_loss=0.02143858535401523, metrics={'train_runtime': 4913.2386, 'train_samples_per_second': 19.539, 'train_steps_per_second': 2.442, 'total_flos': 1.2992812941312e+16, 'train_loss': 0.02143858535401523, 'epoch': 3.0})

In [9]:
# Save the model
model.save_pretrained("./t5_finetuned_final_2")
tokenizer.save_pretrained("./t5_finetuned_final_2")


('./t5_finetuned_final_2/tokenizer_config.json',
 './t5_finetuned_final_2/special_tokens_map.json',
 './t5_finetuned_final_2/spiece.model',
 './t5_finetuned_final_2/added_tokens.json',
 './t5_finetuned_final_2/tokenizer.json')

In [10]:
# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)


Evaluation Results: {'eval_loss': 0.0013291728682816029, 'eval_runtime': 122.2838, 'eval_samples_per_second': 65.422, 'eval_steps_per_second': 8.178, 'epoch': 3.0}


In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned model and tokenizer
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained("./t5_finetuned_final_2")
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./t5_finetuned_final_2")

# Define the question and context
question = "will they fit 2013 f350 dually"
context = "You will need to drill another hole in Mud flap & bed frame for added stability. the added bolt & nut makes it a tighter fit, you only get two existing holes on the bed frame with plastic mud flaps when you bought the truck. These Dee Zee flaps are much heavier than the plastic/rubber flaps originally provided on truck."

# Format the input (with context)
input_text = f"question: {question} context: {context}"

# Tokenize the input
inputs = fine_tuned_tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

# Generate the answer with adjusted max_length
outputs = fine_tuned_model.generate(
    inputs.input_ids,
    max_length=150,  # Increase the max length to allow for a longer output
    num_beams=5,     # Use beam search for better quality answers
    no_repeat_ngram_size=2,  # Prevent repetition
    early_stopping=True   # Stop generation once an answer is complete
)

# Decode the output
answer = fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Answer:", answer)


Generated Answer: You will need to drill another hole in Mud flap & bed frame for added stability. the added bolt  and nut makes it a tighter fit, you only get two existing holes on the bed frames with plastic mud flaps when you bought the truck. These Dee Zee flap is much heavier than the plastic/rubber flap


In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned model and tokenizer
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained("./t5_finetuned_final_2")
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./t5_finetuned_final_2")

# Define the question and context
question = "Will this sofa fit in a space of 80 inches?"
context = "The dimensions of the sofa are 78 inches in width, 35 inches in depth, and 34 inches in height. It is designed to fit in compact living spaces."

# Format the input (with context)
input_text = f"question: {question} context: {context}"

# Tokenize the input
inputs = fine_tuned_tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

# Generate the answer with adjusted max_length
outputs = fine_tuned_model.generate(
    inputs.input_ids,
    max_length=150,  # Increase the max length to allow for a longer output
    num_beams=5,     # Use beam search for better quality answers
    no_repeat_ngram_size=2,  # Prevent repetition
    early_stopping=True   # Stop generation once an answer is complete
)

# Decode the output
answer = fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Answer:", answer)


Generated Answer: It is designed to fit in compact living spaces.


In [None]:
print(df.head(1))

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Path where you want to save the model and tokenizer
model_save_path = "./t5_finetuned_model"

# Save the model
fine_tuned_model.save_pretrained(model_save_path)

# Save the tokenizer
fine_tuned_tokenizer.save_pretrained(model_save_path)

print("Model and tokenizer saved!")


In [None]:
import shutil

model_save_path = "./t5_finetuned_final_2"
# Zip the directory
shutil.make_archive("t5_finetuned_final_@", 'zip', model_save_path)

print("Model zipped!")


In [None]:
from google.colab import files

# Download the zip file
files.download("t5_finetuned_final.zip")
