In [44]:
!pip install datasets



In [65]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

In [66]:
from datasets import Dataset
from torch.utils.data import DataLoader

In [67]:
qa_data = pd.read_csv('input_table.csv')

In [68]:
if 'Date' in qa_data.columns:
    qa_data['Date'] = pd.to_datetime(qa_data['Date'], errors='coerce')

In [69]:
train_data, val_data = train_test_split(qa_data, test_size=0.2, random_state=42)

In [70]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

In [71]:
def preprocess_data(examples):
    inputs = [
        f"Invoice {inv} in {city} for {product} of {qty} items, "
        f"total price {total}, paid via {payment} on {date}. Rating: {rating}"
        for inv, city, product, qty, total, payment, date, rating in zip(
            examples["Invoice ID"], examples["City"], examples["Product line"],
            examples["Quantity"], examples["Total"], examples["Payment"],
            examples["Date"], examples["Rating"]
        )
    ]
    targets = [str(total) for total in examples["Total"]]
    model_inputs = tokenizer(inputs, padding=True, truncation=True, max_length=256)
    labels = tokenizer(targets, padding=True, truncation=True, max_length=64).input_ids
    model_inputs["labels"] = labels
    return model_inputs

In [72]:
from torch.utils.data import DataLoader
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
train_dataset = train_dataset.map(preprocess_data, batched=True)
val_dataset = val_dataset.map(preprocess_data, batched=True)
train_dataloader = DataLoader(train_dataset.with_format("torch"), batch_size=20, shuffle=True)
val_dataloader = DataLoader(val_dataset.with_format("torch"), batch_size=20)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [73]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.gradient_checkpointing_enable()

In [88]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,  # Log training loss every 10 steps
    report_to="all",  # Logs training metrics (default: TensorBoard)
    fp16=True,
    optim="adamw_torch",
    num_train_epochs=5
)



In [89]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [90]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0826,0.114545
2,0.0764,0.117934
3,0.0844,0.11655
4,0.0713,0.115204


TrainOutput(global_step=60, training_loss=0.07560023764769236, metrics={'train_runtime': 56.6967, 'train_samples_per_second': 70.551, 'train_steps_per_second': 1.058, 'total_flos': 53967543336960.0, 'train_loss': 0.07560023764769236, 'epoch': 4.64})

In [91]:
model.save_pretrained("./t5_qa_model")
tokenizer.save_pretrained("./t5_qa_model")
print("Training complete! Model saved at ./t5_qa_model")

Training complete! Model saved at ./t5_qa_model


In [92]:
tokenizer = T5Tokenizer.from_pretrained("./t5_qa_model")
model = T5ForConditionalGeneration.from_pretrained("./t5_qa_model")

In [93]:
test_df = pd.read_excel("/content/QA_dataset_share+.xlsx")
test_questions = test_df["question"].tolist()

In [95]:
train_dataset = Dataset.from_pandas(val_data).map(preprocess_data, batched=True)
trainer.train()

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,0.0818,0.118705
2,0.0893,0.111522
3,0.0802,0.113631
4,0.0753,0.112597


TrainOutput(global_step=60, training_loss=0.07924808859825135, metrics={'train_runtime': 55.9225, 'train_samples_per_second': 71.528, 'train_steps_per_second': 1.073, 'total_flos': 53967543336960.0, 'train_loss': 0.07924808859825135, 'epoch': 4.64})

In [96]:
def answer_question(question):
    input_text = f"answer: {question}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    output_ids = model.generate(input_ids, max_length=50)
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return answer
for question in test_questions:
    print(f"Q: {question}")
    print(f"A: {answer_question(question)}\n")

Q: What product line is in the latest entry?
A: True: What product line is in the latest entry?

Q: On what date did the first transaction occur?
A: Répon:  quelle date a eu lieu la première transaction?

Q: What is the latest transaction date?
A: True: What is the latest transaction date?

Q: what is the max rating given in home and lifestyle?
A: : What is the max rating given in home and lifestyle?

Q: How many transactions involved Male customers and a rating of 9.1?
A: True

Q: What is the average unit price?
A: Répon: What is the average unit price?

Q: What is the average unit price for transactions in Yangon city with a rating of 9.1?
A: True

Q: What is the total gross income for transactions in Yangon city with a quantity of 7?
A: True

Q: What is the total gross income for transactions involving Health and beauty and a unit price of 36.26?
A: True

Q: How many transactions involved Cash payment method and a gross income of 3.82?
A: : Wie viele opérations implizierten Cash pay

KeyboardInterrupt: 