<a href="https://colab.research.google.com/github/Mars0827/bitwiseui/blob/calculator/Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
!pip install transformers datasets torch scikit-learn tqdm



In [20]:
data = [
    {
        "input": "A + AB",
        "output": [
            {"step": "A + AB → A(1 + B)", "rule": "Distributive Law"},
            {"step": "A(1 + B) → A", "rule": "Identity Law"}
        ]
    },
    {
        "input": "AB + A'B",
        "output": [
            {"step": "AB + A'B → (A + A')B", "rule": "Distributive Law"},
            {"step": "(A + A')B → 1·B", "rule": "Complement Law"},
            {"step": "1·B → B", "rule": "Identity Law"}
        ]
    },
    {
        "input": "(A + B)(A + C)",
        "output": [
            {"step": "(A + B)(A + C) → A + BC", "rule": "Distributive Law"}
        ]
    },
    {
        "input": "A + A",
        "output": [
            {"step": "A + A → A", "rule": "Idempotent Law"}
        ]
    },
    {
        "input": "AA'",
        "output": [
            {"step": "AA' → 0", "rule": "Complement Law"}
        ]
    }
]


In [21]:
def format_output(steps):
    return "\n".join([f"Step {i+1}: {s['step']} [{s['rule']}]" for i, s in enumerate(steps)])

formatted_data = [
    {"input": f"simplify: {d['input']}", "output": format_output(d["output"])}
    for d in data
]

In [22]:
from datasets import Dataset

dataset = Dataset.from_list(formatted_data)
dataset

Dataset({
    features: ['input', 'output'],
    num_rows: 5
})

In [23]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')

def preprocess(example):
  model_inputs = tokenizer(example["input"], truncation=True, padding="max_length", max_length=64)
  labels = tokenizer(example["output"], truncation=True, padding="max_length", max_length=128)
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

tokenized_datasets = dataset.map(preprocess)
tokenized_datasets


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5
})

In [24]:
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [25]:
from transformers import TrainingArguments, Trainer
import torch

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=20,
    logging_steps=1,
    save_strategy="no",
    fp16=torch.cuda.is_available(),
    report_to="none"
)

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets
)

trainer.train()

Step,Training Loss
1,4.7323
2,2.4452
3,3.2654
4,2.3479
5,1.8081
6,2.4989
7,1.4548
8,2.1154
9,2.5652
10,2.2338


TrainOutput(global_step=60, training_loss=1.9458552787701289, metrics={'train_runtime': 8.6232, 'train_samples_per_second': 11.597, 'train_steps_per_second': 6.958, 'total_flos': 1691772518400.0, 'train_loss': 1.9458552787701289, 'epoch': 20.0})

In [30]:
def simplify(expr):
    input_text = f"simplify: {expr}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids

    if torch.cuda.is_available():
        input_ids = input_ids.cuda()
        model.cuda()

    output_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return output


In [31]:
print(simplify("A + AB"))
print(simplify("AB + A'B"))
print(simplify("AA'"))
print(simplify("(A + B)(A + C)"))

B A + B
AB + A'B AB + A'B: AB + A'B : AB + A'B AB + A'B: AB + A'B: AB + AB + A
, Si Simplimpl
Die  (A + B) (A + B)(A + C)
