In [1]:
!pip install transformers[torch] datasets scikit-learn sacrebleu pandas

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.1/104.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (2

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [3]:
print("--- Phase 2: Preparing Data ---")
file_path = 'deformalization_dataset.csv'  # Make sure this matches the uploaded filename
try:
    df = pd.read_csv(file_path)

    # Rename columns for our task
    df.rename(columns={'input': 'input_text', 'output': 'target_text'}, inplace=True)
    df.dropna(inplace=True)

    # Split the data
    train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
    train_df, val_df = train_test_split(train_val_df, test_size=(1/9), random_state=42)

    # Convert to Hugging Face Dataset objects
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)
    test_dataset = Dataset.from_pandas(test_df)

    raw_datasets = DatasetDict({
        'train': train_dataset,
        'validation': val_dataset,
        'test': test_dataset
    })
    print("Data successfully loaded and split:")
    print(raw_datasets)
except FileNotFoundError:
    print(f"ERROR: Make sure you have uploaded '{file_path}' to the Colab session!")
except Exception as e:
    print(f"An error occurred during data preparation: {e}")

--- Phase 2: Preparing Data ---
Data successfully loaded and split:
DatasetDict({
    train: Dataset({
        features: ['lean_syntax', 'natural_language', '__index_level_0__'],
        num_rows: 1080
    })
    validation: Dataset({
        features: ['lean_syntax', 'natural_language', '__index_level_0__'],
        num_rows: 135
    })
    test: Dataset({
        features: ['lean_syntax', 'natural_language', '__index_level_0__'],
        num_rows: 135
    })
})


In [11]:
print("\n--- Phase 3: Model Training ---")
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model_checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Preprocessing function to tokenize data
prefix = "translate formal math to english: "
def preprocess_function(examples):
    inputs = [prefix + str(doc) for doc in examples["lean_syntax"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    labels = tokenizer(text_target=[str(doc) for doc in examples["natural_language"]], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results_deformalization",
    # MODIFIED LINE: Changed 'evaluation_strategy' to 'eval_strategy' for compatibility
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=25,
    predict_with_generate=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()
print("Training complete.")


--- Phase 3: Model Training ---


Map:   0%|          | 0/1080 [00:00<?, ? examples/s]

Map:   0%|          | 0/135 [00:00<?, ? examples/s]

Map:   0%|          | 0/135 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,No log,1.590628
2,No log,0.927399
3,No log,0.651215
4,No log,0.499864
5,No log,0.387694
6,No log,0.309753
7,No log,0.259516
8,1.057300,0.220136
9,1.057300,0.193993
10,1.057300,0.167534


Training complete.


In [9]:
!pip install evaluate rouge_score sacrebleu


Collecting rouge_score
  Using cached rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=7dc68d3a45cbd2d933ebe5ed9228c60cf437ae4b33a970aebfcc0c61eb1ae55f
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [12]:
print("\n--- Phase 4: Evaluating Results ---")
import torch
from torch.utils.data import DataLoader
import evaluate
from tqdm.auto import tqdm

# ------------------------------------------------------------
# Generate predictions
# ------------------------------------------------------------
def generate_predictions(model, tokenizer, dataset, collator):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
    dataloader = DataLoader(dataset, batch_size=32, collate_fn=collator)

    all_preds = []
    for batch in tqdm(dataloader, desc="Generating predictions"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with torch.no_grad():
            generated_ids = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_length=128
            )

        preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        all_preds.extend(preds)

    return all_preds

# ------------------------------------------------------------
# Run predictions
# ------------------------------------------------------------
decoded_preds = generate_predictions(model, tokenizer, tokenized_datasets["test"], data_collator)
decoded_labels = test_df["natural_language"].tolist()

# ------------------------------------------------------------
# Metrics
# ------------------------------------------------------------

# BLEU
bleu = evaluate.load("sacrebleu")
bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
print(f"\nFINAL BLEU SCORE: {bleu_result['score']:.2f}")

# ROUGE
rouge = evaluate.load("rouge")
rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
print("\nFINAL ROUGE SCORES:")
for k, v in rouge_result.items():
    print(f"{k}: {v*100:.2f}")  # convert to percentage

# chrF
chrf = evaluate.load("chrf")
chrf_result = chrf.compute(predictions=decoded_preds, references=decoded_labels)
print(f"\nFINAL chrF SCORE: {chrf_result['score']*100:.2f}")

# ------------------------------------------------------------
# Example Predictions
# ------------------------------------------------------------
print("\n--- Example Predictions ---")
for i in range(min(5, len(test_df))):
    print(f"Formal Input:  {test_df.iloc[i]['lean_syntax']}")
    print(f"Ground Truth:  {test_df.iloc[i]['natural_language']}")
    print(f"Model Output:  {decoded_preds[i]}")
    print("-" * 20)



--- Phase 4: Evaluating Results ---


Generating predictions:   0%|          | 0/5 [00:00<?, ?it/s]


FINAL BLEU SCORE: 68.94

FINAL ROUGE SCORES:
rouge1: 77.79
rouge2: 65.11
rougeL: 75.29
rougeLsum: 75.12

FINAL chrF SCORE: 7364.05

--- Example Predictions ---
Formal Input:  n ‚àà T \ X
Ground Truth:  n is in T but not in X
Model Output:  n is in T but not in X
--------------------
Formal Input:  {x : ‚Ñù | 0 < x ‚àß x < 1}
Ground Truth:  The set of all real numbers between 0 and 1
Model Output:  The set of all real numbers between 0 and 1
--------------------
Formal Input:  ‚àÄ x ‚àà T, P x
Ground Truth:  For all x in T, property P holds
Model Output:  For all x in T, property P holds
--------------------
Formal Input:  ùí´(Y) = {S | S ‚äÜ Y}
Ground Truth:  The power set of Y is the set of all subsets of Y
Model Output:  The power set of Y is the set of all subsets of Y
--------------------
Formal Input:  ‚àÉ S ‚äÜ Y, S ‚â† ‚àÖ
Ground Truth:  There exists a subset S of Y that is non-empty
Model Output:  There exists a subset S of Y that is non-empty
--------------------


In [14]:

results_df = pd.DataFrame({
    'Formal Input': test_df['lean_syntax'],
    'Ground Truth': decoded_labels,
    'Model Output': decoded_preds
})


results_df.to_csv('deformalization_results.csv', index=False)

print("\nResults saved to 'deformalization_results.csv'")
display(results_df.head())


Results saved to 'deformalization_results.csv'


Unnamed: 0,Formal Input,Ground Truth,Model Output
289,n ‚àà T \ X,n is in T but not in X,n is in T but not in X
1036,{x : ‚Ñù | 0 < x ‚àß x < 1},The set of all real numbers between 0 and 1,The set of all real numbers between 0 and 1
535,"‚àÄ x ‚àà T, P x","For all x in T, property P holds","For all x in T, property P holds"
346,ùí´(Y) = {S | S ‚äÜ Y},The power set of Y is the set of all subsets of Y,The power set of Y is the set of all subsets of Y
1075,"‚àÉ S ‚äÜ Y, S ‚â† ‚àÖ",There exists a subset S of Y that is non-empty,There exists a subset S of Y that is non-empty


  plt.savefig(


  fig.canvas.draw()
  plt.savefig(
