In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Read the JSON file into a DatasetDict whose single split is explicitly named "data".
data = load_dataset(
    "json",
    data_files={"data": "dataset.json"},
)

# First cut: keep 70% of the rows for training and hold out the remaining 30%.
train_test_split = data["data"].train_test_split(seed=42, test_size=0.3)

# Second cut: about one third of the held-out rows becomes the test set and the
# rest becomes the validation set (roughly 20% / 10% of the full dataset).
validation_test_split = train_test_split["test"].train_test_split(
    seed=42,
    test_size=0.3333,
)

# Collect the three final splits under conventional names.
split_data = dict(
    train=train_test_split["train"],
    validation=validation_test_split["train"],
    test=validation_test_split["test"],
)

# Checkpoint name and its tokenizer (only the tokenizer is created here; no
# model object is instantiated in this cell).
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocess the data
def preprocess_function(examples, max_length=512):
    """Tokenize a batch of input/output text pairs for seq2seq training.

    Args:
        examples: A batch mapping with an "input" column (source texts) and an
            "output" column (target texts), as passed by ``Dataset.map`` with
            ``batched=True``.
        max_length: Length to which both sources and targets are truncated and
            padded. Defaults to 512, the value previously hard-coded.

    Returns:
        The tokenizer output for the source texts, with an added "labels"
        entry holding the target token ids. Padding positions in the labels
        are set to -100 so that they are ignored by the loss.
    """
    model_inputs = tokenizer(
        examples["input"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    target_encodings = tokenizer(
        examples["output"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    # Targets are padded to max_length; without masking, the model would be
    # trained to predict the pad token on every padding position.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in target_encodings["input_ids"]
    ]
    return model_inputs

# Run the preprocessing over every split in one pass; dict order
# (train, validation, test) is preserved from split_data.
tokenized_data = {
    split_name: split.map(preprocess_function, batched=True)
    for split_name, split in split_data.items()
}

# Sanity check: report how many examples ended up in each split.
print(f"Train size: {len(tokenized_data['train'])}")
print(f"Validation size: {len(tokenized_data['validation'])}")
print(f"Test size: {len(tokenized_data['test'])}")


Generating data split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/72 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Train size: 72
Validation size: 20
Test size: 11


In [2]:
from transformers import TrainingArguments

# Hyperparameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    # Where artifacts go
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation settings
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Evaluation and checkpointing cadence
    evaluation_strategy="epoch",
    save_steps=500,
    save_total_limit=2,
)




In [None]:
# Training takes a long time, so the dataset was cut down from 52000 to 1000 input/output pairs.

from transformers import AutoModelForSeq2SeqLM, Trainer

# Trainer needs an instantiated model, not the checkpoint name string:
# load the pretrained seq2seq weights for `model_name` first.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
)

trainer.train()


In [None]:
# Evaluate with the trainer (no dataset is passed here, so presumably the
# eval_dataset given when the trainer was built is used) and show the metrics.
results = trainer.evaluate()
print(results)


In [None]:
def predict(text, **generate_kwargs):
    """Generate the model's output text for a single input string.

    Args:
        text: The source text to feed to the model.
        **generate_kwargs: Optional extra keyword arguments forwarded to
            ``model.generate`` (for example ``max_new_tokens``). With none
            given, the model's default generation settings are used.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Use the model object held by the trainer; `model_name` is only the
    # checkpoint name string and has no `generate` method.
    model = trainer.model
    model.eval()  # inference mode: disable dropout

    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    # The model may live on an accelerator after training; inputs must be on
    # the same device.
    inputs = inputs.to(model.device)

    outputs = model.generate(**inputs, **generate_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example: run one sample sentence through predict() and print the generated text.
input_text = "Shell prompts look like this :"
output_text = predict(input_text)
print("ASL Grammar:", output_text)
