In [None]:
!pip install ipywidgets

In [1]:
# Set the environment variable before importing transformers
import os
os.environ['HF_HOME'] = '/workspace'
print("HF_HOME is set to:", os.getenv('HF_HOME'))
cache_dir = '/workspace/'

HF_HOME is set to: /workspace


In [2]:
rm -rf ~/.cache/huggingface/transformers


In [3]:
import json
import pandas as pd
from datasets import Dataset

# Load dataset from a JSON file
with open('train.json', 'r') as f:
    dataset = json.load(f)

# Convert to DataFrame for easier manipulation
dataset = pd.DataFrame(dataset)

In [4]:
from transformers import T5ForConditionalGeneration, RobertaTokenizer

# Load the CodeT5 model
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-base")

# Load the correct tokenizer for CodeT5
tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-base")



In [None]:
columns_to_remove = ['synthetic', 'domain', 'test_cases', 'complexity', 'output_type']

# Ensure the columns exist before attempting to remove them
columns_existing = [col for col in columns_to_remove if col in dataset.columns]

# Now you can safely remove the columns
dataset = dataset.drop(columns=columns_existing)


# Ensure the pad token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    if 'latex_expression' not in examples:
        raise KeyError("'latex_expression' not found in examples")
    if 'solution' not in examples:
        raise KeyError("'solution' not found in examples")

    # Create the input sequences
    inputs = [f"Latex Expression: {ex} Solution:" for ex in examples['latex_expression']]

    # Use only the 'solution' column for labels
    labels = examples['solution']

    # Tokenize inputs
    model_inputs = tokenizer(inputs, padding=True, truncation=True)

    # Tokenize labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(labels, padding=True, truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


def preprocess_and_clean(complexity_data):
    data_as_dict = complexity_data.to_dict(orient='records')
    dataset = Dataset.from_list(data_as_dict)

    processed_dataset = dataset.map(preprocess_function, batched=True)
    
    columns_to_remove = ['task_id', 'sympy_exp', 'latex_expression', 'solution', 'simplified_solution']
    columns_existing = [col for col in columns_to_remove if col in processed_dataset.column_names]
    processed_dataset = processed_dataset.remove_columns(columns_existing)

    return processed_dataset

# Preprocess and clean the dataset
processed_dataset = preprocess_and_clean(dataset)

In [6]:
# Using the processed datasets directly with shuffling
train_test_split = processed_dataset.train_test_split(test_size=0.05, shuffle=True)

train_data = train_test_split['train']
eval_data = train_test_split['test']

In [15]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

training_args = Seq2SeqTrainingArguments(
    output_dir="./codeT5_outputs",            # Output directory
    overwrite_output_dir=True,                # Overwrite the output directory
    num_train_epochs=10,                       # Reduce number of epochs to prevent overfitting
    per_device_train_batch_size=6,           # Increase batch size for better gradient estimation
    per_device_eval_batch_size=6,            # Increase eval batch size for faster evaluation
    warmup_steps=500,                         # Keep warmup steps for learning rate stability
    weight_decay=0.05,                         # Increase weight decay to regularize the model more
    logging_dir="./logs",                     # Directory for storing logs
    logging_steps=50,                         # Log every 50 steps for better monitoring
    eval_strategy="steps",                    # Evaluate every X steps
    eval_steps=500,                           # Increase eval steps for less frequent evaluation
    save_steps=1000,                          # Save less frequently to avoid saving redundant checkpoints
    save_total_limit=3,                       # Keep a few more checkpoints to monitor performance
    load_best_model_at_end=True,              # Load the best model when training finishes
    fp16=True,                                # Use 16-bit precision for faster and more efficient training
    learning_rate=3e-5,                       # Increase learning rate slightly for better convergence
    gradient_accumulation_steps=2,            # Accumulate gradients for larger effective batch size
    lr_scheduler_type="cosine",               # Cosine learning rate schedule to help with generalization
)

In [16]:
data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)


In [17]:
from transformers import  Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer,
    data_collator=data_collator
)
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
500,0.0046,0.004377
1000,0.0045,0.003478
1500,0.0044,0.003494
2000,0.0036,0.003318
2500,0.003,0.003212
3000,0.0032,0.003363
3500,0.0027,0.003136
4000,0.0029,0.003172
4500,0.0032,0.003055
5000,0.0034,0.002932


KeyboardInterrupt: 

In [None]:
# trainer.train(resume_from_checkpoint=True)

In [18]:
# After training, save the model and tokenizer
model.save_pretrained("./Model")
tokenizer.save_pretrained("./Model")

('./Model/tokenizer_config.json',
 './Model/special_tokens_map.json',
 './Model/vocab.json',
 './Model/merges.txt',
 './Model/added_tokens.json')