# GPT for code dictation

In [18]:
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer, GPT2LMHeadModel, pipeline, TextDataset
from datasets import Dataset
import pandas as pd

In [4]:
# Load the paired English / LaTeX examples used for fine-tuning.
data = pd.read_csv('../data/english_to_latex.csv')

# Sanity-check the table size as (rows, columns).
print(data.shape)

# Preview the first two rows.
data.head(2)

(50, 2)


Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2 \,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1} x^2 \,dx"


In [5]:
# Load the tokenizer that belongs to the pretrained 'gpt2' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Add our singular prompt
CONVERSION_PROMPT = 'LCT\n'  # LaTeX conversion task

# Marker placed in front of the LaTeX answer in every training example and prompt.
CONVERSION_TOKEN = 'LaTeX:'

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [6]:
# Assemble one training string per row, laid out as:
#   <CONVERSION_PROMPT>English: <english text>
#   <CONVERSION_TOKEN> <latex text>
training_examples = (
    CONVERSION_PROMPT + 'English: ' + data['English']
    + '\n' + CONVERSION_TOKEN + ' ' + data['LaTeX'].astype(str)
)

# Show the first assembled example.
print(training_examples[0])

LCT
English: integral from a to b of x squared
LaTeX: \int_{a}^{b} x^2 \,dx


In [7]:
# Wrap the training strings in a single-column frame named 'text'.
task_df = pd.DataFrame({'text': training_examples})

# Preview the first two rows.
task_df.head(2)

Unnamed: 0,text
0,LCT\nEnglish: integral from a to b of x square...
1,LCT\nEnglish: integral from negative 1 to 1 of...


In [8]:
latex_data = Dataset.from_pandas(task_df)  # turn a pandas DataFrame into a Dataset


def preprocess(examples):
    """Tokenize a batch of training strings.

    Args:
        examples: A batch handed over by `Dataset.map(..., batched=True)`;
            its 'text' entry holds the strings to tokenize.

    Returns:
        The tokenizer output for the batch. Sequences are truncated but not
        padded, because the data collator pads each batch dynamically.
    """
    return tokenizer(examples['text'], truncation=True)


latex_data = latex_data.map(preprocess, batched=True)

# 80/20 train/test split. The shuffle seed is fixed so that a fresh
# "Restart & Run All" reproduces the same split (and therefore the same metrics).
latex_data = latex_data.train_test_split(train_size=.8, seed=42)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [9]:
# Collator that batches the tokenized examples; mlm=False turns off masked-language-modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [10]:
# Start from the pretrained 'gpt2' checkpoint; this is the model that gets fine-tuned.
latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [11]:
# Training configuration for fine-tuning on the English -> LaTeX examples.
training_args = TrainingArguments(
    output_dir="./english_to_latex",
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=10,  # number of training epochs
    per_device_train_batch_size=2,  # batch size for training
    per_device_eval_batch_size=20,  # batch size for evaluation
    load_best_model_at_end=True,  # reload the best checkpoint once training ends
    logging_steps=5,
    log_level='info',
    eval_strategy='epoch',  # current name of the deprecated `evaluation_strategy` argument
    save_strategy='epoch'  # checkpoint on the same schedule as evaluation
)

trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
    data_collator=data_collator,
)

# Baseline: evaluation loss of the model before any fine-tuning.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnt_[0m ([33meshaan-rithesh2023-vit-chennai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 5.111514568328857,
 'eval_model_preparation_time': 0.0055,
 'eval_runtime': 0.7526,
 'eval_samples_per_second': 13.287,
 'eval_steps_per_second': 1.329}

In [12]:
# Run fine-tuning with the arguments configured above.
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss,Model Preparation Time
1,1.5224,1.009291,0.0055
2,0.753,0.715061,0.0055
3,0.7498,0.599575,0.0055
4,0.6267,0.546577,0.0055
5,0.6534,0.544049,0.0055
6,0.4552,0.507555,0.0055
7,0.4027,0.532265,0.0055
8,0.4019,0.559123,0.0055
9,0.3695,0.526749,0.0055
10,0.3133,0.523715,0.0055


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


Saving model checkpoint to ./english_to_latex/checkpoint-20
Configuration saved in ./english_to_latex/checkpoint-20/config.json
Configuration saved in ./english_to_latex/checkpoint-20/generation_config.json
Model weights saved in ./english_to_latex/checkpoint-20/model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./english_to_latex/checkpoint-20/tokenizer_config.json
Special tokens file saved in ./english_to_latex/checkpoint-20/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Saving model checkpoint to ./english_to_latex/checkpoint-40
Configuration saved in ./english_to_latex/checkpoint-40/config.json
Configuration saved in ./englis

TrainOutput(global_step=200, training_loss=0.7454731261730194, metrics={'train_runtime': 113.5611, 'train_samples_per_second': 3.522, 'train_steps_per_second': 1.761, 'total_flos': 6238347264000.0, 'train_loss': 0.7454731261730194, 'epoch': 10.0})

In [13]:
# Evaluate again after training, for comparison with the pre-training baseline.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'eval_loss': 0.5075546503067017,
 'eval_model_preparation_time': 0.0055,
 'eval_runtime': 0.0595,
 'eval_samples_per_second': 168.021,
 'eval_steps_per_second': 16.802,
 'epoch': 10.0}

In [19]:
# Build a language-modeling dataset from the raw text file, using block_size = 32.
# NOTE(review): `calculus_data` is not referenced by any later cell visible in this
# notebook; the Trainer created further down is still given `latex_data`.
# Confirm whether a training stage on this text was intended.
calculus_data = TextDataset(
    tokenizer = tokenizer,
    file_path = "../data/calculus made easy.txt",
    block_size = 32
)

# Re-create the collator with the same settings as earlier (mlm = False).
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer, mlm = False
)

Creating features from dataset file at .
Saving features into cached file ./cached_lm_GPT2Tokenizer_32_calculus made easy.txt [took 0.012 s]


In [21]:
# Rebind `latex_gpt2` to a fresh pretrained 'gpt2' checkpoint, replacing the model trained above.
latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.50.3",
  "use_cach

In [23]:
# Second training configuration; checkpoints go to ./calculus_english_to_latex.
training_args = TrainingArguments(
    output_dir="./calculus_english_to_latex",
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=10,  # number of training epochs
    per_device_train_batch_size=2,  # batch size for training
    per_device_eval_batch_size=20,  # batch size for evaluation
    load_best_model_at_end=True,  # reload the best checkpoint once training ends
    logging_steps=5,
    log_level='info',
    eval_strategy='epoch',  # current name of the deprecated `evaluation_strategy` argument
    save_strategy='epoch'  # checkpoint on the same schedule as evaluation
)

# NOTE(review): this Trainer is given `latex_data` for both training and evaluation,
# exactly like the first run, and the model was reloaded from plain 'gpt2'. Despite the
# "calculus" output directory, no calculus text is used here — confirm the intended data.
trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
    data_collator=data_collator,
)

# Baseline: evaluation loss of the freshly loaded model before training.
trainer.evaluate()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


{'eval_loss': 5.111514568328857,
 'eval_model_preparation_time': 0.0032,
 'eval_runtime': 0.0592,
 'eval_samples_per_second': 168.878,
 'eval_steps_per_second': 16.888}

In [25]:
# Run the second fine-tuning pass with the arguments configured above.
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss,Model Preparation Time
1,1.5224,1.009291,0.0032
2,0.753,0.715061,0.0032
3,0.7498,0.599575,0.0032
4,0.6267,0.546577,0.0032
5,0.6534,0.544049,0.0032
6,0.4552,0.507555,0.0032
7,0.4027,0.532265,0.0032
8,0.4019,0.559123,0.0032
9,0.3695,0.526749,0.0032
10,0.3133,0.523715,0.0032


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


Saving model checkpoint to ./calculus_english_to_latex/checkpoint-20
Configuration saved in ./calculus_english_to_latex/checkpoint-20/config.json
Configuration saved in ./calculus_english_to_latex/checkpoint-20/generation_config.json
Model weights saved in ./calculus_english_to_latex/checkpoint-20/model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./calculus_english_to_latex/checkpoint-20/tokenizer_config.json
Special tokens file saved in ./calculus_english_to_latex/checkpoint-20/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Saving model checkpoint to ./calculus_english_to_latex/checkpoint-40
Configuration saved in ./calculus_en

TrainOutput(global_step=200, training_loss=0.7454731261730194, metrics={'train_runtime': 123.2432, 'train_samples_per_second': 3.246, 'train_steps_per_second': 1.623, 'total_flos': 6238347264000.0, 'train_loss': 0.7454731261730194, 'epoch': 10.0})

In [26]:
# Evaluate after training, for comparison with the pre-training baseline.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'eval_loss': 0.5075546503067017,
 'eval_model_preparation_time': 0.0032,
 'eval_runtime': 0.0499,
 'eval_samples_per_second': 200.217,
 'eval_steps_per_second': 20.022,
 'epoch': 10.0}

In [27]:
# Persist the trained model; no path is passed, so the Trainer picks the destination.
trainer.save_model()

Saving model checkpoint to ./calculus_english_to_latex
Configuration saved in ./calculus_english_to_latex/config.json
Configuration saved in ./calculus_english_to_latex/generation_config.json
Model weights saved in ./calculus_english_to_latex/model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./calculus_english_to_latex/tokenizer_config.json
Special tokens file saved in ./calculus_english_to_latex/special_tokens_map.json


In [28]:
# Reload the fine-tuned weights from disk rather than reusing the in-memory model.
loaded_model = GPT2LMHeadModel.from_pretrained('./calculus_english_to_latex')
# Wrap model + tokenizer in a text-generation pipeline for inference.
latex_generator = pipeline('text-generation', model=loaded_model, tokenizer=tokenizer)

loading configuration file ./calculus_english_to_latex/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.50.3",
  "use_cache": true,
  "vocab_size": 50257
}

loading weights file ./calcu

In [29]:
# English phrase to convert.
text_sample = 'f of x equals integral from 0 to pi of x to the fourth power'

# Lay the prompt out like a training example, stopping right after the
# conversion token so the model continues with the LaTeX.
conversion_text_sample = (
    CONVERSION_PROMPT + 'English: ' + text_sample + '\n' + CONVERSION_TOKEN
)

print(conversion_text_sample)

LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX:


In [30]:
# Allow up to 20 tokens beyond the prompt itself.
prompt_token_count = len(tokenizer.encode(conversion_text_sample))

generations = latex_generator(
    conversion_text_sample,
    num_beams=5,
    early_stopping=True,
    temperature=0.7,
    max_length=prompt_token_count + 20,
)

# Print the text of the first returned generation (prompt + completion).
print(generations[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{5} x^4 \,dx^


In [31]:
# A second English phrase, this time describing a sum.
text_sample = 'f of x is sum from 0 to x of x squared'
conversion_text_sample = (
    CONVERSION_PROMPT + 'English: ' + text_sample + '\n' + CONVERSION_TOKEN
)

# Allow up to 20 tokens beyond the prompt itself.
prompt_token_count = len(tokenizer.encode(conversion_text_sample))

generations = latex_generator(
    conversion_text_sample,
    num_beams=5,
    early_stopping=True,
    temperature=0.7,
    max_length=prompt_token_count + 20,
)

# Print the text of the first returned generation (prompt + completion).
print(generations[0]['generated_text'])

LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx^


In [32]:
# Baseline for comparison: the same pipeline, but backed by the pretrained 'gpt2'
# checkpoint without any of the fine-tuning done in this notebook.
non_finetuned_latex_generator = pipeline(
    'text-generation',
    model=GPT2LMHeadModel.from_pretrained('gpt2'),
    tokenizer=tokenizer
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.50.3",
  "use_cach

In [33]:
# Few-shot prompt for the non-fine-tuned model: two solved examples, each ending
# in a "###" separator, followed by an unsolved query.
# A raw string keeps the LaTeX backslashes literal (no invalid-escape warnings);
# "###" sits on the same line as the LaTeX so the text is exactly what the model
# received before.
few_shot_prompt = r"""LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx ###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx ###
LCT
English: x squared
LaTeX:"""

In [34]:
# Allow up to 20 tokens beyond the few-shot prompt itself.
prompt_token_count = len(tokenizer.encode(few_shot_prompt))

generations = non_finetuned_latex_generator(
    few_shot_prompt,
    num_beams=5,
    early_stopping=True,
    temperature=0.7,
    max_length=prompt_token_count + 20,
)

# Print the text of the first returned generation (prompt + completion).
print(generations[0]['generated_text'])

LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx ###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx ###
LCT
English: x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx ###


In [35]:
# Same single-example prompt as used with the fine-tuned generator, now sent to
# the non-fine-tuned model. Allow up to 20 tokens beyond the prompt itself.
prompt_token_count = len(tokenizer.encode(conversion_text_sample))

generations = non_finetuned_latex_generator(
    conversion_text_sample,
    num_beams=5,
    early_stopping=True,
    temperature=0.7,
    max_length=prompt_token_count + 20,
)

# Print the text of the first returned generation (prompt + completion).
print(generations[0]['generated_text'])

LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f of x is sum from 0 to x of x squared
LaTeX: f of x is
