In [2]:
pip install transformers[torch] datasets torch scikit-learn

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.32.3-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)

In [1]:
import os
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "distilgpt2"
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"../model/fine-tuned-{timestamp}"


In [3]:
# Load model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # required
model = GPT2LMHeadModel.from_pretrained(model_name)

In [4]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print(device)

cuda


In [5]:
# Load dataset
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="../data/cleaned_emails_final.txt",
    block_size=128,
)



In [6]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [7]:
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=500,
    prediction_loss_only=True,
    report_to="none",
)


In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [9]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,4.2777
1000,3.9333
1500,3.7854
2000,3.6879
2500,3.6557
3000,3.4588
3500,3.4277
4000,3.4254
4500,3.3675
5000,3.3903


TrainOutput(global_step=5060, training_loss=3.636182421657879, metrics={'train_runtime': 3370.2877, 'train_samples_per_second': 6.005, 'train_steps_per_second': 1.501, 'total_flos': 661080778997760.0, 'train_loss': 3.636182421657879, 'epoch': 2.0})

In [10]:
# Save model
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"✅ Model saved at {output_dir}")

✅ Model saved at ../model/fine-tuned-20250603_134959


In [11]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [12]:
def autocomplete_gpt2(prompt, max_new_tokens=5):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
        do_sample=False,         # makes it feel "smart"
        top_k=30,
        top_p=0.9,
        temperature=0.5,
        repetition_penalty=1.1
    )

    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return full_text[len(prompt):].rsplit('.', 1)[0].strip()  # return only new suggestion


In [20]:
autocomplete_gpt2("Continue this email in a helpful tone and stop after end of sentence or '.': wanna grad dinner next")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


"week? i'm going"

In [14]:
texts = [
    'here is',
    'have a',
    'please review',
    'please call me',
    'thanks for',
    'let me',
    'Let me know',
    'Let me know if you',
    'this sounds',
    'is this call going to',
    'can you get',
    'is it okay',
    'it should',
    'call if there\'s',
    'gave her a',
    'i will let',
    'i will be',
    'may i get a copy of all the',
    'how is our trade',
    'this looks like a',
    'i am fine with the changes',
    'please be sure this'
]
import pandas as pd
output = list(map(autocomplete_gpt2, texts))
output_df = pd.DataFrame({'input': texts, 'output': output})
# output_df.head(len(output))

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

In [63]:
output_df

Unnamed: 0,input,output
0,here is,a very good place to
1,have a,chance to win
2,please review,and act upon this request
3,please call me,at and I will be
4,thanks for,the opportunity to meet with
5,let me,know what you think
6,Let me know,if you have any questions
7,Let me know if you,have any questions
8,this sounds,like a good idea
9,is this call going to,be a good one
