In [1]:
pip install transformers datasets safetensors

Collecting transformers
  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting safetensors
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp311-cp311-win_amd64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-win_amd64.whl.metadata (41 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting pyarrow>=15

In [1]:
import torch
print(torch.version.cuda)  # CUDA version PyTorch was built with
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))  # Your actual GPU

12.6
True
NVIDIA GeForce GTX 1050


In [2]:
import os
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
model_name = "distilgpt2"
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"../model/fine-tuned-{timestamp}"


In [22]:
# Load model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # required
model = GPT2LMHeadModel.from_pretrained(model_name)

In [23]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print(device)

cuda


In [24]:
import pickle

with open('../data/final_body.pickle', 'rb') as f:
    data = pickle.load(f)

In [25]:
# Load dataset
import pandas as pd
data = pd.DataFrame({"text": data[0]})
data.head()

Unnamed: 0,text
0,This is pretty good God Bless the U S Air Forc...
1,Attached please find the referenced lists On t...
2,mail The following expense report is ready for...
3,I have approved this expense report With regar...
4,sum up my costs and is cost for this trip and...


In [26]:
def chunk_text(text, max_length=30):
        chunks = []
        text = text.split(" ")
        for i in range(0, len(text), max_length):
            chunks.append(' '.join(text[i:i + max_length]))
        return chunks


data.text = data.text.apply(lambda text: chunk_text(text))
data = data.explode('text')

In [27]:
dataset = Dataset.from_pandas(data)

In [28]:
def tokenize(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize, batched=True)

Map: 100%|██████████| 6635/6635 [00:02<00:00, 2333.32 examples/s]


In [29]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [33]:
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=500,
    prediction_loss_only=True,
    report_to="none",
)


In [34]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

In [36]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,3.9182
1000,3.9019
1500,3.9786
2000,3.7217
2500,3.6344
3000,3.6685
3500,3.8849
4000,3.6456
4500,3.653


TrainOutput(global_step=4977, training_loss=3.7650310078705798, metrics={'train_runtime': 1929.848, 'train_samples_per_second': 10.314, 'train_steps_per_second': 2.579, 'total_flos': 650138977566720.0, 'train_loss': 3.7650310078705798, 'epoch': 3.0})

In [37]:
# Save model
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"✅ Model saved at {output_dir}")

✅ Model saved at ../model/fine-tuned-20250605_161752


In [43]:
# Load model and tokenizer
model_path = "../model/fine-tuned-20250605_161752"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token  # required
model = GPT2LMHeadModel.from_pretrained(model_path).to(device)

In [None]:
# pip install protobuf 

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import HfApi, create_repo

# Step 1: Log in and create repo
create_repo("smart-compose-model", private=False)  # public = free inference

# Step 2: Load and push your model
model = AutoModelForCausalLM.from_pretrained("path/to/your/model")
tokenizer = AutoTokenizer.from_pretrained("path/to/your/model")

model.push_to_hub("your-username/smart-compose-model")
tokenizer.push_to_hub("your-username/smart-compose-model")


In [None]:
# 134959 - first version

In [44]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [45]:
def autocomplete_gpt2(prompt, max_new_tokens=5):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
        do_sample=False,         # makes it feel "smart"
        # top_k=30,
        # top_p=0.9,
        # temperature=0.5,
        # repetition_penalty=1.1
    )

    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return full_text[len(prompt):].rsplit('.', 1)[0].strip()  # return only new suggestion


In [46]:
autocomplete_gpt2("Continue this email in a helpful tone and stop after end of sentence or '.': wanna grad dinner next")

"week or ' ' '"

In [47]:
texts = [
    'here is',
    'have a',
    'please review',
    'please call me',
    'thanks for',
    'let me',
    'Let me know',
    'Let me know if you',
    'this sounds',
    'is this call going to',
    'can you get',
    'is it okay',
    'it should',
    'call if there\'s',
    'gave her a',
    'i will let',
    'i will be',
    'may i get a copy of all the',
    'how is our trade',
    'this looks like a',
    'i am fine with the changes',
    'please be sure this'
]
import pandas as pd
output = list(map(autocomplete_gpt2, texts))
output_df = pd.DataFrame({'input': texts, 'output': output})
# output_df.head(len(output))

In [48]:
output_df

Unnamed: 0,input,output
0,here is,the latest issue of our
1,have a,chance to talk to you
2,please review,and act upon this information
3,please call me,at if you have any
4,thanks for,your help in this matter
5,let me,know if you have any
6,Let me know,if you have any questions
7,Let me know if you,have any questions Thanks for
8,this sounds,like a good idea to
9,is this call going to,be a good one for
