In [39]:
import math

import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer

# Build GPT from Scratch

Vivid Lessons:
1. understand what data is required for training
2. understand the dimension of input (size of token embeddings and position embeddings should be equivalent to vocab size and block size)
3. learn to use tokenizer flexibly, when to padd when to truncate when to add EOS
4. Add wandb and tqdm to know how many hours needed for training and how well the model is

## Dataset Processing


In [40]:
class CoverLetterDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=200):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        prompt = (
            f"Job Title: {row['Job Title']}\n"
            f"Preferred Qualifications: {row['Preferred Qualifications']}\n"
            f"Hiring Company: {row['Hiring Company']}\n"
            f"Applicant Name: {row['Applicant Name']}\n"
            f"Past Working Experience: {row['Past Working Experience']}\n"
            f"Current Working Experience: {row['Current Working Experience']}\n"
            f"Skillsets: {row['Skillsets']}\n"
            f"Qualifications: {row['Qualifications']}\n"
            f"Cover Letter:"
        )
        target = row['Cover Letter'] + self.tokenizer.eos_token
        full_text = prompt + "\n" + target

        encodings = self.tokenizer(
            full_text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        input_ids = encodings.input_ids.squeeze(0)

        return input_ids, input_ids.clone()


### GPT2

In [41]:
class MaskedSelfAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()

        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.q_linear = nn.Linear(d_model, d_model, bias=False)
        self.k_linear = nn.Linear(d_model, d_model, bias=False)
        self.v_linear = nn.Linear(d_model, d_model, bias=False)
        self.out = nn.Linear(d_model, d_model, bias=False)

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)

        q = self.q_linear(q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.k_linear(k).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        # Scaled Dot-Product Attention
        scores = q @ k.transpose(-2, -1) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attn = F.softmax(scores, dim=-1)
        output = attn @ v
        concat = output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)
        output = self.out(concat)

        return output, attn


class DecoderBlock(nn.Module):
    def __init__(self, n_embd, n_head, dropout=0.2):
        super().__init__()
        self.attn = MaskedSelfAttention(n_embd, n_head)
        self.mlp = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x, mask=None):
        x = self.ln1(x)
        q = k = v = x
        attn_output, _ = self.attn(q, k, v, mask=mask)
        x = x + attn_output
        mlp_output = self.mlp(self.ln2(x))
        x = x + mlp_output
        return x

class GPT(nn.Module):
    def __init__(self, vocab_size, block_size, n_layers, n_heads, n_embds):
        super().__init__()

        self.n_embds = n_embds
        self.block_size = block_size
        self.token_embedding_table = nn.Embedding(vocab_size, n_embds)
        self.position_embedding_table = nn.Embedding(block_size, n_embds)
        self.blocks = nn.Sequential(*[DecoderBlock(n_embds, n_heads) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(n_embds)
        self.lm_head = nn.Linear(n_embds, vocab_size, bias=False)
        self.block_size = block_size

        self.apply(self.init_weights)

    def init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0, std=0.02)

    def forward(self, x, mask=None):
        B, T = x.size()

        tokens = self.token_embedding_table(x)
        positions_id = torch.arange(0, x.size(1), device=x.device)
        positions = self.position_embedding_table(positions_id)
        x = tokens + positions

        if mask is None:
            # mask = generate_causal_mask(T, device=x.device)
            mask = generate_causal_mask(
            seq_len=T,
            batch_size=B,
            num_heads=self.blocks[0].attn.num_heads,
            device=x.device
        )

        for block in self.blocks:
            x = block(x, mask=mask)

        x = self.ln_f(x)
        logits = self.lm_head(x)

        return logits

    def generate(self, idx, max_len, mask=None):
        for _ in range(max_len):
            # only keep last context
            idx_cond = idx[:, -self.block_size:]

            # get the prediction
            logits = self(idx_cond)

            # focus only on the last time step
            logits = logits[:, -1, :]  #(batch_size, seq_len, vocab_size)
            probs = F.softmax(logits, dim=-1)

            # randomly sample from the multinominal distribution
            idx_next = torch.multinomial(probs, 1)

            # add the sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, seq_len + 1)

        return idx  # shape (B, max_len + 1)

    def resize_token_embeddings(self, new_vocab_size):
        """
        Resize token embeddings and output projection to match new tokenizer size.
        Similar to Hugging Face's resize_token_embeddings.
        """
        old_vocab_size = self.token_embedding_table.num_embeddings
        if new_vocab_size == old_vocab_size:
            return  # no need to resize

        # Resize token embedding table
        new_embedding = nn.Embedding(new_vocab_size, self.n_embds)
        new_embedding.weight.data[:old_vocab_size] = self.token_embedding_table.weight.data
        self.token_embedding_table = new_embedding

        # Resize LM head
        new_lm_head = nn.Linear(self.n_embds, new_vocab_size, bias=False)
        new_lm_head.weight.data[:old_vocab_size] = self.lm_head.weight.data
        self.lm_head = new_lm_head

        self.vocab_size = new_vocab_size


In [42]:

def generate_causal_mask(seq_len, batch_size, num_heads, device):
    mask = torch.tril(torch.ones(seq_len, seq_len, device=device)).bool()
    return mask.unsqueeze(0).unsqueeze(0).expand(batch_size, num_heads, seq_len, seq_len)



## Train Loop

In [43]:
class GPTTrainer():
    def __init__(self, vocab_size, block_size, n_layers, n_heads, n_embds, tokenizer):
        self.tokenizer = tokenizer
        self.model = GPT(vocab_size, block_size, n_layers, n_heads, n_embds)
        self.model.resize_token_embeddings(len(self.tokenizer)) # resize the tokenizer when manually adding pad token

    def train(self, loader, num_epochs, lr, save_filename='GPT2TW.pt', device='cuda'):
        """
        Main training loop
        :param loader: The training data loader
        :param num_epochs: The number of epochs to learn
        :param lr: The learning rate
        :param save_filename: The filename to save the model pt file
        :param device: The device to use ('cpu' or 'cuda')
        :return:
        """
        self.model.train()
        self.model.to(device)

        criterion = nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)

        for epoch in range(num_epochs):
            running_loss = 0.0
            for i, (x_batch, y_batch) in enumerate(tqdm(loader, desc=f"Epoch {epoch+1}/{num_epochs}")):
                x_batch, y_batch = x_batch.to(device), y_batch.to(device=device)

                optimizer.zero_grad()

                seq_len = x_batch.size(1)
                batch_size = x_batch.size(0)
                num_heads = self.model.blocks[0].attn.num_heads  # or set as self.n_heads earlier

                mask = generate_causal_mask(seq_len, batch_size, num_heads, device)

                logits = self.model(x_batch, mask=mask)
                logits = logits.view(-1, logits.size(-1))
                y_batch = y_batch.view(-1)

                loss = criterion(logits, y_batch)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

            epoch_loss = running_loss / len(loader)
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

        torch.save(self.model.state_dict(), save_filename)

    def test(self, loader, checkpoint='GPT2TW.pt', device='cuda'):
        if checkpoint:
            self.model.load_state_dict(torch.load(checkpoint))

        self.model.eval()
        self.model.to(device)

        criterion = nn.CrossEntropyLoss()
        total_loss = 0.0
        total_tokens = 0 # why

        with torch.no_grad():
            for i, (x_batch, y_batch) in enumerate(loader):
                x_batch, y_batch = x_batch.to(device), y_batch.to(device=device)

                seq_len = x_batch.size(1)
                batch_size = x_batch.size(0)
                num_heads = self.model.blocks[0].attn.num_heads
                mask = generate_causal_mask(seq_len, batch_size, num_heads, device)

                logits = self.model(x_batch, mask=mask)
                logits = logits.view(-1, logits.size(-1))
                y_batch = y_batch.view(-1)

                loss = criterion(logits, y_batch)
                total_loss += loss.item()
                total_tokens += y_batch.size(0)

        avg_loss = total_loss / len(loader)
        perplexity = math.exp(avg_loss)
        print(f"Test Loss: {avg_loss:.4f}")
        print(f"Test Perplexity: {perplexity:.4f}")
        return avg_loss, perplexity

    def generate_text(self, job_title, summarized_jd, max_len=100, device='cpu'):
        self.model.eval()
        self.model.to(device)

        prompt_text = (
            f"Job Title: {job_title}\n"
            f"Preferred Qualifications: {summarized_jd}\n"
            f"Hiring Company: Apple\n"
            f"Applicant Name: Tracy Wu\n"
            f"Past Working Experience: 3 internships in Data Science\n"
            f"Current Working Experience: Data Scientist Intern\n"
            f"Skillsets: Python, SQL, R, AWS, Azure\n"
            f"Qualifications: Master of Science in Statistics from University of Michigan\n"
            "Cover Letter: \n"
        )

        encodings = self.tokenizer(
            prompt_text,
            max_length=max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        input_ids = encodings.input_ids.squeeze(0)
        input_tensors = input_ids.unsqueeze(0).to(device)

        with torch.no_grad():
            output_tensor = self.model.generate(
                input_tensors,
                max_len=max_len)

        output_ids = output_tensor[0].tolist()
        return self.tokenizer.decode(output_ids, skip_special_tokens=True)


In [44]:
# Hyperparameters
vocab_size =  50257 # Vocabulary size
batch_size = 2
block_size = 200    # Sequence length (Context) for prediction, make sure it is equal or bigger than max_length in Dataset
n_layers = 6      # Number of transformer layers
n_heads = 6      # Number of attention heads
n_embds = 384      # Embedding size
learning_rate = 2e-5
num_epochs = 10   # Number of training epochs
eval_iters = 200
eval_interval = 500

# Initialize the model, loss function, and optimizer
df_cl_train = pd.read_csv("/Users/tracy/Desktop/留学/UMich/SI 630/Final Project/Data/CoverLetter_train.csv")
df_cl_test = pd.read_csv("/Users/tracy/Desktop/留学/UMich/SI 630/Final Project/Data/CoverLetter_test.csv")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("gpt2")

dataset_train = CoverLetterDataset(df_cl_train, tokenizer)
dataset_test = CoverLetterDataset(df_cl_test, tokenizer)

train_loader = DataLoader(dataset_train, batch_size=2, shuffle=True)
test_loader = DataLoader(dataset_test, batch_size=2, shuffle=True)

trainer = GPTTrainer(vocab_size=vocab_size, block_size=block_size, n_layers=n_layers, n_heads=n_heads, n_embds=n_embds, tokenizer=tokenizer)


In [45]:
trainer.train(train_loader, num_epochs=num_epochs, lr=learning_rate, device='cpu')

Epoch 1/10: 100%|██████████| 407/407 [01:52<00:00,  3.63it/s]


Epoch 1/10, Loss: 6.2397


Epoch 2/10: 100%|██████████| 407/407 [01:54<00:00,  3.57it/s]


Epoch 2/10, Loss: 2.3243


Epoch 3/10: 100%|██████████| 407/407 [01:50<00:00,  3.69it/s]


Epoch 3/10, Loss: 1.3385


Epoch 4/10: 100%|██████████| 407/407 [01:49<00:00,  3.72it/s]


Epoch 4/10, Loss: 0.9373


Epoch 5/10: 100%|██████████| 407/407 [01:50<00:00,  3.69it/s]


Epoch 5/10, Loss: 0.7104


Epoch 6/10: 100%|██████████| 407/407 [01:49<00:00,  3.72it/s]


Epoch 6/10, Loss: 0.5601


Epoch 7/10: 100%|██████████| 407/407 [01:49<00:00,  3.71it/s]


Epoch 7/10, Loss: 0.4556


Epoch 8/10: 100%|██████████| 407/407 [01:49<00:00,  3.73it/s]


Epoch 8/10, Loss: 0.3785


Epoch 9/10: 100%|██████████| 407/407 [01:50<00:00,  3.68it/s]


Epoch 9/10, Loss: 0.3173


Epoch 10/10: 100%|██████████| 407/407 [01:49<00:00,  3.71it/s]


Epoch 10/10, Loss: 0.2697


In [46]:
trainer.generate_text(
    job_title="Data Analyst",
    summarized_jd="Automate Data pipelines and conduct data analysis",
    max_len=100,
    device=device,
)

'Job Title: Data Analyst\nPreferred Qualifications: Automate Data pipelines and conduct data analysis\nHiring Company: Apple\nApplicant Name: Tracy Wu\nPast Working Experience: 3 internships in Data Science\nCurrent Working Experience: Data Scientist Intern\nSkillsets: Python, SQL, R, AWS, Azure\nQualifications: Master of Science in Statistics from University of Michigan\nPlease write a Cover Letter for Tracy Wu\n understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understanding understa