In [18]:
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset
import evaluate

import pandas as pd
import numpy as np

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [19]:
import wandb
from datetime import datetime

run_name = "gpt2_" + datetime.now().strftime("%y%m%d_%H%M")
wandb.init(project="GPT2_ft", name=run_name)

0,1
eval/loss,█▃▂▁▁▁
eval/rouge_rouge1,▁▆▇███
eval/rouge_rouge2,▁▆▇███
eval/rouge_rougeL,▁▆▇███
eval/rouge_rougeLsum,▁▆▇███
eval/runtime,▄█▆▄▁▁
eval/samples_per_second,▅▁▃▅██
eval/steps_per_second,▆▁▃▄██
train/epoch,▁▁▂▂▂▂▂▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████▁
train/global_step,▁▁▂▂▂▂▂▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████▁

0,1
eval/loss,0.8673
eval/rouge_rouge1,78.59528
eval/rouge_rouge2,58.56226
eval/rouge_rougeL,73.91513
eval/rouge_rougeLsum,77.29441
eval/runtime,531.7919
eval/samples_per_second,0.656
eval/steps_per_second,0.041
total_flos,414903168000000.0
train/epoch,0.19608


In [20]:
class CoverLetterDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=200):
        self.df = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Ensure tokenizer has a pad token (GPT2 doesn't by default)
        if tokenizer.pad_token is None:
            tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        prompt = (
            f"Job Title: {row['Job Title']}\n"
            f"Preferred Qualifications: {row['Preferred Qualifications']}\n"
            f"Hiring Company: {row['Hiring Company']}\n"
            f"Applicant Name: {row['Applicant Name']}\n"
            f"Past Working Experience: {row['Past Working Experience']}\n"
            f"Current Working Experience: {row['Current Working Experience']}\n"
            f"Skillsets: {row['Skillsets']}\n"
            f"Qualifications: {row['Qualifications']}\n"
            f"Cover Letter: {row['Cover Letter']} {self.tokenizer.eos_token}"
        )

        # Tokenize the combined prompt and cover letter
        encodings = self.tokenizer(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            "input_ids": encodings["input_ids"].squeeze(0),
            "attention_mask": encodings["attention_mask"].squeeze(0),
            "labels": encodings["input_ids"].squeeze(0).clone()
        }

In [21]:
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token

model = GPT2LMHeadModel.from_pretrained(model_name)

In [22]:
df_train = pd.read_csv("/Users/tracy/Desktop/留学/UMich/SI 630/Final Project/Data/CoverLetter_train.csv")
df_eval = pd.read_csv("/Users/tracy/Desktop/留学/UMich/SI 630/Final Project/Data/CoverLetter_eval.csv")

train_dataset = CoverLetterDataset(dataframe=df_train, tokenizer=tokenizer)
eval_dataset = CoverLetterDataset(dataframe=df_eval, tokenizer=tokenizer)

In [23]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    predictions, labels = eval_preds

    # Unwrap predictions if they’re in a tuple
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.argmax(predictions, axis=-1)

    # Decode predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Decode labels
    processed_labels = []
    for label in labels:
        processed_l = []
        for l in label:
            if l != -100:
                processed_l.append(l)
            else:
                processed_l.append(tokenizer.pad_token_id)
        processed_labels.append(processed_l)

    decoded_labels = tokenizer.batch_decode(processed_labels, skip_special_tokens=True)

    # Compute ROUGE
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Handle format difference
    if hasattr(list(rouge_result.values())[0], "mid"):
        rouge_result = {
            f"rouge_{key}": value.mid.fmeasure * 100
            for key, value in rouge_result.items()
        }
    else:
        rouge_result = {
            f"rouge_{key}": value * 100
            for key, value in rouge_result.items()
        }

    return rouge_result


In [24]:
training_args = TrainingArguments(
    output_dir="./GPT2-coverletter",
    eval_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.02,
    logging_dir="./logs",
    logging_steps=10,
    use_cpu=True
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [25]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge Rouge1,Rouge Rouge2,Rouge Rougel,Rouge Rougelsum
1,0.9964,0.863794,78.358705,58.354626,73.737969,77.110188
2,0.8484,0.763048,80.473347,62.61497,76.791279,79.526907
3,0.7246,0.721585,81.601683,64.6995,78.205636,80.767674
4,0.6786,0.702138,82.301545,65.991253,78.929268,81.50787
5,0.6927,0.694238,82.511855,66.619736,79.337842,81.813278


TrainOutput(global_step=255, training_loss=0.8698728037815467, metrics={'train_runtime': 4239.6873, 'train_samples_per_second': 0.959, 'train_steps_per_second': 0.06, 'total_flos': 414903168000000.0, 'train_loss': 0.8698728037815467, 'epoch': 5.0})

In [34]:
trainer.evaluate(eval_dataset=eval_dataset)

{'eval_loss': 0.6942383646965027,
 'eval_rouge_rouge1': 82.51185466206925,
 'eval_rouge_rouge2': 66.6197361919401,
 'eval_rouge_rougeL': 79.33784221859365,
 'eval_rouge_rougeLsum': 81.81327797596344,
 'eval_runtime': 654.4844,
 'eval_samples_per_second': 0.533,
 'eval_steps_per_second': 0.034,
 'epoch': 5.0}

In [28]:
prompt = (
            f"Job Title: Data Scientist\n"
            f"Preferred Qualifications: Master degree in Statistics\n"
            f"Hiring Company: Google\n"
            f"Applicant Name: Tracy Wu\n"
            f"Past Working Experience: Data Engineer\n"
            f"Current Working Experience: Research Assistant in University of Michigan\n"
            f"Skillsets: Python\n"
            f"Qualifications: Statistics\n"
            f"Cover Letter:  "
        )

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    inputs["input_ids"],
    max_length=300,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2,
    do_sample=True,
    eos_token_id=tokenizer.eos_token_id
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Job Title: Data Scientist
Preferred Qualifications: Master degree in Statistics
Hiring Company: Google
Applicant Name: Tracy Wu
Past Working Experience: Data Engineer
Current Working Experience: Research Assistant in University of Michigan
Skillsets: Python
Qualifications: Statistics
Cover Letter:  


In [29]:
prompt = (
            f"Job Title: Data Scientist\n"
            f"Preferred Qualifications: Master degree in Statistics\n"
            f"Applicant Name: Tracy Wu\n"
            "Cover Letter:  "
        )

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    inputs["input_ids"],
    max_length=300,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2,
    do_sample=True,
    eos_token_id=tokenizer.eos_token_id
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Job Title: Data Scientist
Preferred Qualifications: Master degree in Statistics
Applicant Name: Tracy Wu
Cover Letter:   I am a self employed professional with 3+ years of experience working as an analyst at XYZ Company. My previous roles include Analyst and Senior Business Intelligence Manager for ABC Corporation, where my analytical skills combined have allowed me to excel on both business intelligence (4-7 hours per week) & strategic decision making tasks using Excel tools such the Power BI toolkit or Quick Looker Suite software suites. Prior work includes managing company data packages from time series analysis through project management; developing dashboards that detail product performance metrics while also presenting insights about current state/performance trends within individual products providing value driven coaching opportunities by senior executives interested in leveraging this opportunity into their team's strategy development efforts. In addition, over 5 year experien

In [31]:
trainer.push_to_hub()

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/TracyWu32/GPT2-coverletter/commit/5294f00f559515ee0994082fef316149f6a3a0e2', commit_message='GPT2-CL', commit_description='', oid='5294f00f559515ee0994082fef316149f6a3a0e2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/TracyWu32/GPT2-coverletter', endpoint='https://huggingface.co', repo_type='model', repo_id='TracyWu32/GPT2-coverletter'), pr_revision=None, pr_num=None)