In [1]:
%pip install -q transformers datasets

In [2]:
%pip install -q pytorch-lightning wandb 

In [3]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)



Mon Jan  2 18:52:19 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P0    29W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

Import

In [4]:
import pandas as pd
import time
from transformers import pipeline,T5Tokenizer,GPT2Tokenizer, AutoTokenizer
from datasets import load_dataset, load_metric
import torch
torch.cuda.get_device_name(0)

'Tesla T4'

# Load dataset

In [5]:
letter_datasets = load_dataset(data_files="LM_Dataset_all.csv", path='data', sep=';')
letter_datasets



  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['name', 'skills', 'exp', 'job', 'lettre'],
        num_rows: 1599
    })
})

In [6]:
train_test = letter_datasets["train"].train_test_split(test_size=0.2)
train_validation = train_test["test"].train_test_split(test_size=0.5)
letter_datasets["train"] = train_test["train"]
letter_datasets["validation"] = train_validation["train"]
letter_datasets["test"] = train_validation["test"]

letter_datasets

DatasetDict({
    train: Dataset({
        features: ['name', 'skills', 'exp', 'job', 'lettre'],
        num_rows: 1279
    })
    validation: Dataset({
        features: ['name', 'skills', 'exp', 'job', 'lettre'],
        num_rows: 160
    })
    test: Dataset({
        features: ['name', 'skills', 'exp', 'job', 'lettre'],
        num_rows: 160
    })
})

In [7]:
example = letter_datasets['train'][0]

print('name:', example['name'])
print('job:', example['job'])
print('text:', example['lettre'])

name: Josephina M. Carrera
job: Associate Director
text: Dear Ms. Hansen:
Upon learning of your need for a proven and dedicated Associate Director, I felt compelled to submit my resume for your review. As an organized and motivated professional with experience providing comprehensive administrative and operational support to optimize organizational performance and efficiency, I am confident that I would be a valuable asset to your team at the Jensen Museum.
My background includes excellent experience supporting management staff in running all facets of operations and programs. From developing and implementing procedures and coordinating meetings to overseeing activities and ensuring goal achievement, my experience has prepared me to excel in this role at your organization. Backed by my proven communication and multitasking capabilities, I excel at providing organizational and time-management expertise and ensuring seamless business operations for teams of any size.
Below are just a few

In [8]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

In [9]:
max_input = 256
max_target = 2048

def preprocess_data(example):
    names = example['name']
    skill = example['skills']
    experience = example['exp']
    job = example['job']
    texts = example['lettre']

    texts = [texts[i].replace(job[i],'<extra_id_0>').replace(experience[i],'<extra_id_1>').replace(skill[i],'<extra_id_2>').replace(names[i],'<extra_id_3>')  for i in range(len(names))]
    

    inputs =  ['Writte cover letter with this information: name: '+ names[i] + ', skill: ' + skill[i] + ', experience: '+ experience[i] + ', job: ' + job[i] for i in range(len(names))]
    # inputs =  ['<extra_id_0>' + job[i] + '<extra_id_1>'+ experience[i] + '<extra_id_2>' + skill[i] + '<extra_id_3>' + names[i] for i in range(len(names))]
    model_inputs = tokenizer(inputs, max_length=max_input, padding='max_length', truncation=True)
    
    labels = tokenizer(texts, max_length=max_target, padding="max_length", truncation=True).input_ids

    labels_with_ignore_index = []
    for labels_example in labels:
      labels_example = [label if label != 0 else -100 for label in labels_example]
      labels_with_ignore_index.append(labels_example)
    
    model_inputs["labels"] = labels_with_ignore_index

    return model_inputs

In [10]:
letter_datasets = letter_datasets.map(preprocess_data, batched=True)
letter_datasets

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['name', 'skills', 'exp', 'job', 'lettre', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1279
    })
    validation: Dataset({
        features: ['name', 'skills', 'exp', 'job', 'lettre', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 160
    })
    test: Dataset({
        features: ['name', 'skills', 'exp', 'job', 'lettre', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 160
    })
})

In [11]:
from torch.utils.data import DataLoader

letter_datasets.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])
train_dataloader = DataLoader(letter_datasets['train'], shuffle=True, num_workers=4)
valid_dataloader = DataLoader(letter_datasets['validation'], num_workers=4)
test_dataloader = DataLoader(letter_datasets['test'], num_workers=4)

In [12]:
batch = next(iter(train_dataloader))
print(batch.keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [13]:
labels = batch['labels'][0]
tokenizer.decode([label for label in labels if label != -100])

'Dear Mr. Blackwell: In response to your posting for a new<extra_id_0> to join your staff at Happy Hill Elementary School, please accept the enclosed resume for your review. As an accomplished professional with eight years of solid experience in youth athletic instruction and training for the Greenville YMCA, I am well positioned to leverage my background and skills to excel in this role. With experience planning and executing a wide variety of successful youth-focused athletic events, programs, and activities, I am committed to ensuring safe and fun physical education experiences for children. I excel at<extra_id_2>, with a particular emphasis on elementary-aged children. Furthermore, my commitment to instilling a lifelong dedication to personal health, nutrition, and physical exercise to program participants is sure to make me an immediate asset to your school in this position. Highlights of my experience include... Thriving as a Youth Program Instructor for the Greenville YMCA for t

In [14]:
from transformers import T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl

class CodeT5(pl.LightningModule):
    def __init__(self, lr=3e-4, num_train_epochs=15, warmup_steps=100):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-small")
        # self.model = GPT2LMHeadModel.from_pretrained("gpt2")


        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask, labels=None):     
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs
    
    def common_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs.loss

        return loss
      
    def training_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     
        # logs metrics for each training_step,
        # and the average across the epoch
        self.log("training_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     
        self.log("validation_loss", loss, on_epoch=True)

        return loss

    def test_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     

        return loss

    def configure_optimizers(self):
        # create optimizer
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)
        # create learning rate scheduler
        num_train_optimization_steps = self.hparams.num_train_epochs * len(train_dataloader)
        lr_scheduler = {'scheduler': get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=self.hparams.warmup_steps,
                                                    num_training_steps=num_train_optimization_steps),
                        'name': 'learning_rate',
                        'interval':'step',
                        'frequency': 1}
        
        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

    def train_dataloader(self):
        return train_dataloader

    def val_dataloader(self):
        return valid_dataloader

    def test_dataloader(self):
        return test_dataloader

In [15]:
import wandb

wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [16]:
model = CodeT5()

Downloading:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [17]:
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
import wandb

wandb_logger = WandbLogger(name='cover-letter-generator', project='CodeT5')
# for early stopping, see https://pytorch-lightning.readthedocs.io/en/1.0.0/early_stopping.html?highlight=early%20stopping
early_stop_callback = EarlyStopping(
    monitor='validation_loss',
    patience=3,
    strict=False,
    verbose=False,
    mode='min'
)
lr_monitor = LearningRateMonitor(logging_interval='step')

trainer = Trainer(accelerator='gpu', devices=1,
                  default_root_dir="/output/run", 
                  logger=wandb_logger, 
                  callbacks=[early_stop_callback, lr_monitor], max_epochs=100)
trainer.fit(model)

[34m[1mwandb[0m: Currently logged in as: [33mtokycedric[0m ([33mtoky[0m). Use [1m`wandb login --relogin`[0m to force relogin


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
241.969   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

# Sauvegarde du model

In [18]:
model.model.save_pretrained(".")

# Inference

In [20]:
test_set = load_dataset(data_files="LM_Dataset_all.csv", path='data', sep=';')



  0%|          | 0/1 [00:00<?, ?it/s]

# Another one

In [21]:
test_example = test_set['train'][8]
test_example

{'name': 'Nicholas B. Westberry',
 'skills': 'communication and problem solving',
 'exp': 'receiving shipments, unloading goods, verifying invoices, and stocking items',
 'job': 'Warehouse Worker',
 'lettre': 'Dear Mr. McKee:\nWith the enclosed resume, I would like to express my sincere interest in the Warehouse Worker position you have available. As a skilled, self-directed, and reliable professional with experience performing a range of general warehouse operations while ensuring compliance to regulations and guidelines, I possess the knowledge and experience to allow me to contribute toward the success of your company.\nMy background includes successfully receiving shipments, unloading goods, verifying invoices, and stocking items while working collaboratively with peers and management to maximize productivity and efficiency. Through my experience, I have become highly skilled at adhering to warehouse goals and requirements while facilitating streamlined operations.\nThe following a

In [22]:
from transformers import T5ForConditionalGeneration


tokenizer = T5Tokenizer.from_pretrained("t5-small")
model_test = T5ForConditionalGeneration.from_pretrained('.')

In [27]:
max_input = 256
max_target = 4096

def inference(row, model):
    inputs =  ['Writte cover letter with this information: name: '+ row['name'] + ', skill: ' + row['skills'] + ', experience: '+ row['exp'] + ', job: ' + row['job']]
    inputs_ids = tokenizer(inputs, max_length=256, truncation=True, return_tensors='pt').input_ids
    outputs = model.generate(inputs_ids, num_beams=8, do_sample=True, min_length=512, max_new_tokens=2048)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [28]:
input = inference(test_example, model_test)
print(input)

Dear Mr. Johnson: Upon review of your posting for a, I felt compelled to submit my resume for your consideration. With my strong understanding of accounting principles gained throughout my educational and recent professional background, as well as my superior skills, I feel confident that I would significantly benefit your company. From, my background has prepared me to excel in this role. With a solid foundation in accounting and finance, my communication and team leadership abilities position me ready to thrive in this challenging position. Highlights of my background include: Earning a Bachelor’s degree in Accounting and attaining comprehensive knowledge of strategic financial planning, tax preparation, and general accounting principles. Preparing financial and accounting-related reports, managing accounts payable / receivable, tax returns, and general accounting principles. Preparing financial and accounting-related reports and managing special projects while excelling within detai