In [7]:
pip install git+https://github.com/lxuechen/private-transformers.git

Collecting git+https://github.com/lxuechen/private-transformers.git
  Cloning https://github.com/lxuechen/private-transformers.git to /tmp/pip-req-build-_c60jel1
  Running command git clone --filter=blob:none --quiet https://github.com/lxuechen/private-transformers.git /tmp/pip-req-build-_c60jel1
  Resolved https://github.com/lxuechen/private-transformers.git to commit 18ccc4eab7355e4ac96051a82434796f6aa4624b
  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [8]:
!pip install tqdm



In [9]:
from tqdm import tqdm
import transformers
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from private_transformers import PrivacyEngine
#from text_dataset import Dataset


class Dataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, eos_token):
       self.texts = texts
       self.y = labels
       self.eos_token = eos_token

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index] + ' ' + self.eos_token
        label = self.y[index]

        return text, label

def pre_process(texts, labels, prompts):
    correct_texts = []
    wrong_texts = []

    for text, label in zip(texts, labels):
        correct = prompts[label] + ' ' + text
        correct_texts.append(correct)

        wrong = [prompts[l] + ' ' + text for l in range(len(prompts)) if l != label]
        wrong_texts.extend(wrong)

    return correct_texts, wrong_texts

def get_data_from_txt(path: str):
    texts = []
    labels = []
    with open(path, 'r') as f:
        for line in f:
            texts.append(' '.join(line.split(' ')[1:]).replace('\n', ''))
            labels.append(int(line.split(' ')[0]))
    
    return texts, labels

def forward_step(correct_texts, wrong_texts, tokenizer, model, mismatch_loss, mismatch_weight):
    tokenized_texts = tokenizer(correct_texts, truncation=True, max_length=500, return_tensors='pt', padding=True).input_ids.to('cpu')
    tokenized_texts_wrong = tokenizer(wrong_texts, truncation=True, max_length=500, return_tensors='pt', padding=True).input_ids.to('cpu')

    lm_loss = model(tokenized_texts, labels=tokenized_texts).loss.unsqueeze(dim=0)

    if mismatch_loss:
        lm_loss -= mismatch_weight * model(tokenized_texts_wrong, labels=tokenized_texts_wrong).loss.unsqueeze(dim=0)
    
    return lm_loss


def train_llm(train_data, train_loader):
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    #model.parallelize()
    model.train()
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    optimizer = torch.optim.Adam(model.parameters(), lr = 8e-6)
    args_epochs=2
    print(args_epochs)
    epsilon=0.5
    print(epsilon)
    
    args_mismatch_loss=0.25
    args_mismatch_weight=0.01
   
    privacy_engine = PrivacyEngine(
            model,
            batch_size=1,
            sample_size=10,
            epochs=args_epochs,
            max_grad_norm=0.1,
            target_epsilon=epsilon,
        )
    privacy_engine.attach(optimizer)
    
    args_prompts=["Write a negative review about a bad movie:", "Write a positive review about a good movie:"]

    for epoch in range(args_epochs):
        print(f'training epoch {epoch}')

        total_loss = 0
        for texts, labels in tqdm(train_loader):
            correct_texts, wrong_texts = pre_process(texts, labels, args_prompts)
            lm_loss = forward_step(correct_texts, wrong_texts, tokenizer, model, args_mismatch_loss, args_mismatch_weight)

           
            optimizer.step(loss=lm_loss)
            
            total_loss += lm_loss.item()

        print('total language modeling loss', total_loss/len(train_data))
       

    print()
    print('model training done!')
    print()

   
    return model

train_texts, train_labels = get_data_from_txt('/kaggle/input/imdbreview/imdb_train.txt')
print(type(train_texts))
print(type(train_labels))
train_texts = train_texts[0:100]
train_labels =train_labels[0:100]
train_data = Dataset(train_texts, train_labels, '<|endoftext|>')
train_loader = torch.utils.data.DataLoader(train_data, shuffle=False, batch_size=1)
    
pmodel = train_llm(train_data,train_loader)

print(pmodel)

<class 'list'>
<class 'list'>
2
0.5
training epoch 0


100%|██████████| 100/100 [08:33<00:00,  5.13s/it]


total language modeling loss 4.40100430727005
training epoch 1


100%|██████████| 100/100 [08:28<00:00,  5.08s/it]

total language modeling loss 3.7247038793563845

model training done!

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)



