In [1]:
pip install git+https://github.com/lxuechen/private-transformers.git

Collecting git+https://github.com/lxuechen/private-transformers.git
  Cloning https://github.com/lxuechen/private-transformers.git to /tmp/pip-req-build-kzk3vpkb
  Running command git clone --filter=blob:none --quiet https://github.com/lxuechen/private-transformers.git /tmp/pip-req-build-kzk3vpkb
  Resolved https://github.com/lxuechen/private-transformers.git to commit 18ccc4eab7355e4ac96051a82434796f6aa4624b
  Preparing metadata (setup.py) ... [?25ldone
Collecting prv-accountant (from private-transformers==0.2.3)
  Downloading prv_accountant-0.2.0-py3-none-any.whl (21 kB)
Collecting jupyter (from private-transformers==0.2.3)
  Downloading jupyter-1.0.0-py2.py3-none-any.whl (2.7 kB)
Collecting ml-swissknife (from private-transformers==0.2.3)
  Downloading ml_swissknife-0.1.29-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.4/67.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting gputil (from ml-swissknife->private-transformers==0.2.3)

In [2]:
!pip install tqdm



In [13]:
from tqdm import tqdm
import transformers
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from private_transformers import PrivacyEngine

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each text with its label.

    Every text returned by indexing has a single space and ``eos_token``
    appended to it; the stored texts themselves are not modified.
    """

    def __init__(self, texts, labels, eos_token):
        """Keep references to the texts, their labels and the end-of-sequence marker."""
        self.texts, self.y, self.eos_token = texts, labels, eos_token

    def __len__(self):
        """Number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``(text + ' ' + eos_token, label)`` for position ``index``."""
        sample = ' '.join((self.texts[index], self.eos_token))
        target = self.y[index]
        return sample, target


def get_data_from_txt(path: str):
    """Read a ``<label> <text>`` file into parallel lists.

    Each non-blank line is expected to start with an integer label, followed
    by a single space and the text. Everything after that first space is kept
    verbatim (minus the newline). A line with no space yields an empty text.

    Args:
        path: Location of the text file to read.

    Returns:
        A ``(texts, labels)`` tuple: a list of strings and a list of ints of
        the same length, in file order.

    Raises:
        ValueError: If a non-blank line does not start with an integer label.
    """
    texts = []
    labels = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # example; skip them instead of failing on int('').
            if not line.strip():
                continue
            # Split once at the first space: label on the left, text on the right.
            label, _, text = line.partition(' ')
            labels.append(int(label))
            texts.append(text.replace('\n', ''))

    return texts, labels

def forward_step(texts, tokenizer, model):
    """Compute one language-modeling loss value per text in the batch.

    The texts are tokenized (truncated to 500 tokens, padded to the longest
    in the batch), run through ``model``, and scored with next-token
    cross-entropy. Positions that the tokenizer marked as padding are left
    out of each example's average.

    For a single un-padded text this equals the mean next-token loss the
    model itself reports, but unlike that batch-averaged scalar the result
    stays per-example for any batch size.

    Args:
        texts: Batch of strings to score.
        tokenizer: Callable returning an object with ``input_ids`` and
            ``attention_mask`` tensors when called with
            ``return_tensors='pt'`` and ``padding=True``.
        model: Causal language model whose forward output exposes ``logits``.

    Returns:
        A 1-D tensor of shape ``(len(texts),)`` with the mean token loss of
        each example.
    """
    # Follow the model instead of assuming it lives on the CPU.
    device = next(model.parameters()).device

    encoded = tokenizer(texts, truncation=True, max_length=500, return_tensors='pt', padding=True)
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)

    logits = model(input_ids, attention_mask=attention_mask).logits

    # Position t predicts token t + 1, so drop the last logit and the first target.
    shift_logits = logits[:, :-1, :]
    shift_targets = input_ids[:, 1:]
    shift_mask = attention_mask[:, 1:].to(shift_logits.dtype)

    token_loss = torch.nn.functional.cross_entropy(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_targets.reshape(-1),
        reduction='none',
    ).view(shift_targets.shape)

    # Average over real (non-padding) target positions only; the clamp avoids
    # a 0/0 for a sequence that has no target position at all.
    lm_loss = (token_loss * shift_mask).sum(dim=1) / shift_mask.sum(dim=1).clamp(min=1)

    return lm_loss



    
def train_llm(train_data, train_loader,
              epochs=2, target_epsilon=0.5, lr=8e-6, max_grad_norm=0.1):
    """Fine-tune pretrained GPT-2 with a ``PrivacyEngine`` attached to Adam.

    Args:
        train_data: Sized dataset the loader draws from; its length is used
            as the privacy engine's ``sample_size`` and to average the
            reported loss.
        train_loader: Iterable yielding ``(texts, labels)`` batches. Only the
            texts are used for training.
        epochs: Number of passes over ``train_loader``.
        target_epsilon: Privacy budget handed to the ``PrivacyEngine``.
        lr: Adam learning rate.
        max_grad_norm: Clipping norm handed to the ``PrivacyEngine``.

    Returns:
        The trained ``GPT2LMHeadModel``.

    Raises:
        ValueError: If ``train_data`` is empty.
    """
    sample_size = len(train_data)
    if sample_size == 0:
        raise ValueError('train_data must contain at least one example')
    # Take the batch size from the loader so the engine is configured with the
    # value actually used for training; fall back to 1 for plain iterables
    # that do not expose a batch_size.
    batch_size = getattr(train_loader, 'batch_size', None) or 1

    model = GPT2LMHeadModel.from_pretrained("gpt2")
    model.train()
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # GPT-2 ships without a padding token; reuse end-of-text for padding.
    tokenizer.pad_token = tokenizer.eos_token
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    print(epochs)
    print(target_epsilon)

    # sample_size / batch_size / epochs must describe the real training run,
    # otherwise the engine is calibrated for a different run than the one
    # executed below.
    privacy_engine = PrivacyEngine(
        model,
        batch_size=batch_size,
        sample_size=sample_size,
        epochs=epochs,
        max_grad_norm=max_grad_norm,
        target_epsilon=target_epsilon,
    )
    privacy_engine.attach(optimizer)

    for epoch in range(epochs):
        print(f'training epoch {epoch}')

        total_loss = 0
        for texts, labels in tqdm(train_loader):
            lm_loss = forward_step(texts, tokenizer, model)

            # The attached engine's step takes the loss tensor directly.
            optimizer.step(loss=lm_loss)

            # Sum rather than .item() on the raw tensor so a loss vector with
            # more than one entry is accumulated correctly.
            total_loss += lm_loss.sum().item()

        print('total language modeling loss', total_loss/sample_size)

    print()
    print('model training done!')
    print()

    return model



# Load the IMDB training file and keep only the first 100 examples.
train_texts, train_labels = get_data_from_txt('/kaggle/input/imdbreview/imdb_train.txt')
train_texts, train_labels = train_texts[:100], train_labels[:100]

# Wrap the examples so that every text is served with the GPT-2 end-of-text
# marker appended, one example per batch, in file order.
train_data = Dataset(train_texts, train_labels, '<|endoftext|>')
train_loader = torch.utils.data.DataLoader(train_data, batch_size=1, shuffle=False)

# Fine-tune and show the resulting module tree.
pmodel = train_llm(train_data, train_loader)
print(pmodel)

2
0.5
training epoch 0


100%|██████████| 100/100 [03:27<00:00,  2.07s/it]


total language modeling loss 4.433933084011078
training epoch 1


100%|██████████| 100/100 [03:25<00:00,  2.06s/it]

total language modeling loss 3.9711666679382325

model training done!

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)



