In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1,3,4,5,6"
os.environ['TRANSFORMERS_CACHE'] = '/raid/rabikov/hf_cache/'
os.environ['HF_HOME'] = '/raid/rabikov/hf_cache/'

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, LlamaTokenizer, LlamaForCausalLM
import torch





  from .autonotebook import tqdm as notebook_tqdm


In [2]:

model_id = "meta-llama/Llama-2-70b-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)


In [3]:

model = LlamaForCausalLM.from_pretrained(model_id,
                                            quantization_config=bnb_config,
                                            device_map="auto", use_auth_token="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")
tokenizer = LlamaTokenizer.from_pretrained(model_id, use_auth_token="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")


Downloading (…)lve/main/config.json: 100%|██████████| 647/647 [00:00<00:00, 1.62MB/s]
Downloading (…)fetensors.index.json: 100%|██████████| 66.7k/66.7k [00:00<00:00, 640kB/s]
Downloading (…)of-00015.safetensors: 100%|██████████| 9.85G/9.85G [04:10<00:00, 39.4MB/s]
Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [03:12<00:00, 50.9MB/s]
Downloading (…)of-00015.safetensors: 100%|██████████| 9.97G/9.97G [03:01<00:00, 55.0MB/s]
Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [02:50<00:00, 57.3MB/s]
Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [02:33<00:00, 64.0MB/s]
Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [02:56<00:00, 55.6MB/s]
Downloading (…)of-00015.safetensors: 100%|██████████| 9.97G/9.97G [17:19<00:00, 9.58MB/s]
Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [18:59<00:00, 8.60MB/s]
Downloading (…)of-00015.safetensors: 100%|██████████| 9.80G/9.80G [31:43<00:00, 5.15MB/s]
Downloading (…)

In [14]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [15]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, 
    lora_alpha=16, 
   # target_modules=["query_key_value"], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)

In [16]:
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 3,504,607,232 || trainable%: 0.11967971650867153


In [17]:
tokenizer('hello')['input_ids']

[1, 22172]

In [18]:
model.generate(**tokenizer('hello', return_tensors='pt'))



KeyboardInterrupt: 

In [7]:
import sys
sys.path.append("../NLP-DL-Project-hypo-to-hyper/pipeline_src/")


from config.config import TaskConfig
from train import CustomScheduler, train
from logger.logger import WanDBWriter
from trainer.train_epoch import train_epoch, predict
from dataset.dataset import init_data
from logger.logger import WanDBWriter

from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import os
from torch import nn
import torch.nn.functional as F
import torch
import gc
import wandb
import json
import itertools
from collections import Counter
import pickle
import pandas as pd

from accelerate import Accelerator


In [8]:
accelerator = Accelerator()

accelerator.device

device(type='cuda')

In [9]:
config = TaskConfig()


config.n_epochs = 1
config.batch_size = 2
config.lr = 3e-4
config.min_lr = 3e-6

config.validation = 1000
config.save_every = 1000
config.compute_metrics_every = 1000

config.data_path = 'babel_datasets/wnet_only/reweighted_wnet_train_en_babel.pickle'
config.gold_path = (
    None  # "SemEval2018-Task9/training/gold/1A.english.training.gold.txt"
)
config.test_data_path = 'babel_datasets/wnet_only/reweighted_wnet_test_en_babel.pickle'
config.test_gold_path = (
    None  # "SemEval2018-Task9/test/gold/1A.english.test.gold.txt"
)

config.device = accelerator.device
#config.device = 'cuda'
config.using_peft = True
config.model_type = "Auto"  # Auto or Llama
config.wandb_log_dir = "/raid/rabikov/wandb/"
config.model_checkpoint = "EleutherAI/gpt-neo-125m"
config.exp_name = config.model_checkpoint.replace("/", "-") + '_test_accelerate'
config.saving_path = "/raid/rabikov/model_checkpoints/" + config.exp_name

In [10]:
train_dataset, test_dataset, train_loader, val_loader = init_data(tokenizer, config)

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(), lr=config.lr, betas=(0.9, 0.98), eps=1e-9
)
# scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=config.lr,
#                                                  steps_per_epoch=len(train_loader), epochs=config.n_epochs)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=len(train_loader) * config.n_epochs, eta_min=config.min_lr
)

In [None]:
pbar = tqdm(enumerate(train_loader), total=len(train_loader))
for batch_idx, batch in pbar:


    terms, att_mask_terms, targets, input_seqs, att_mask_input, labels = batch

    output = model.forward(
        input_seqs.to(config.device).long(),
        attention_mask=att_mask_input.to(config.device).long(),
        labels=labels.to(config.device).long(),
    )
    # output = model.forward(
    #     input_seqs,
    #     attention_mask=att_mask_input,
    #     labels=labels,
    # )

    optimizer.zero_grad()
    loss = output["loss"]
    loss.backward()
    optimizer.step()
    scheduler.step()
    
    torch.cuda.empty_cache()

In [12]:


model, optimizer, training_dataloader, scheduler = accelerator.prepare(
    model, optimizer, train_loader, scheduler
)

In [13]:
pbar = tqdm(enumerate(train_loader), total=len(train_loader))
for batch_idx, batch in pbar:


    terms, att_mask_terms, targets, input_seqs, att_mask_input, labels = batch

    # output = model.forward(
    #     input_seqs.to(config.device).long(),
    #     attention_mask=att_mask_input.to(config.device).long(),
    #     labels=labels.to(config.device).long(),
    # )
    output = model.forward(
        input_seqs,
        attention_mask=att_mask_input,
        labels=labels,
    )

    optimizer.zero_grad()
    loss = output["loss"]
    accelerator.backward(loss)
    optimizer.step()
    scheduler.step()
    
    torch.cuda.empty_cache()

  0%|          | 0/7769 [00:00<?, ?it/s]

 62%|██████▏   | 4818/7769 [1:47:44<1:05:59,  1.34s/it]


KeyboardInterrupt: 

In [16]:
accelerator.device

device(type='cuda')

In [7]:
txt = tokenizer('hello how are you doing?')

In [8]:
label = tokenizer.encode('i am doing well', return_tensors='pt', add_special_tokens=False)

In [26]:
label[0]

tensor([ 474,  626, 2599, 1532])

In [9]:
input_seq = torch.concat([txt['input_ids'], label], dim=1)

TypeError: expected Tensor as element 0 in argument 0, but got list

In [10]:
processed_term  = 'how are you?'
target = "great!"


encoded_term = tokenizer.encode(
    processed_term, return_tensors='pt'
)
encoded_target = tokenizer.encode(target, return_tensors='pt', add_special_tokens=False)

input_seq = torch.concat([encoded_term, encoded_target], dim=1)
labels = input_seq.clone()
labels[0, : encoded_term.size()[1]] = -100

att_mask_inputs = torch.zeros_like(input_seq)
att_mask_inputs[input_seq != 0] = 1

In [11]:
labels, input_seq

(tensor([[ -100,  -100,  -100,  -100,  -100,  2107, 29991]]),
 tensor([[    0,   920,   526,   366, 29973,  2107, 29991]]))

In [12]:
out = model.forward(input_ids=input_seq, attention_mask=att_mask_inputs, labels=labels)

In [13]:
loss = out.loss

In [14]:
loss

tensor(5.7479, grad_fn=<ToCopyBackward0>)

In [43]:
loss.backward()

In [44]:
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

In [45]:
optimizer.step()

In [46]:
out = model.forward(input_ids=input_seq, attention_mask=att_mask_inputs, labels=labels)

In [48]:
out.loss.backward()

In [49]:
optimizer.zero_grad()
optimizer.step()

In [50]:
out = model.forward(input_ids=input_seq, attention_mask=att_mask_inputs, labels=labels)

In [52]:
for i in range(10):
    optimizer.zero_grad()

    out = model.forward(input_ids=input_seq, attention_mask=att_mask_inputs, labels=labels)
    loss = out.loss
    print(loss)
    loss.backward()
    optimizer.step()

tensor(5.7083, grad_fn=<ToCopyBackward0>)
tensor(5.6528, grad_fn=<ToCopyBackward0>)
tensor(5.5960, grad_fn=<ToCopyBackward0>)
tensor(5.5263, grad_fn=<ToCopyBackward0>)
tensor(5.4481, grad_fn=<ToCopyBackward0>)
tensor(5.3696, grad_fn=<ToCopyBackward0>)
tensor(5.2882, grad_fn=<ToCopyBackward0>)
tensor(5.1871, grad_fn=<ToCopyBackward0>)
tensor(5.0791, grad_fn=<ToCopyBackward0>)
tensor(4.9481, grad_fn=<ToCopyBackward0>)
