In [23]:
%load_ext autoreload
%autoreload 2


import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import sys
import torch
import pandas as pd
from torch import nn
import numpy as np
from torch.optim.lr_scheduler import ExponentialLR
import wandb

sys.path.append("../NLP-DL-Project-hypo-to-hyper/pipeline_src/")


from config.config import TaskConfig
from train import CustomScheduler, train
from logger.logger import WanDBWriter
from trainer.train_epoch import train_epoch, predict
from metrics.metrics import get_all_metrics
from dataset.dataset import init_data
from logger.logger import WanDBWriter


# Pick the compute device once; it is stored on the config as `config.device`.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("GPU" if device == "cuda" else "CPU")


# Seed every RNG this notebook imports so that reruns are comparable.
# Previously only torch was seeded: NumPy's global RNG was left unseeded, and
# `torch.cuda.manual_seed` seeds just the current CUDA device even though
# several GPUs are made visible through CUDA_VISIBLE_DEVICES.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # silently ignored when CUDA is unavailable
print(torch.cuda.device_count())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
GPU
4


In [24]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    LlamaTokenizer,
    LlamaForCausalLM,
)

from peft import LoraConfig, get_peft_model, get_peft_model_state_dict

In [28]:
# Experiment configuration: create a TaskConfig and set the fields this
# evaluation run relies on.
config = TaskConfig()
config.batch_size = 2

# Train / test pickles. The gold files are left unset; the SemEval paths are
# kept here for reference only:
#   train gold: "SemEval2018-Task9/training/gold/1A.english.training.gold.txt"
#   test gold:  "SemEval2018-Task9/test/gold/1A.english.test.gold.txt"
config.data_path = 'babel_datasets/train_en_babel.pickle'
config.gold_path = None
config.test_data_path = 'babel_datasets/test_en_babel.pickle'
config.test_gold_path = None

# Model selection and runtime options.
config.device = device
config.using_peft = True
config.model_type = "Llama"  # Auto or Llama
config.model_checkpoint = "eachadea/vicuna-7b-1.1"
config.wandb_log_dir = "/raid/rabikov/wandb/"

# The experiment name is the checkpoint id with "/" swapped for "-".
config.exp_name = config.model_checkpoint.replace("/", "-")

checkpoint_root = "/raid/rabikov/model_checkpoints/"
config.saving_path = checkpoint_root + config.exp_name

# Checkpoint file whose weights are restored before evaluation.
load_path = checkpoint_root + "eachadea-vicuna-7b-1.1_custom_epoch=3_MAP=0.0.pth"

In [29]:
# Map the configured model family onto the matching transformers classes.
# An unrecognised `config.model_type` used to match neither branch and only
# surfaced later as a NameError on `model_type`; fail early with a clear message.
if config.model_type == "Auto":
    model_type = AutoModelForCausalLM
    tokenizer_type = AutoTokenizer
elif config.model_type == "Llama":
    model_type = LlamaForCausalLM
    tokenizer_type = LlamaTokenizer
else:
    raise ValueError(
        f"Unsupported config.model_type {config.model_type!r}; expected 'Auto' or 'Llama'"
    )

# Load the pretrained weights in float16 with automatic device placement.
model = model_type.from_pretrained(
    config.model_checkpoint,
    # load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

# The tokenizer is configured to pad on the left side of each sequence.
tokenizer = tokenizer_type.from_pretrained(
    config.model_checkpoint,
    padding_side="left",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [30]:
if config.using_peft:
    # LoRA hyper-parameters for the adapter wrapped around the loaded model.
    LORA_R, LORA_ALPHA, LORA_DROPOUT = 8, 16, 0.05
    # Not passed to LoraConfig (the `target_modules` argument is commented out).
    LORA_TARGET_MODULES = ["q", "v"]

    # model = prepare_model_for_int8_training(model)
    lora_settings = dict(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        # target_modules=LORA_TARGET_MODULES,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
    )
    config_lora = LoraConfig(**lora_settings)

    # Rebind `model` to the PEFT-wrapped model and report its trainable size.
    model = get_peft_model(model, config_lora)
    model.print_trainable_parameters()

trainable params: 4194304 || all params: 6742609920 || trainable%: 0.06220594176090199


In [31]:
# Build the datasets and data loaders from the tokenizer and config
# (`init_data` is a project helper imported from dataset.dataset).
train_dataset, test_dataset, train_loader, val_loader = init_data(tokenizer, config)

In [32]:
# Sanity check: display one raw training record.
train_dataset.data[12]

{'children': 'milliard.n.1',
 'parents': 'billion.n.3',
 'grandparents': 'large_integer.n.1',
 'case': 'only_child_leaf'}

In [33]:
# Sanity check: decode a tokenized training example back to text to inspect
# the prompt/answer format the model is trained on.
tokenizer.decode(train_dataset[1]['input_seq'])

"<s> Predict hyponyms for the word 'digit.n.1'.  Answer:<s>zero.n.2, three.n.1, four.n.1, five.n.1, six.n.1, seven.n.1, eight.n.1, nine.n.1, binary_digit.n.1, decimal_digit.n.1, duodecimal_digit.n.1, hexadecimal_digit.n.1, octal_digit.n.1, significant_digit.n.1, one.n.1, two.n.1"

In [34]:
# Restore the saved weights from `load_path`. The file is deserialized onto
# the CPU, and only its "model" entry is loaded into the model.
checkpoint = torch.load(load_path, map_location="cpu")
saved_weights = checkpoint["model"]
del checkpoint
model.load_state_dict(saved_weights)

# Drop the remaining reference to the loaded tensors and release cached GPU memory.
del saved_weights
torch.cuda.empty_cache()

In [37]:
# Generation settings stored on `config.gen_args` (presumably read by
# `predict` -- confirm in trainer.train_epoch).
#
# Two dicts used to be assigned to `config.gen_args` back to back, so the
# first was a dead store and it was easy to misread which settings were in
# effect. Both are kept as named presets and exactly one is selected below.

# Diverse beam search returning two sequences per prompt (not active).
GEN_ARGS_DIVERSE_BEAMS = {
    "no_repeat_ngram_size": 2,
    "max_new_tokens": 32,
    "num_return_sequences": 2,
    "num_beams": 15,
    "early_stopping": True,
    "num_beam_groups": 5,
    "diversity_penalty": 1.0,
    "temperature": 0.9,
}

# Plain beam search with a short generation budget (active).
GEN_ARGS_SHORT_BEAMS = {
    "no_repeat_ngram_size": 2,
    "num_beams": 5,
    "early_stopping": True,
    "max_new_tokens": 8,
    "temperature": 0.95,
}

# NOTE(review): neither preset sets `do_sample`; `temperature` is a sampling
# parameter, so unless `predict` enables sampling it likely has no effect.
config.gen_args = GEN_ARGS_SHORT_BEAMS

In [38]:
# With PEFT enabled, prediction runs on the wrapper's `.model` attribute
# rather than on the wrapper itself; otherwise the model is used directly.
generation_model = model.model if config.using_peft else model
all_preds, all_labels = predict(generation_model, tokenizer, val_loader, config)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

eval going: 100%|██████████| 605/605 [2:17:14<00:00, 13.61s/it]  


In [43]:
# Sanity check: display the first ten raw predictions.
all_preds[:10]

['\x04\x04\x04\x04\x04\x04\x04',
 '\x04\x04\x04\x04\x04\x04\x04',
 '\x04\x04\x04\x04\x04\x04\x04',
 '\x04\x04\x04\x04\x04\x04\x04',
 '\x04\x04\x04\x04\x04\x04\x04',
 '\x04\x04\x04\x04\x04\x04\x04',
 '\x04\x04\x04\x04\x04\x04\x04',
 '\x04\x04\x04\x04\x04\x04\x04',
 '\x04\x04\x04\x04\x04\x04\x04',
 '\x04\x04\x04\x04\x04\x04\x04']

In [18]:
# Shallow copy of the labels into a new list, leaving `all_labels` untouched.
all_labels1 = [label for label in all_labels]

In [19]:
def transform(label):
    """Reduce a comma-separated list of dotted names to their leading parts.

    Each comma-separated item is stripped of surrounding whitespace and cut at
    its first ``.`` (so ``"zero.n.2"`` becomes ``"zero"``); the pieces are then
    re-joined with ``", "``. Items without a dot are kept as they are.
    """
    lemmas = [item.strip().partition('.')[0] for item in label.split(',')]
    return ', '.join(lemmas)
# Apply `transform` to every label string.
all_labels2 = [transform(raw_label) for raw_label in all_labels1]

In [27]:
# Delete every '.' character from each prediction string before scoring.
# NOTE(review): labels are cut at the first '.' by `transform`, whereas here
# dots are only removed (a prediction such as "zero.n.2" would become
# "zeron2") -- confirm predictions never carry such suffixes.
all_preds2 = [pred.replace('.', '') for pred in all_preds]

In [28]:
# Score the cleaned predictions against the cleaned labels with the project's
# metric helper (imported from metrics.metrics).
metrics = get_all_metrics(all_labels2, all_preds2, limit=15)

In [34]:
# Spot-check one cleaned label / prediction pair.
all_labels2[17], all_preds2[17]

('black_locust, clammy_locust', 'chrysanthemum')

In [29]:
# Display the computed metrics.
metrics

{'MRR': 0.0, 'MAP': 0.0, 'P@1': 0.0, 'P@3': 0.0, 'P@5': 0.0, 'P@15': 0.0}

In [15]:
import pickle

# Persist the raw predictions. The target file name is the configured
# predictions prefix followed by the experiment name and a trailing "_".
saving_path = config.saving_predictions_path + config.exp_name + "_"

with open(saving_path, mode="wb") as predictions_file:
    pickle.dump(all_preds, predictions_file)

In [17]:
# Spot-check one raw prediction next to its label.
all_preds[4], all_labels[4]

('person, actor, film director, cinematography, filmmaker, visual arts, person',
 'thrower, baseball player, jock, person')