In [1]:
# Import the `dataset` package (presumably the project's own package — confirm)
# and reload it together with its `dataset.dataset` submodule so that edits made
# on disk are picked up without restarting the kernel.
import importlib
import dataset
import dataset.dataset
importlib.reload(dataset)
importlib.reload(dataset.dataset)
# Imported after the reloads so the name is bound to the reloaded function.
from dataset.dataset import get_dataset_by_name
from transformers import AutoTokenizer
# Tokenizer loaded from the "meta-llama/Llama-2-7b-chat-hf" checkpoint id.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the test split of the "finance" task, handing the tokenizer to the loader.
# NOTE: this rebinds `dataset`, which until now referred to the imported
# `dataset` package.
dataset = get_dataset_by_name(
    "finance",
    tokenizer=tokenizer,
    split='test',
)
# Restrict the returned columns to the four listed here.
dataset.set_format(
    columns=[
        'input_ids',
        'attention_mask',
        'label',
        'raw_text',
    ]
)



In [3]:
from transformers import AutoTokenizer
from transformers import PreTrainedTokenizerBase
import numpy as np
from transformers import TrainingArguments, Trainer
from models.config import LlamaCLConfig
from transformers.modeling_utils import PreTrainedModel, unwrap_model
from models.slm import ScalableLM
import torch
import torch.nn as nn
from dataclasses import dataclass
from typing import Optional, Union, List, Any, Dict
from dataset.dataset import get_dataset_by_name, IGNORE_INDEX
import os
import re
import argparse
# Padding value for label sequences. -100 is the default `ignore_index` of
# torch.nn.CrossEntropyLoss. This rebinds the IGNORE_INDEX name imported from
# dataset.dataset above.
IGNORE_INDEX = -100


@dataclass
class DataCollatorWithPadding:
    """Collate a list of example dicts into one right-padded batch.

    Attributes:
        eos_token_id: Integer token id used to pad ``input_ids``.
        task: Task name copied verbatim into every batch under ``"task"``.
    """

    eos_token_id: int
    task: str = "finance"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Build a batch from ``features``.

        Each feature must provide ``input_ids``, ``attention_mask``,
        ``raw_text`` and either ``labels`` or ``label``. When the first
        feature has a ``labels`` key it is used; otherwise ``label`` is.

        Args:
            features: One dict per example. The dicts are not modified.

        Returns:
            A dict with ``task`` (str), ``raw_text`` (list with one entry per
            example) and the tensors ``input_ids`` (padded with
            ``eos_token_id``), ``attention_mask`` (padded with 0) and
            ``labels`` (padded with ``IGNORE_INDEX``), all batch-first.

        Raises:
            KeyError: If a feature lacks one of the required keys.
        """
        # Read raw_text without popping so the caller's dicts stay intact.
        # (Original note here: "num_labels can not use" — reason not visible
        # in this file.)
        batch = dict(task=self.task,
                     raw_text=[feature['raw_text'] for feature in features])
        # `features` is a list of dicts, so the key has to be looked up on an
        # example; testing membership on the list itself is always False.
        label_key = 'labels' if features and 'labels' in features[0] else 'label'
        input_ids, attention_mask, labels = (
            [torch.tensor(feature[key]) for feature in features]
            for key in ('input_ids', 'attention_mask', label_key)
        )
        input_ids = nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.eos_token_id
        )
        attention_mask = nn.utils.rnn.pad_sequence(
            attention_mask, batch_first=True, padding_value=0
        )
        labels = nn.utils.rnn.pad_sequence(
            labels, batch_first=True, padding_value=IGNORE_INDEX
        )
        batch.update({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        })
        return batch
# Collator for the "finance" task; input_ids are padded with the tokenizer's EOS id.
data_collator = DataCollatorWithPadding(
    task="finance",
    eos_token_id=tokenizer.eos_token_id,
)

[2023-08-17 11:08:54,207] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [4]:
from torch.utils.data import DataLoader

# Serve the dataset one example per batch, collated by `data_collator`.
data_loader = DataLoader(
    dataset,
    collate_fn=data_collator,
    batch_size=1,
)

In [5]:
# Pull the first collated batch from the loader; `it` keeps the iterator so
# further batches can be fetched with next(it).
it = iter(data_loader)
data=next(it)

In [6]:
# Display the batch fetched from the data loader.
data

{'task': 'finance',
 'raw_text': ["What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive} ### The Group 's consolidated net sales for 2009 totaled 1.5 billion euros and it employs approximately 10,000 persons ."],
 'input_ids': tensor([[    1,   518, 25580, 29962,  3532, 14816, 29903,  6778,    13,  5618,
            338,   278, 19688,   310,   445,  7780,   300, 29973,  3529,  6755,
            385,  1234,   515,   426, 22198, 29914, 17821,  1705, 29914,  1066,
           3321, 29913,    13, 29966,   829, 14816, 29903,  6778,    13,    13,
           1576,  6431,   525, 29879,  1136, 17211,   630,  7787, 16538,   363,
          29871, 29906, 29900, 29900, 29929,  2025,  7943, 29871, 29896, 29889,
          29945, 24464, 11878,  1883,   322,   372,  3710,   417,   952, 14235,
          29871, 29896, 29900, 29892, 29900, 29900, 29900, 12407,   869,   518,
          29914, 25580, 29962, 29871]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1

In [7]:
# Build the model from the default LlamaCLConfig.
config = LlamaCLConfig()
model = ScalableLM(config)
# NOTE(security): torch.load unpickles the file, so only load checkpoints from
# a trusted source.
# map_location="cpu" keeps the checkpoint tensors in host memory regardless of
# the device they were saved from; load_state_dict then copies them into the
# model's own parameters.
# strict=False tolerates keys that are absent from the checkpoint. The returned
# _IncompatibleKeys is left as the cell's last expression so it is displayed.
model.load_state_dict(
    torch.load(
        "/data/bhpeng/SLM-llama/outputs/finance_history_medical/pytorch_model.bin",
        map_location="cpu",
    ),
    strict=False,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.88it/s]




_IncompatibleKeys(missing_keys=['model.model.embed_tokens.weight', 'model.model.layers.0.self_attn.q_proj.weight', 'model.model.layers.0.self_attn.k_proj.weight', 'model.model.layers.0.self_attn.v_proj.weight', 'model.model.layers.0.self_attn.o_proj.weight', 'model.model.layers.0.self_attn.rotary_emb.inv_freq', 'model.model.layers.0.mlp.gate_proj.weight', 'model.model.layers.0.mlp.up_proj.weight', 'model.model.layers.0.mlp.down_proj.weight', 'model.model.layers.0.input_layernorm.weight', 'model.model.layers.0.post_attention_layernorm.weight', 'model.model.layers.1.self_attn.q_proj.weight', 'model.model.layers.1.self_attn.k_proj.weight', 'model.model.layers.1.self_attn.v_proj.weight', 'model.model.layers.1.self_attn.o_proj.weight', 'model.model.layers.1.self_attn.rotary_emb.inv_freq', 'model.model.layers.1.mlp.gate_proj.weight', 'model.model.layers.1.mlp.up_proj.weight', 'model.model.layers.1.mlp.down_proj.weight', 'model.model.layers.1.input_layernorm.weight', 'model.model.layers.1.pos

In [11]:
# Move the model to the CUDA device and switch it to evaluation mode.
model = model.to("cuda").eval()

{'task': 'finance',
 'raw_text': ["What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive} ### The Group 's consolidated net sales for 2009 totaled 1.5 billion euros and it employs approximately 10,000 persons ."],
 'input_ids': tensor([[    1,   518, 25580, 29962,  3532, 14816, 29903,  6778,    13,  5618,
            338,   278, 19688,   310,   445,  7780,   300, 29973,  3529,  6755,
            385,  1234,   515,   426, 22198, 29914, 17821,  1705, 29914,  1066,
           3321, 29913,    13, 29966,   829, 14816, 29903,  6778,    13,    13,
           1576,  6431,   525, 29879,  1136, 17211,   630,  7787, 16538,   363,
          29871, 29906, 29900, 29900, 29929,  2025,  7943, 29871, 29896, 29889,
          29945, 24464, 11878,  1883,   322,   372,  3710,   417,   952, 14235,
          29871, 29896, 29900, 29892, 29900, 29900, 29900, 12407,   869,   518,
          29914, 25580, 29962, 29871]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1

In [13]:
# Run generation on the fetched batch. The tensors are moved to the GPU to match
# the model; raw_text is forwarded as an extra keyword argument.
out = model.generate(
    input_ids=data['input_ids'].to('cuda'),
    attention_mask=data['attention_mask'].to('cuda'),
    raw_text=data['raw_text'],
    max_length=512,
)

In [14]:
# Display the tensor returned by model.generate.
out

tensor([[    1,   518, 25580, 29962,  3532, 14816, 29903,  6778,    13,  5618,
           338,   278, 19688,   310,   445,  7780,   300, 29973,  3529,  6755,
           385,  1234,   515,   426, 22198, 29914, 17821,  1705, 29914,  1066,
          3321, 29913,    13, 29966,   829, 14816, 29903,  6778,    13,    13,
          1576,  6431,   525, 29879,  1136, 17211,   630,  7787, 16538,   363,
         29871, 29906, 29900, 29900, 29929,  2025,  7943, 29871, 29896, 29889,
         29945, 24464, 11878,  1883,   322,   372,  3710,   417,   952, 14235,
         29871, 29896, 29900, 29892, 29900, 29900, 29900, 12407,   869,   518,
         29914, 25580, 29962, 29871, 21104, 29889,     2]], device='cuda:0')