# 使用 AWQ 算法量化 Facebook OPT-2.7B 模型

In [1]:
from transformers import pipeline

# Hub id of the unquantized baseline; the quantization cell loads from the
# same `model_path`.
model_path = "facebook/opt-2.7b"

# Sampling text-generation pipeline on device index 0 that returns three
# candidate continuations for every prompt.
pipeline_kwargs = {
    "model": model_path,
    "device": 0,
    "do_sample": True,
    "num_return_sequences": 3,
}
generator = pipeline("text-generation", **pipeline_kwargs)

  from .autonotebook import tqdm as notebook_tqdm
2024-03-19 22:57:48.234898: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-19 22:57:48.236820: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-19 22:57:48.263007: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-19 22:57:48.263040: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-19 22:57:48.263602: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515

In [2]:
# Sample continuations from the unquantized baseline for a fixed prompt.
# The bare call stays last so the notebook displays the returned list.
baseline_prompt = "The woman worked as a"
generator(baseline_prompt)

[{'generated_text': 'The woman worked as a bank teller and in a car dealership and was a frequent shoplifter'},
 {'generated_text': 'The woman worked as a flight attendant in a regional flying from Newark to John F Kennedy International Airport.'},
 {'generated_text': 'The woman worked as a caregiver for one of the residents at a nursing home. The elderly woman'}]

## 使用 AutoAWQ 量化模型

In [12]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Where the quantized checkpoint is written, and the AWQ settings used to
# produce it (4-bit weights, group size 128, zero-point enabled, GEMM version).
quant_path = "../models/opt-2.7b-awq"
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Load the original checkpoint onto the GPU so it can be quantized.
model = AutoAWQForCausalLM.from_pretrained(model_path, device_map="cuda")

# trust_remote_code is left at its default (off): enabling it lets code from
# the Hub repository run locally, and this tokenizer loads without it (the
# saved copy is reloaded from `quant_path` without the flag further down).
tokenizer = AutoTokenizer.from_pretrained(model_path)


Fetching 8 files: 100%|██████████████████████████| 8/8 [00:00<00:00, 107202.66it/s]


In [13]:
# Run AWQ quantization in place on `model`.
# Long-running: the recorded run below took about 1h37m for 32 steps.
model.quantize(
    tokenizer,
    quant_config=quant_config,
)

Repo card metadata block was not found. Setting CardData to empty.
AWQ: 100%|██████████████████████████████████████| 32/32 [1:36:52<00:00, 181.64s/it]


## Transformers 兼容性配置

In [15]:
from transformers import AwqConfig, AutoConfig

# Re-express the AutoAWQ settings under the field names AwqConfig takes
# (w_bit -> bits, q_group_size -> group_size, lower-cased version string).
awq_config = AwqConfig(
    bits=quant_config["w_bit"],
    group_size=quant_config["q_group_size"],
    zero_point=quant_config["zero_point"],
    version=quant_config["version"].lower(),
)
quantization_config = awq_config.to_dict()

# Attach the plain-dict form to the wrapped model's config before saving.
model.model.config.quantization_config = quantization_config

In [16]:
# Write the quantized weights and the tokenizer files into the same folder.
model.save_quantized(quant_path)
saved_tokenizer_files = tokenizer.save_pretrained(quant_path)

# Display the tuple of tokenizer file paths that was written.
saved_tokenizer_files

('../models/opt-2.7b-awq/tokenizer_config.json',
 '../models/opt-2.7b-awq/special_tokens_map.json',
 '../models/opt-2.7b-awq/vocab.json',
 '../models/opt-2.7b-awq/merges.txt',
 '../models/opt-2.7b-awq/added_tokens.json',
 '../models/opt-2.7b-awq/tokenizer.json')

## 使用 GPU 加载量化模型

In [17]:
from transformers import AutoTokenizer,AutoModelForCausalLM

# Reload the saved checkpoint through plain Transformers. Note that this
# rebinds `tokenizer` and `model`, replacing the AutoAWQ objects used above.
tokenizer = AutoTokenizer.from_pretrained(quant_path)

# Naming "cuda:0" in device_map places the weights on GPU 0 at load time,
# so no follow-up .to() call is needed.
model = AutoModelForCausalLM.from_pretrained(quant_path, device_map="cuda:0")

In [20]:
def generate_text(text: str, max_new_tokens: int = 128) -> str:
    """Generate a continuation of ``text`` with the notebook-level model.

    Relies on the module-level ``tokenizer`` and ``model`` bound by the
    preceding loading cell.

    Args:
        text: Prompt to continue.
        max_new_tokens: Upper bound on the number of newly generated tokens.
            Defaults to 128.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Move the encoded prompt to whichever device the model lives on, so the
    # helper keeps working if the model is loaded somewhere other than GPU 0.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(out[0], skip_special_tokens=True)

In [21]:
# Try the reloaded quantized model on a short prompt. print() is kept so
# line breaks in the generated text render as real newlines.
christmas_prompt = "Merry Christmas! I'm glad to"
result = generate_text(christmas_prompt)
print(result)

Merry Christmas! I'm glad to see that the first month of NoFap is proving to be one which I hope to repeat!
No way, are you one of those 90% who relapse after one month? That's awesome man and I don't know how you accomplished that, but I'm definitely proud of you!!
