# Quantization

In [1]:
!pip install --quiet bitsandbytes
!pip install --quiet transformers
!pip install --quiet accelerate
!pip install scipy numpy
!pip install torch==2.0.1
!pip install sentencepiece
!pip install protobuf

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.1)
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==2.0.1)
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl.metadata (1.

In [None]:
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checking the GPU configurations
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else 'CPU'
print(f'Using device: {device}({device_name})')

Using device: cuda(Tesla T4)


In [None]:
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

def quantize_and_save_model(model_name, save_path, quantization_bit=4, test_input="Once upon a time"):

    # Configuring the BitsAndBytes parameters for quantization
    if quantization_bit == 4:
        bnb_config = transformers.BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )
    elif quantization_bit == 8:
        bnb_config = transformers.BitsAndBytesConfig(
            load_in_8bit=True
        )
    else:
        raise ValueError("Quantization bit must be either 4 or 8")

    print(f"Downloading and quantizing model: {model_name}")

    # Download and quantize the model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
    model.eval()
    print(f"Model loaded and quantized on {device}")

    # Save the quantized model and tokenizer
    print(f"Saving quantized model to {save_path}")
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Quantized model and tokenizer saved to {save_path}")

    # Test the model
    input_ids = tokenizer(test_input, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        output = model.generate(input_ids, max_length=50)
    print(f"Test output: {tokenizer.decode(output[0], skip_special_tokens=True)}")

    return model, tokenizer

# Example usage
if __name__ == "__main__":
    model_name = "meta-llama/Llama-2-7b-chat-hf"  # Replace with the desired model
    save_path = "./llama2_7b_quantized"

    quantized_model, tokenizer = quantize_and_save_model(model_name, save_path)

Downloading and quantizing model: meta-llama/Llama-2-7b-chat-hf


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded and quantized on cuda
Saving quantized model to ./llama2_7b_quantized
Quantized model and tokenizer saved to ./llama2_7b_quantized
Test output: Once upon a time, in a far-off land, there was a young prince named Leo. Unterscheidung between the two languages is not always straightforward, and there are several factors that can make it difficult to determine whether a given word is of Latin


# Load and test it

In [4]:
import zipfile
import os

def unzip_file(zip_file_path, extract_to=None):
    # Check if the output directory is provided, if not, extract to the same directory as the zip file
    if extract_to is None:
        extract_to = os.path.dirname(zip_file_path)

    # Ensure the output directory exists
    os.makedirs(extract_to, exist_ok=True)

    # Open the zip file
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # Extract all the contents to the specified directory
        zip_ref.extractall(extract_to)
        print(f"Extracted all files to: {extract_to}")

# Example usage
zip_file_path = r"C:\Users\Admin\Downloads\Lllama_2_quantized"
extract_to = "/extracted_files_llama"

unzip_file(zip_file_path, extract_to)


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Admin\\Downloads\\Lllama_2_quantized'

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_llama_Model(base_model_path):
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    model = AutoModelForCausalLM.from_pretrained(base_model_path)
    return model, tokenizer

base_model_path = r"/content/sample_data/llama_quant"
model7b, tokenizer7b = load_llama_Model(base_model_path)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /content/sample_data/llama_quant.

In [None]:
question = "What is the capital of France?"

input_ids = tokenizer7b.encode(question, return_tensors='pt')

with torch.no_grad():
    output = model7b.generate(input_ids, max_length=50, num_return_sequences=1)

response = tokenizer7b.decode(output[0], skip_special_tokens=True)

print(response)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Configuring the BitsAndBytes paramenters for quantization
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model_path = "/workspace/sandeep/huggingface/llama13B"

tokenizer = AutoTokenizer.from_pretrained(base_model_path)
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    local_files_only=True,
    quantization_config=bnb_config,
)
model.eval()
print(f"Model loaded on {device}")

# Load it from locally

In [None]:
# Configuring the BitsAndBytes paramenters for quantization
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model_path = "/workspace/sandeep/huggingface/llama13B"

tokenizer = AutoTokenizer.from_pretrained(base_model_path)
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    local_files_only=True,
    quantization_config=bnb_config,
)
model.eval()
print(f"Model loaded on {device}")

In [None]:
# Configuring the BitsAndBytes paramenters for quantization
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model_path = "/workspace/sandeep/huggingface/llama13B"

tokenizer = AutoTokenizer.from_pretrained(base_model_path)
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    local_files_only=True,
    quantization_config=bnb_config,
)
model.eval()
print(f"Model loaded on {device}")


# Qualtizzation krish naik

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m996.9 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checking the GPU configurations
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else 'CPU'
print(f'Using device: {device} ({device_name})')

# Configuring the BitsAndBytes parameters for quantization
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)


Using device: cuda (Tesla T4)


In [None]:
# Load the tokenizer and model
from transformers import AutoTokenizer, AutoModelForCausalLM
model_name = "NousResearch/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Since this is a sequence classification model, use AutoModelForSequenceClassification
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # Applying quantization (optional, not necessary for small models like this)
)

# Move model to the appropriate device
model.to(device)

model.eval()
print(f"Model loaded on {device}")




tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

ValueError: `.to` is not supported for `4-bit` or `8-bit` models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct `dtype`.

In [None]:
# Example usage with the tokenizer
text = "I love using Hugging Face models!"
inputs = tokenizer(text, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the appropriate device

# Perform inference
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    print(f"Sentiment prediction: {predictions.item()} (0 = Negative, 1 = Positive)")

In [None]:
!pip install -q torchinfo

In [None]:
from transformers import LlamaForCausalLM, LlamaTokenizer
from torchinfo import summary

# Load the model (you can choose a specific LLaMA model variant)
model_name = "meta-llama/Llama-2-7b"  # You can replace this with "Llama-2-13b" or "Llama-2-70b"
model = LlamaForCausalLM.from_pretrained(model_name)

# Load the tokenizer
tokenizer = LlamaTokenizer.from_pretrained(model_name)

# Display the model's architecture summary
# `summary` gives detailed information about each layer, the number of parameters, and more.
# Adjust `input_size` to match the sequence length (here it's set to 512 for demonstration).

summary(model, input_size=(1, 512), depth=3)  # depth=3 gives a summary of main layers; increase depth for more detail




OSError: meta-llama/Llama-2-7b does not appear to have a file named config.json. Checkout 'https://huggingface.co/meta-llama/Llama-2-7b/main' for available files.

In [None]:
from datasets import load_dataset, DatasetDict, Dataset

from transformer(
      AutoTokenizer,
      AutoConfig,
      AutoModelForSequenceClassification,
      DataCollatorWithPadding,
      TrainingArguments,
      Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

SyntaxError: invalid syntax (<ipython-input-5-ee240ed91039>, line 3)