In [15]:
import torch

try:
    from transformers import BitsAndBytesConfig
    print("✅ bitsandbytes가 성공적으로 설치 및 import 되었습니다.")
    
    # MPS(M3 GPU)에서 4비트 양자화가 가능한지 최종 확인
    if torch.backends.mps.is_available():
        bnb_config = BitsAndBytesConfig(load_in_4bit=True)
        print("✅ 4비트 양자화 설정이 가능하며, MPS를 사용할 준비가 되었습니다.")
    else:
        print("🟡 bitsandbytes는 설치되었으나, MPS를 사용할 수 없습니다.")

except ImportError:
    print("❌ Import에 실패했습니다. 빌드 과정에서 에러가 발생했을 수 있습니다.")
except Exception as e:
    print(f"❌ 다른 에러 발생: {e}")

✅ bitsandbytes가 성공적으로 설치 및 import 되었습니다.
✅ 4비트 양자화 설정이 가능하며, MPS를 사용할 준비가 되었습니다.


In [16]:
# PyTorch를 먼저 설치합니다 (MPS 지원을 위해 필수).
!pip install torch torchvision torchaudio

# 그 다음, 나머지 라이브러리들을 설치합니다.
!pip install bitsandbytes transformers accelerate python-dotenv jupyter

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [17]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
from huggingface_hub import login
import torch

In [18]:
from dotenv import load_dotenv
import os

In [19]:
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

In [20]:
if hf_token:
    login(token=hf_token)
    print("✅ Hugging Face 로그인이 완료되었습니다.")
else:
    print("❗️ HUGGINGFACE_TOKEN을 찾을 수 없습니다. .env 파일을 확인하세요.")


✅ Hugging Face 로그인이 완료되었습니다.


In [21]:
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"
QWEN2 = "Qwen/Qwen2-7B-Instruct" # exercise for you
MIXTRAL = "mistralai/Mixtral-8x7B-Instruct-v0.1" # If this doesn't fit it your GPU memory, try others from the hub

In [22]:
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },
    {
        "role": "user",
        "content": "Tell me light-hearted joke for a room of data scientists"
    }
]


#### Accessing Llama3.1 from meta

In order to use the fantastic Llama 3.1, Meta does require you to sign their terms of service.

Visit their model instructions page in Hugging Face:
https://huggingface.co/meta-llama/Meta-Llama-3.1-8B

At the top of the page are instructions on how to agree to their terms. If possible, you should use the same email as your huggingface account.

In my experience approval comes in a couple of minutes. Once you've been approved for any 3.1 model, it applies to the whole family of models.

If you have any problems accessing Llama, please see this colab, including some suggestions if you don't get approved by Meta for any reason.

https://colab.research.google.com/drive/1deJO03YZTXUwcq2vzxWbiBhrRuI29Vo8

In [10]:
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

In [11]:
# Tokenizer

tokenizer = AutoTokenizer.from_pretrained(
    LLAMA,
    use_fast=True,
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
input = tokenizer.apply_chat_template(
    messages,
    llm_int8_enable_fp32_cpu_offload=True,
    return_tensors = "pt"
).to("mps")

In [12]:
model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    quantization_config=quant_config)

The 8-bit optimizer is not available on your device, only available on CUDA for now.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [13]:

# 양자화된 모델을 로컬에 저장
print("💾 양자화된 모델을 로컬에 저장합니다...")
model.save_pretrained("./Llama-2-7b-chat-hf-4bit")
tokenizer.save_pretrained("./Llama-2-7b-chat-hf-4bit")
print("✅ 저장 완료!")

💾 양자화된 모델을 로컬에 저장합니다...
✅ 저장 완료!


In [24]:
local_model_path = "/Users/sungminhong/Documents/Udemi/LLM_tutorial/3rd_week/Llama-2-7b-chat-hf-4bit"

# 2. 모델 불러오기
# device_map="auto" 옵션은 모델을 MPS(GPU)에 자동으로 할당해줍니다.
print("🚀 저장된 4비트 모델을 빠르게 로딩합니다...")
model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    llm_int8_enable_fp32_cpu_offload=True
)

# 3. 토크나이저 불러오기
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

print("✅ 로딩 완료!")

🚀 저장된 4비트 모델을 빠르게 로딩합니다...


TypeError: LlamaForCausalLM.__init__() got an unexpected keyword argument 'llm_int8_enable_fp32_cpu_offload'

In [25]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# 저장된 모델 경로
local_model_path = "./Llama-2-7b-chat-hf-4bit"

# 모델 불러오기
model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    device_map="auto",
    llm_int8_enable_fp32_cpu_offload=True # 업그레이드 후에는 이 옵션이 정상 작동합니다.
)

# 토크나이저 불러오기
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

print("✅ 로딩 완료!")

TypeError: LlamaForCausalLM.__init__() got an unexpected keyword argument 'llm_int8_enable_fp32_cpu_offload'