In [1]:
# 코드 시작 전 실행하기
# D 드라이브로 변경하는 코드
import os

# Hugging Face 캐시 경로 변경
os.environ['HF_HOME'] = r'D:\huggingface_cache'          # 캐시 전체 경로
os.environ['TRANSFORMERS_CACHE'] = r'D:\huggingface_cache\transformers'
os.environ['HF_DATASETS_CACHE'] = r'D:\huggingface_cache\datasets'
os.environ['HF_METRICS_CACHE'] = r'D:\huggingface_cache\metrics'


In [3]:
# 코드 버전 확인용
# 버전 확인 하고 가상환경이 맞는지 확인하기
import torch
print(torch.__version__)      # PyTorch 버전
print(torch.version.cuda)     # CUDA 버전
print(torch.cuda.is_available())  # GPU 사용 가능 여부


2.8.0+cu126
12.6
True


In [4]:
# 1️ 필요한 라이브러리
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
from PIL import Image
import requests
from io import BytesIO

# 2️ GPU 사용 여부 확인
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# 3️ 모델 및 프로세서 로드
model_name = "Salesforce/blip2-flan-t5-xl"  # Windows + GPU 호환 안정 모델
processor = Blip2Processor.from_pretrained(model_name)
model = Blip2ForConditionalGeneration.from_pretrained(model_name).to(device)

# 4️ 이미지 불러오기
image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"
img = Image.open(BytesIO(requests.get(image_url).content)).convert("RGB")

# 5️ 텍스트 질문
question = "What animal is on the candy?"

# 6️ 입력 처리
inputs = processor(images=img, text=question, return_tensors="pt").to(device)

# 7️ 모델 실행
generated_ids = model.generate(**inputs)
answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# 8️ 결과 출력
print("Answer:", answer)


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Using device: cuda


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.02it/s]


Answer: a giraffe


In [13]:
# 한국어 질문 예시
img_url = "https://www.gyeongju.go.kr/upload/content/thumb/gyimage/%EC%B2%A8%EC%84%B1%EB%8C%80%EC%9D%98%20%EC%95%84%EB%A6%84%EB%8B%A4%EC%9B%80_1_.jpg"
img = Image.open(BytesIO(requests.get(img_url).content)).convert("RGB")
question = "introduce this image in korean"

# 입력 처리
inputs = processor(images=img, text=question, return_tensors="pt").to(device)

# 모델 실행
generated_ids = model.generate(**inputs)
answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# 결과 출력
print("Answer:", answer)


Answer: 


In [5]:
# 랭체인 사용하는 코드
from transformers import Blip2Processor, Blip2ForConditionalGeneration, pipeline
from PIL import Image
import requests, torch
from io import BytesIO
from langchain_community.llms import HuggingFacePipeline

# 1. 모델 및 프로세서 로드
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "Salesforce/blip2-flan-t5-xl"   
processor = Blip2Processor.from_pretrained(model_name)
model = Blip2ForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device=="cuda" else torch.float32
).to(device)

# 2. HuggingFace pipeline 생성
blip2_pipe = pipeline(
    "image-to-text",
    model=model,
    tokenizer=processor.tokenizer,
    image_processor=processor.image_processor,
    device=0 if device=="cuda" else -1
)

# 3. LangChain LLM 래퍼로 감싸기 (텍스트 전용)
llm = HuggingFacePipeline(pipeline=blip2_pipe)

# 4. 이미지 불러오기
image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"
img = Image.open(BytesIO(requests.get(image_url).content)).convert("RGB")

# 5. HuggingFace pipeline에서 이미지+질문 직접 실행
question = "What animal is on the candy?"
raw_answer = blip2_pipe({"image": img, "text": question})[0]["generated_text"]

# 6. LangChain으로 후처리 (텍스트만 전달)
result = llm.invoke(f"The image was asked: {question}. The raw answer was: {raw_answer}. Summarize clearly.")

print("Answer:", result)



Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.58s/it]
Device set to use cuda:0


TypeError: Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image.

In [9]:
# 0. 필요 라이브러리
from transformers import Blip2Processor, Blip2ForConditionalGeneration, pipeline
from PIL import Image
import requests, torch
from io import BytesIO
from langchain_community.llms import HuggingFacePipeline   # 최신 LangChain LLM 래퍼

# 1. 디바이스 설정
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# 2. 모델 및 프로세서 로드
model_name = "Salesforce/blip2-opt-2.7b"  # GPU 8GB 환경에서 안정적
processor = Blip2Processor.from_pretrained(model_name)

# 메모리 절약 옵션: float16 + device_map
model = Blip2ForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device=="cuda" else torch.float32,
    device_map="auto"
)

# 3. HuggingFace pipeline 생성 (이미지->텍스트)
blip2_pipe = pipeline(
    task="image-to-text",
    model=model,
    tokenizer=processor.tokenizer,
    image_processor=processor.image_processor,
)

# 4. LangChain LLM 래퍼 (텍스트 후처리용)
llm = HuggingFacePipeline(pipeline=blip2_pipe)

# 5. 이미지 불러오기
image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"
img = Image.open(BytesIO(requests.get(image_url).content)).convert("RGB")

# 6. HuggingFace pipeline에서 이미지+질문 처리
question = "What animal is on the candy?"
raw_answer = blip2_pipe(img, prompt=question)[0]["generated_text"]

# 7. LangChain에서 텍스트 후처리 (선택적)
final_answer = llm.invoke(
    f"The image was asked: '{question}'. The raw answer from BLIP2 was: '{raw_answer}'. Summarize it clearly."
)

print("Answer:", final_answer)


Using device: cuda


Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.75s/it]
Device set to use cuda:0
Passing `prompt` to the `image-to-text` pipeline is deprecated and will be removed in version 4.48 of 🤗 Transformers. Use the `image-text-to-text` pipeline instead


ValueError: Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got The image was asked: 'What animal is on the candy?'. The raw answer from BLIP2 was: 'What animal is on the candy?
'. Summarize it clearly.. Failed with Incorrect padding

In [4]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration, pipeline
from PIL import Image
import requests, torch
from io import BytesIO
from langchain.llms import GPT4All

# -------------------------------
# 1. 디바이스 설정
device = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------------
# 2. BLIP2 모델 및 프로세서 로드
model_name = "Salesforce/blip2-opt-2.7b"  # 8GB GPU safe
processor = Blip2Processor.from_pretrained(model_name)
model = Blip2ForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device=="cuda" else torch.float32,
    device_map="auto"
)

# -------------------------------
# 3. 이미지 -> 텍스트 pipeline 생성
blip2_pipe = pipeline(
    task="image-to-text",
    model=model,
    tokenizer=processor.tokenizer,
    image_processor=processor.image_processor
)

# -------------------------------
# 4. 로컬 LLM (GPT4All 예시)
llm = GPT4All(model="ggml-gpt4all-j-v1.3-groovy.bin")  # 로컬 모델 파일 필요

# -------------------------------
# 5. 이미지 불러오기
image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"
response = requests.get(image_url)
response.raise_for_status()
img = Image.open(BytesIO(response.content)).convert("RGB")

# -------------------------------
# 6. BLIP2로 이미지 처리
question = "What animal is on the candy?"
raw_answer = blip2_pipe(img, prompt=question)[0]["generated_text"]
print("Raw Answer:", raw_answer)

# -------------------------------
# 7. 로컬 LLM으로 후처리 (텍스트 요약/정리)
final_answer = llm(f"The raw answer from BLIP2 was: '{raw_answer}'. Summarize clearly.")
print("Final Answer:", final_answer)

# ------------


Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.65s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0


FileNotFoundError: Model file does not exist: WindowsPath('C:/Users/EunSung/.cache/gpt4all/ggml-gpt4all-j-v1.3-groovy.bin')

In [19]:
import torch, gc

gc.collect()                   # 파이썬 메모리 정리
torch.cuda.empty_cache()        # CUDA 캐시 비우기


In [None]:

pip install

Collecting gpt4all
  Downloading gpt4all-2.8.2-py3-none-win_amd64.whl.metadata (4.8 kB)
Downloading gpt4all-2.8.2-py3-none-win_amd64.whl (119.6 MB)
   ---------------------------------------- 0.0/119.6 MB ? eta -:--:--
   ---------------------------------------- 0.8/119.6 MB 4.2 MB/s eta 0:00:29
    --------------------------------------- 1.6/119.6 MB 3.8 MB/s eta 0:00:31
    --------------------------------------- 2.4/119.6 MB 3.7 MB/s eta 0:00:32
   - -------------------------------------- 3.1/119.6 MB 3.8 MB/s eta 0:00:31
   - -------------------------------------- 4.2/119.6 MB 3.9 MB/s eta 0:00:30
   - -------------------------------------- 5.2/119.6 MB 4.2 MB/s eta 0:00:28
   -- ------------------------------------- 6.6/119.6 MB 4.4 MB/s eta 0:00:26
   -- ------------------------------------- 7.3/119.6 MB 4.5 MB/s eta 0:00:26
   -- ------------------------------------- 8.4/119.6 MB 4.5 MB/s eta 0:00:25
   --- ------------------------------------ 10.0/119.6 MB 4.7 MB/s eta 0:00:24
