In [None]:
# =====================================================================
# 🏗️ Google Colab용 GPU AI API 서버
# =====================================================================

# 1️⃣ 필요한 패키지 설치
!pip install fastapi uvicorn transformers torch accelerate pyngrok nest_asyncio

# 2️⃣ 라이브러리 임포트
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import uvicorn
import time
from typing import Optional, List
import asyncio
import nest_asyncio
from pyngrok import ngrok
import threading

Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.9-py3-none-any.whl.metadata (9.3 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-c

In [None]:
# Patch asyncio so an event loop can be used from inside the notebook (Colab).
nest_asyncio.apply()

# 3) FastAPI application object; the request/response schemas follow below.
app = FastAPI(
    title="GPU AI Inference Server",
    version="1.0.0",
)

class QuestionRequest(BaseModel):
    # Request body for POST /api/ask.
    question: str  # question text placed into the prompt
    context: Optional[str] = None  # optional supporting text; left out of the prompt when empty/None
    max_tokens: int = 50  # forwarded to generate_answer as the cap on newly generated tokens

class BatchQuestionRequest(BaseModel):
    # Request body for POST /api/ask_batch.
    questions: List[str]  # question texts to answer
    contexts: Optional[List[str]] = None  # presumably one context per question, aligned by index — TODO confirm
    max_tokens: int = 50  # forwarded unchanged to the batch generation helper

class AIResponse(BaseModel):
    # Response body for POST /api/ask; mirrors the dict returned by generate_answer.
    answer: str  # generated answer, or an error description when success is False
    inference_time: float  # wall-clock seconds measured with time.time()
    success: bool  # False when generate_answer caught an exception
    device: str  # value of the global `device` ("cuda" or "cpu")

class BatchAIResponse(BaseModel):
    # Response body for POST /api/ask_batch.
    answers: List[str]  # presumably one answer per submitted question, in request order — TODO confirm
    inference_time: float  # presumably total seconds for the whole batch — TODO confirm
    success: bool
    device: str
    count: int  # presumably the number of answers returned — TODO confirm

# Module-level inference state. All three start out unset and are assigned by
# load_model(); the endpoints treat `model is None` as "not loaded yet".
model = tokenizer = device = None

In [None]:

def load_model(model_id: str = "cometlee39/phi2-lora-qa-finetuned") -> bool:
    """Load the causal-LM checkpoint and its tokenizer into the module globals.

    Sets the global ``device`` to ``"cuda"`` when CUDA is available, otherwise
    ``"cpu"``. The globals ``model`` and ``tokenizer`` are assigned together and
    only after *both* objects were loaded successfully, so a failure halfway
    through never leaves the server with a model but no tokenizer.

    Args:
        model_id: Hugging Face repo id (or local path) used for both the model
            and the tokenizer.

    Returns:
        True when loading succeeded, False otherwise (the error is printed).
    """
    global model, tokenizer, device

    print("🔄 AI 모델 로딩 시작...")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.bfloat16 if device == "cuda" else torch.float32

    try:
        # NOTE(security): trust_remote_code=True lets the model repository run
        # its own Python code during loading; only use repos you trust.
        loaded_model = AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            torch_dtype=torch_dtype,
            device_map="auto" if device == "cuda" else "cpu",
            low_cpu_mem_usage=True,
        )

        loaded_tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Fall back to EOS as the padding token when the tokenizer defines none.
        if loaded_tokenizer.pad_token is None:
            loaded_tokenizer.pad_token = loaded_tokenizer.eos_token

        loaded_model.eval()
    except Exception as e:
        print(f"❌ 모델 로딩 실패: {e}")
        return False

    # Publish both objects at once so `model is not None` implies a usable tokenizer.
    model, tokenizer = loaded_model, loaded_tokenizer

    print(f"✅ 모델 로딩 완료! 디바이스: {device}")

    if device == "cuda":
        print(f"💾 GPU 메모리 사용량: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

    return True

In [None]:

def generate_answer(question: str, context: str = None, max_tokens: int = 500) -> dict:
    """Generate an answer for a single question with the globally loaded model.

    Args:
        question: Question text inserted into the prompt.
        context: Optional supporting text. Only its first 500 characters are
            used; when empty or None the prompt contains just the question.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        A dict with the keys ``answer`` (first line of the generated text, or
        an error description), ``inference_time`` (wall-clock seconds),
        ``success`` (False when an exception was caught) and ``device``.
        Exceptions raised during tokenization/generation are not propagated.
    """
    start_time = time.time()

    try:
        # Build the prompt
        if context:
            prompt = f"Context: {context[:500]}...\n\nQuestion: {question}\nAnswer:"
        else:
            prompt = f"Question: {question}\nAnswer:"

        # Tokenize (prompts longer than 512 tokens are truncated)
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )

        if device == "cuda":
            inputs = {k: v.to(device) for k, v in inputs.items()}

        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                # Ask for at least 10 new tokens, but never more than the
                # caller's own cap, so the two limits cannot contradict.
                min_length=prompt_len + min(10, max_tokens),
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=True,
                early_stopping=False,
            )

        # The returned sequence starts with the prompt tokens. Decode only what
        # was generated after them, so neither prompt text nor a later
        # "Answer:" invented by the model can be mistaken for the answer.
        generated_ids = outputs[0][prompt_len:]
        answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        # Keep only the first line of the generated text.
        answer = answer.split("\n")[0].strip()

        inference_time = time.time() - start_time

        return {
            "answer": answer,
            "inference_time": inference_time,
            "success": True,
            "device": device
        }

    except Exception as e:
        return {
            "answer": f"추론 오류: {str(e)}",
            "inference_time": time.time() - start_time,
            "success": False,
            "device": device
        }

In [None]:

# 4️⃣ API 엔드포인트들
@app.post("/api/ask", response_model=AIResponse)
async def ask_single_question(request: QuestionRequest):
    """Answer one question; responds with HTTP 503 while no model is loaded."""
    model_ready = model is not None
    if not model_ready:
        raise HTTPException(status_code=503, detail="모델이 로딩되지 않았습니다")

    # generate_answer returns a dict whose keys match the AIResponse fields.
    payload = generate_answer(
        request.question,
        request.context,
        request.max_tokens,
    )
    return AIResponse(**payload)

@app.post("/api/ask_batch", response_model=BatchAIResponse)
async def ask_batch_questions(request: BatchQuestionRequest):
    """Answer several questions in one request.

    Responds with HTTP 503 while no model is loaded. If a
    ``generate_batch_answers`` helper exists at module level it is used
    unchanged; otherwise the questions are answered one after another with
    ``generate_answer`` so the endpoint works without that helper.

    Raises:
        HTTPException: 503 when the model is not loaded; 422 (fallback path)
            when ``contexts`` is given but its length differs from ``questions``.
    """
    if model is None:
        raise HTTPException(status_code=503, detail="모델이 로딩되지 않았습니다")

    # Prefer a dedicated batch helper when one has been defined in the notebook.
    batch_helper = globals().get("generate_batch_answers")
    if batch_helper is not None:
        result = batch_helper(request.questions, request.contexts, request.max_tokens)
        return BatchAIResponse(**result)

    # Fallback: sequential generation, one generate_answer call per question.
    contexts = request.contexts
    if contexts is None:
        contexts = [None] * len(request.questions)
    elif len(contexts) != len(request.questions):
        raise HTTPException(
            status_code=422,
            detail="questions와 contexts의 길이가 일치해야 합니다",
        )

    started = time.time()
    results = [
        generate_answer(question, context, request.max_tokens)
        for question, context in zip(request.questions, contexts)
    ]

    return BatchAIResponse(
        answers=[item["answer"] for item in results],
        inference_time=time.time() - started,
        # The batch counts as successful only if every single answer succeeded.
        success=all(item["success"] for item in results),
        device=device,
        count=len(results),
    )

@app.get("/api/health")
async def health_check():
    """Report whether the model is loaded and which device is in use."""
    is_loaded = model is not None
    if is_loaded:
        status = "healthy"
    else:
        status = "loading"
    return {"status": status, "device": device, "model_loaded": is_loaded}

@app.get("/api/stats")
async def get_stats():
    """Return GPU memory figures, or just ``{"device": "cpu"}`` without CUDA.

    Note that ``gpu_utilization`` is the share of total GPU memory currently
    allocated by torch, not compute utilization.
    """
    on_gpu = torch.cuda.is_available() and device == "cuda"
    if not on_gpu:
        return {"device": "cpu"}

    used_gb = torch.cuda.memory_allocated() / 1024**3
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3

    stats = {"device": "cuda"}
    stats["gpu_memory_used"] = f"{used_gb:.2f} GB"
    stats["gpu_memory_total"] = f"{total_gb:.2f} GB"
    stats["gpu_utilization"] = f"{used_gb/total_gb*100:.1f}%"
    return stats

@app.get("/")
async def root():
    """Landing endpoint: server banner, status and the list of API routes."""
    available_endpoints = ["/api/ask", "/api/ask_batch", "/api/health", "/api/stats"]
    return dict(
        message="🚀 GPU AI Inference Server",
        status="running",
        endpoints=available_endpoints,
    )


In [None]:

# 5️⃣ 서버 실행 함수
def run_server():
    """Serve ``app`` with uvicorn on 0.0.0.0:8000; intended as a thread target."""
    server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "log_level": "info",
    }
    uvicorn.run(app, **server_options)

# 6️⃣ 메인 실행 코드
def start_ai_server():
    """Load the model, try to open an ngrok tunnel, and start the API server.

    Steps, in order: print GPU information, call ``load_model()``, attempt
    ``ngrok.connect(8000)`` (a failure is reported but does not abort), then
    run ``run_server`` in a daemon thread and print the endpoint overview.

    Returns:
        bool: True once the server thread has been started, False when the
        model could not be loaded (no tunnel or server is started then).
    """
    print("🚀 Colab GPU AI Server 시작!")

    # Report the available GPU, if any
    if torch.cuda.is_available():
        print(f"✅ GPU 감지됨: {torch.cuda.get_device_name(0)}")
        print(f"💾 GPU 메모리: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    else:
        print("⚠️ GPU를 찾을 수 없습니다. CPU 모드로 실행됩니다.")

    # Load the model; without it the server is not started
    if not load_model():
        print("❌ 모델 로딩에 실패했습니다. 서버를 시작할 수 없습니다.")
        # Explicit False (instead of an implicit None) keeps the return type boolean.
        return False

    # Optional ngrok tunnel: on failure the server is still reachable locally
    try:
        public_url = ngrok.connect(8000)
        print(f"🌐 Public URL: {public_url}")
        print("🔗 외부에서 API에 접근할 수 있습니다!")
    except Exception as e:
        print(f"⚠️ ngrok 터널 설정 실패: {e}")
        print("🏠 로컬에서만 접근 가능합니다.")

    # Run uvicorn in a daemon thread so the notebook cell returns
    server_thread = threading.Thread(target=run_server, daemon=True)
    server_thread.start()

    print("\n🎉 서버가 시작되었습니다!")
    print("📋 사용 가능한 엔드포인트:")
    print("   - GET  /api/health   : 서버 상태 확인")
    print("   - GET  /api/stats    : GPU 메모리 통계")
    print("   - POST /api/ask      : 단일 질문")
    print("   - POST /api/ask_batch: 배치 질문")
    print("\n💡 서버를 중지하려면 런타임을 재시작하세요.")

    return True


In [None]:

# 7️⃣ 테스트 함수
def test_api(test_data, base_url="http://localhost:8000", timeout=120):
    """Send one request to POST /api/ask and print the question and answer.

    Args:
        test_data: Request body as a dict; must contain ``"question"`` and may
            contain ``"context"`` / ``"max_tokens"``.
        base_url: Base URL of the running server.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            hung server cannot block the notebook cell indefinitely.

    Any failure (connection error, timeout, HTTP error status, malformed
    response) is caught and printed; nothing is raised or returned.
    """
    import requests

    try:
        response = requests.post(f"{base_url}/api/ask",
                                 json=test_data,
                                 timeout=timeout)
        # Turn 4xx/5xx replies into a clear error instead of a KeyError on 'answer'.
        response.raise_for_status()
        result = response.json()
        print(f"\n질문: {test_data['question']}")
        print(f"\nAI 응답: {result['answer']}")

    except Exception as e:
        print(f"❌ 테스트 실패: {e}")


In [None]:
# 8️⃣ 실행
if __name__ == "__main__":
    # Bring the server up; start_ai_server reports success with a truthy value.
    server_started = start_ai_server()
    if server_started:
        # Pause briefly before any test requests are sent.
        time.sleep(5)

🚀 Colab GPU AI Server 시작!
✅ GPU 감지됨: Tesla T4
💾 GPU 메모리: 14.7 GB
🔄 AI 모델 로딩 시작...


adapter_config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/189M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.43k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.57M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

✅ 모델 로딩 완료! 디바이스: cuda
💾 GPU 메모리 사용량: 5.27 GB


ERROR:pyngrok.process.ngrok:t=2025-05-29T05:38:07+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-05-29T05:38:07+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-05-29T05:38:07+0000 lvl=eror msg="terminating with error" obj=app err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your aut

⚠️ ngrok 터널 설정 실패: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.
🏠 로컬에서만 접근 가능합니다.

🎉 서버가 시작되었습니다!
📋 사용 가능한 엔드포인트:
   - GET  /api/health   : 서버 상태 확인
   - GET  /api/stats    : GPU 메모리 통계
   - POST /api/ask      : 단일 질문
   - POST /api/ask_batch: 배치 질문

💡 서버를 중지하려면 런타임을 재시작하세요.


INFO:     Started server process [287]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)



🧪 API 테스트 중...
INFO:     127.0.0.1:60308 - "GET /api/health HTTP/1.1" 200 OK


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 헬스 체크: {'status': 'healthy', 'device': 'cuda', 'model_loaded': True}
INFO:     127.0.0.1:60314 - "POST /api/ask HTTP/1.1" 200 OK

🤖 AI 응답: Artificial intelligence, also known as AI, is a branch of computer science that deals with the development of intelligent machines that can perform tasks that would typically require
⏱️ 추론 시간: 3.90초
🖥️ 사용된 디바이스: cuda


In [None]:
# Manual smoke test of the running server: announce the run, define the request
# payloads, then send each one through test_api.
print("\n🧪 API 테스트 중...")
# Single-question test cases. Each dict is used as the JSON body for
# POST /api/ask: a "question" plus a "context" string.
test_cases = [
   {
    "question": "What types of visas are available for Canada?",
    "context": "Help us improve our services\nWe’re looking for people to try out our products. Find out how you\ncan participate.\nTop questions about visiting Canada\nHow long can I stay in Canada as a visitor?\nWhat is the difference between a single and a multiple\nentry visa?\nHow do I help a family member or friend apply to visit\nCanada?\nSee all questions about this topic\nDate modified:\n2025-05-07\n25. 5. 12. 오전 3:24\nVisitor Visa: Prepare for your arrival - Canada.ca\nhttps://www.canada.ca/en/immigration-refugees-citizenship/services/visit-canada/prepare-arrival.html\n4/4\n\n---\n\nvisitor visa or eTA\nwork permit\nstudy permit\nYou can submit all applications for your family with your work permit\napplication.\n25. 5. 12. 오전 3:55\nWho can apply - Canada.ca\nhttps://www.canada.ca/en/immigration-refugees-citizenship/services/immigrate-canada/start-visa/work-permits/eligibility.html\n5/6\n\n---\n\nGet a visa\nVisas are issued by foreign government offices in Canada. Requirements,\nfees and processing times vary, depending on"
  },
  {
    "question": "How do I apply for a visa to Canada?",
    "context": "You may need to complete extra steps when you fill out your\napplication.\nHow to apply for your new visitor visa\nYou need to apply online in your IRCC secure account. If you don’t\nalready have an account, follow these steps to register.\nYou’ll need to select “Apply to Come to Canada” from your account main\npage to get started.\nGet the right application form\nTo get the right application form, provide these answers in the online\nquestionnaire:\nFor the first question “What would you like to do in Canada?”, select\n“Study” if you have a valid study permit or\n“Work” if you have a valid work permit\nWhen asked “What is your current country/territory of residence?”,\nselect “Canada”.\nAnswer the questions on the next pages about your work or studies.\nYou may be given the option to extend your current study or work\npermit or to apply for a “Temporary Resident Visa”. Make sure you\nselect “Temporary Resident Visa”.\nThe application form listed in your document checklist will be called\n\n---\n\nGet a visa"
  },
  {
    "question": "What documents are required for a Canada visa?",
    "context": "Prepare your documents\nWhen you arrive in Canada, you must have\nyour passport\nyour visitor visa (if you need one)\nany travel document(s) you’re carrying\nYou may also need to show proof that your work permit application was\napproved. One way to do this is to show the border services officer your\nport of entry letter of introduction. You’ll get this letter if you\napplied online or\ngave us an email address in the work permit application form for\ncommunicating with you \nYou can print this letter or bring an electronic version with you.\nYou should also bring supporting documents, such as\nproof that you meet the requirements of the job, such as proof of\nwork experience and education\na copy of your employer’s positive labour market impact assessment\n(LMIA), if required\nYou’ll also need a copy of your attestation of issuance of your\nQuebec Acceptance Certificate (CAQ), if your employer needed\nan LMIA and you’ll work in Quebec.\nthe offer of employment number your employer received when they\n\n--"
  },
  {
    "question": "How much does a Canada visa cost?",
    "context": "Canada.ca\n \nTravel\n \nTravel abroad\n \nTravel documents\n\n\n\nVisas, biometrics and electronic travel\nauthorizations\nEntry requirements vary from country to country. Before you travel,\nunderstand what you need to enter the destination including any visas,\nelectronic travel authorizations (ETAs) and/or your biometrics.\nOn this page\nVisas\nBiometrics\nElectronic travel authorizations\nVisas\nA visa is an official document, usually stamped or glued inside a\npassport, that allows a foreign national to enter a country for a specific\npurpose and for a set amount of time.\nFind out if you need a visa\nTo find out if you need any visas for your trip, start by consulting our\nTravel advice and advisories page well before you plan to leave. Select\nyour destination from the drop-down menu and consult the information\non visas under “Entry and exit requirements”.\nYou should then contact your destination’s embassy or consulate in\nCanada to confirm whether you need a visa to enter the country or stay\n\n---\n\nHo"
  },
  {
    "question": "What is the processing time for a Canada visa?",
    "context": "You may need to give biometrics with your application. This\nprocessing time doesn’t include the time you need to give\nbiometrics.\n\n25. 5. 12. 오전 3:24\nTransit visa: After you apply - Canada.ca\nhttps://www.canada.ca/en/immigration-refugees-citizenship/services/visit-canada/transit/transit-visa/after-apply-next-steps.html\n2/3\n\n---\n\nCanada class and you’ve submitted an application for an open work\npermit, your work permit will normally be processed within four months.\n25. 5. 12. 오전 4:14\nSponsor your spouse, common-law partner, conjugal partner or dependent child – Complete Guide (IMM 5289) - Canada.ca\nhttps://www.canada.ca/en/immigration-refugees-citizenship/services/application/application-forms-guides/guide-5289-sponsor-your-spouse-co…\n71/89\n\n---\n\nGet a visa\nVisas are issued by foreign government offices in Canada. Requirements,\nfees and processing times vary, depending on the country and type of\nvisa you need. The most common categories are business, work, student\nand tourist visas. Ge"
  }
]

# Send every case to the server; test_api prints the question and the answer.
for test_data in test_cases:
  test_api(test_data)


🧪 API 테스트 중...
INFO:     127.0.0.1:45752 - "POST /api/ask HTTP/1.1" 200 OK

질문: What types of visas are available for Canada?

AI 응답: For traveling to Canada, there are different types of visas available depending on your purpose of visit. Some common types include visitor visas (eVisa), work permits, study permits, and temporary resident visas like the Transit Visa and Work-Related Temporary Resident
INFO:     127.0.0.1:53514 - "POST /api/ask HTTP/1.1" 200 OK

질문: How do I apply for a visa to Canada?

AI 응답: To apply for a visa to Canada, you will need to go through the process online using your IRCC secure account. Start by selecting "Apply to Come to Canada" from your account main page. Once that's done, provide all the necessary information
INFO:     127.0.0.1:53520 - "POST /api/ask HTTP/1.1" 200 OK

질문: What documents are required for a Canada visa?

AI 응답: To apply for a Canada visa, you will need your passport and any necessary travel documents. Additionally, if applying online