In [22]:
llm = ChatOllama(
    model="qwen3:0.6b",
    base_url="https://ollama-gcs-948194141289.europe-west1.run.app",
    temperature=0.1
)

In [23]:
import subprocess
import requests
import json
import time
from typing import Optional, Dict, Any

class OllamaClient:
    def __init__(self, service_url: str):
        self.service_url = service_url.rstrip('/')
        self._token = None
        self._token_expires = 0

    def _get_identity_token(self) -> str:
        """Get identity token, with caching"""
        current_time = time.time()

        # Token is valid for ~1 hour, refresh if needed
        if self._token is None or current_time >= self._token_expires:
            result = subprocess.run(
                ['gcloud', 'auth', 'print-identity-token'],
                capture_output=True,
                text=True
            )

            if result.returncode != 0:
                raise Exception(f"Failed to get identity token: {result.stderr}")

            self._token = result.stdout.strip()
            # Cache for 50 minutes (tokens valid for ~1 hour)
            self._token_expires = current_time + (50 * 60)

        return self._token

    def _make_request(self, endpoint: str, data: Optional[Dict] = None, method: str = "GET") -> Dict[Any, Any]:
        """Make authenticated request to Ollama API"""
        url = f"{self.service_url}/{endpoint.lstrip('/')}"
        token = self._get_identity_token()

        headers = {
            'Authorization': f'Bearer {token}',
            'Content-Type': 'application/json'
        }

        if method.upper() == "POST":
            response = requests.post(url, headers=headers, json=data)
        else:
            response = requests.get(url, headers=headers)

        response.raise_for_status()
        return response.json()

    def generate(self, model: str, prompt: str, stream: bool = False) -> Dict[str, Any]:
        """Generate text using specified model"""
        data = {
            'model': model,
            'prompt': prompt,
            'stream': stream
        }
        return self._make_request('/api/generate', data, method="POST")

    def list_models(self) -> Dict[str, Any]:
        """List available models"""
        return self._make_request('/api/tags')

    def pull_model(self, model: str) -> Dict[str, Any]:
        """Pull a model to the service"""
        data = {'name': model}
        return self._make_request('/api/pull', data, method="POST")

    def chat(self, model: str, messages: list, stream: bool = False) -> Dict[str, Any]:
        """Chat with model using messages format"""
        data = {
            'model': model,
            'messages': messages,
            'stream': stream
        }
        return self._make_request('/api/chat', data, method="POST")



In [39]:
client, llm_model = OllamaClient("https://ollama-qwen3-948194141289.us-central1.run.app"), "qwen3:14b"
# client, llm_model = OllamaClient("https://ollama-qwen3-32b-948194141289.us-central1.run.app"), "qwen3:32b"

In [40]:
print("=== Available Models ===")
models = client.list_models()
for model in models.get('models', []):
    print(f"- {model.get('name', 'Unknown')}")

=== Available Models ===
- qwen3:32b


In [41]:
print("\n=== Text Generation ===")
result = client.generate(llm_model, "Why is the sky blue? Keep it short. /nothink")
print("Prompt:", "Why is the sky blue? Keep it short.")
print("Response:", result.get('response', result))


=== Text Generation ===
Prompt: Why is the sky blue? Keep it short.
Response: <think>
Okay, let's see why 2+2*3 is the way it is. Hmm, I remember something about order of operations from math class. Is it PEMDAS? Parentheses, Exponents, Multiplication and Division, Addition and Subtraction. Right, so multiplication comes before addition. So in the expression 2+2*3, I should do the multiplication first. Let me check that.

So 2*3 is 6. Then add 2 to that. 2 + 6 equals 8. Wait, but if someone does it left to right without considering order of operations, they might add 2+2 first, getting 4, then multiply by 3 to get 12. But that's not correct because multiplication has higher precedence. So the correct answer should be 8. Let me verify with another example to make sure I'm not mixing up anything. Like 5+3*2. If I do 3*2 first, that's 6, plus 5 is 11. If I did left to right, 5+3 is 8*2 is 16. Definitely, 11 is the right answer. So yeah, applying the same logic to 2+2*3, it's 8. I think t

In [12]:
print("\n=== Chat Format ===")
messages = [
    {"role": "user", "content": "Hello! What's 2+2?"}
]
chat_result = client.chat("qwen3:14b", messages)
print("Chat Response:", chat_result.get('message', {}).get('content', chat_result))



=== Chat Format ===
Chat Response: <think>
Okay, the user asked "Hello! What's 2+2?" Let me break this down. First, they started with a greeting, so I should respond politely. Then the math question. 2+2 is straightforward, but I need to make sure I explain it clearly. Maybe they're testing if I know basic math, or they just need help. Let me confirm the answer is 4, but also consider if there's any context I'm missing. Like, sometimes in different contexts, 2+2 might not be 4, but in standard arithmetic, it's definitely 4. I should keep the response friendly and helpful.
</think>

Hello! 2 + 2 equals **4**. Let me know if you need help with anything else! 😊
