In [None]:
!curl -fsSL https://ollama.com/install.sh | sh
!nohup ollama serve > output.log 2>&1 &
!ollama pull phi4-mini

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?2

In [None]:
!pip install opencv-python pillow requests langchain langchain_community ollama langchain_ollama requests pillow torchvision torch transformers

Collecting langchain_community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting ollama
  Downloading ollama-0.6.1-py3-none-any.whl.metadata (4.3 kB)
Collecting langchain_ollama
  Downloading langchain_ollama-1.0.0-py3-none-any.whl.metadata (2.1 kB)
INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.
Collecting langchain_community
  Downloading langchain_community-0.4-py3-none-any.whl.metadata (3.0 kB)
  Downloading langchain_community-0.3.31-py3-none-any.whl.metadata (3.0 kB)
Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
INFO: pip is looking at multiple versions of langchain-ollama to determine which version is compatible with other requirements. This could take a while.


In [None]:
# ✅ Download Sample Images
import requests
from pathlib import Path
from PIL import Image
from torchvision import models, transforms
import torch

# Download each sample image into ./sample_images (created if missing).
Path("sample_images").mkdir(exist_ok=True)
images = {
    "dog.jpg": "https://images.unsplash.com/photo-1518717758536-85ae29035b6d",
    "cat.jpg": "https://images.unsplash.com/photo-1592194996308-7b43878e84a6",
    "car.jpg": "https://images.unsplash.com/photo-1503376780353-7e6692767b70"
}
for filename, url in images.items():
    # A timeout keeps the cell from hanging on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of saving an error page as a ".jpg".
    response.raise_for_status()
    with open(f"sample_images/{filename}", "wb") as f:
        f.write(response.content)
print("✅ Sample images downloaded.")

# ✅ Load Pre-trained Model
# `pretrained=True` is deprecated in torchvision; request the weights explicitly.
# IMAGENET1K_V1 is the checkpoint that `pretrained=True` selected, so predictions are unchanged.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
model.eval()  # switch to inference mode

# Human-readable ImageNet class names, one per line, in class-index order.
LABELS_URL = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt"
labels_response = requests.get(LABELS_URL, timeout=30)
# Without this check an HTTP error page would silently become the label list.
labels_response.raise_for_status()
labels = labels_response.text.strip().split("\n")

# Preprocessing applied to every image before it is passed to the model.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
def recognize_image(image_path):
    """Classify the image at ``image_path`` and return the top label.

    The image is opened as RGB, run through the notebook-level ``transform``
    and ``model``, and the highest-scoring class index is looked up in ``labels``.

    Args:
        image_path: Path of the image file to classify.

    Returns:
        The entry of ``labels`` at the index with the highest model output.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    # The model is called on a batch, so add a leading batch dimension of size 1.
    batch = transform(rgb_image).unsqueeze(0)
    with torch.no_grad():
        logits = model(batch)
    best = torch.max(logits, 1)
    top_index = best[1]
    return labels[top_index.item()]

# Use the same relative path the download loop wrote to, rather than a hard-coded
# absolute "/content/..." path that only exists when the working directory is /content.
recognize_image("sample_images/dog.jpg")

✅ Sample images downloaded.




Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:01<00:00, 88.2MB/s]


'German short-haired pointer'

In [None]:

# ✅ Setup LangChain with Ollama (Phi-4 Mini)
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Prompt text; {description} is replaced with the label recognized for the image.
template = """
You are an AI assistant helping with image recognition.
The image content was identified as: {description}

Please describe this object and suggest possible one short use cases.

Answer:
"""
prompt = PromptTemplate.from_template(template)

# Make sure Ollama is running locally with `phi4-mini` pulled
llm = Ollama(model="phi4-mini")

# Chain that fills the prompt and sends it to the model.
chain = LLMChain(prompt=prompt, llm=llm)

# ✅ Process Images
# For every downloaded sample: classify it, then ask the LLM to elaborate on the label.
for image_name in images:
    image_path = f"sample_images/{image_name}"
    label = recognize_image(image_path)
    print(f"\n🖼️ Image: {image_name}")
    print(f"📷 Recognized Label: {label}")
    # `Chain.run` is deprecated; `invoke` returns a dict in which LLMChain stores
    # the generated text under its default output key, "text".
    response = chain.invoke({"description": label})["text"]
    print(f"🤖 Phi-4 Mini Response:\n{response}")

  llm = Ollama(model="phi4-mini")  # Make sure Ollama is running locally with `phi4-mini` pulled
  chain = LLMChain(llm=llm, prompt=prompt)



🖼️ Image: dog.jpg
📷 Recognized Label: German short-haired pointer


  response = chain.run({"description": label})


🤖 Phi-4 Mini Response:
A German Shorthaired Pointer (GSP) is a medium-sized breed of dog known for its intelligence, agility, keen hunting instincts, pointed ears that resemble those found on foxes or deer. They have thick double coats with colors ranging from red to black, white markings can appear anywhere except the face and legs.

One short use case: A German Shorthaired Pointer could be used as a guard dog due to its alertness and protective nature towards property.




🖼️ Image: cat.jpg
📷 Recognized Label: Persian cat
🤖 Phi-4 Mini Response:
Object Description:

A Persian Cat is a breed of domestic cats known for its distinctive physical characteristics. They have long, luxurious fur that can come in various colors such as black, white, smoke gray (gray with tan markings), chocolate brown, or bluepoint-blue tabby patterns like seals and moorhens.

Persian Cats also possess flat faces commonly referred to as "pugs," giving them a rounded appearance without the typical feline facial

In [None]:
!pip install transformers -U



In [None]:
# Download sample images
import requests
from pathlib import Path
# Download each sample image into ./sample_images (created if missing).
Path("sample_images").mkdir(exist_ok=True)

images = {
    "dog.jpg": "https://images.unsplash.com/photo-1518717758536-85ae29035b6d",
    "cat.jpg": "https://images.unsplash.com/photo-1592194996308-7b43878e84a6",
    "car.jpg": "https://images.unsplash.com/photo-1503376780353-7e6692767b70"
}

for filename, url in images.items():
    # A timeout keeps the cell from hanging on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of saving an error page as a ".jpg".
    response.raise_for_status()
    with open(f"sample_images/{filename}", "wb") as f:
        f.write(response.content)
print("✅ Sample images downloaded.")

# Load CLIP model
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Run on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The model and its processor are loaded from the same checkpoint.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define candidate labels
labels = [
    "dog", "cat", "car", "tree", "person",
    "computer", "building", "bottle", "phone", "book",
]

def recognize_clip(image_path):
    """Pick the candidate label that CLIP scores highest for an image.

    Uses the notebook-level ``processor``, ``model``, ``labels`` and ``device``.

    Args:
        image_path: Path of the image file to classify.

    Returns:
        The entry of ``labels`` with the highest probability for the image.
    """
    image = Image.open(image_path).convert("RGB")
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True).to(device)
    # Inference only: disable gradient tracking, as the other CLIP helpers in this notebook do.
    with torch.no_grad():
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)
    predicted_index = probs.argmax().item()
    return labels[predicted_index]

# LangChain + Ollama (Requires local Ollama server with `phi4-mini`)
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Prompt text; {description} is replaced with the label recognized for the image.
prompt = PromptTemplate.from_template("""
You are an AI assistant helping with image recognition.
The image content was identified as: {description}

Please describe this object and suggest possible use cases.

Answer:
""")

# Requires Ollama running locally
llm = Ollama(model="phi4-mini")

# Chain that fills the prompt and sends it to the model.
chain = LLMChain(prompt=prompt, llm=llm)

# Run recognition and generate response
# For every downloaded sample: classify it with CLIP, then ask the LLM to elaborate.
for image_name in images:
    image_path = f"sample_images/{image_name}"
    label = recognize_clip(image_path)
    print(f"\n🖼️ Image: {image_name}")
    print(f"📷 Recognized Label (CLIP): {label}")
    # `Chain.run` is deprecated; `invoke` returns a dict in which LLMChain stores
    # the generated text under its default output key, "text".
    response = chain.invoke({"description": label})["text"]
    print(f"🤖 Phi-4 Mini Response:\n{response}")


✅ Sample images downloaded.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]


🖼️ Image: dog.jpg
📷 Recognized Label (CLIP): dog
🤖 Phi-4 Mini Response:
A "dog" is a domesticated mammal belonging to the species Canis lupus familiaris. Dogs typically have elongated snouts, pointy ears, fur-covered bodies that can vary in color from black through brown shades or white (some dogs may also exhibit patches of spots), and four legs ending with paws equipped for walking on various surfaces.

Possible use cases:

1. Companionship: As one of the most popular pets worldwide due to their loyalty and companionship.
2. Work animals/dogs: They can be trained as service/working or rescue/specialist dogs in a variety of roles, such as police K-9 units for search-and-rescue operations; hearing assistance (dogs with disabilities that are deaf), guide work ("seeing-eye" dogs).
3. Entertainment and sports: Dogs participate actively in many sport activities like agility trials, flyball competitions.
4. Therapy Animals/Pets: They can also be trained to provide emotional support or comp

In [None]:
!pip install -U huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-1.1.5-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-1.1.5-py3-none-any.whl (516 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m516.0/516.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.36.0
    Uninstalling huggingface-hub-0.36.0:
      Successfully uninstalled huggingface-hub-0.36.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.57.1 requires huggingface-hub<1.0,>=0.34.0, but you have huggingface-hub 1.1.5 which is incompatible.[0m[31m
[0mSuccessfully installed huggingface_hub-1.1.5


In [None]:
import requests
from PIL import Image
from io import BytesIO
import torch
from transformers import CLIPProcessor, CLIPModel
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
import os
import numpy as np

# Download sample images
def download_sample_images():
    """Download the demo images into ``sample_images/``.

    Each image is fetched over HTTP and re-saved as ``sample_images/image_<n>.jpg``.
    A failed download is reported with ``print`` and skipped, so the function
    never raises because of a single bad URL.

    Returns:
        tuple[list[str], list[str]]: paths of the images that were saved, and the
        captions of those same images in the same order (a caption is only
        included when its image was saved, so the two lists stay aligned).
    """
    samples = [
        ("https://images.unsplash.com/photo-1600585154340-be6161a56a0c",  # House
         "A modern house with large windows"),
        ("https://images.unsplash.com/photo-1518791841217-8f162f1e1131",  # Cat
         "A fluffy cat sitting on a couch"),
        ("https://images.unsplash.com/photo-1507525428034-b723cf961d3e",  # Beach
         "A tropical beach with palm trees"),
    ]
    image_paths = []
    captions = []
    os.makedirs("sample_images", exist_ok=True)

    for i, (url, caption) in enumerate(samples):
        try:
            # A timeout keeps the notebook from hanging on a stalled connection.
            response = requests.get(url, timeout=30)
            if response.status_code == 200:
                # JPEG cannot store alpha/palette modes, so normalise to RGB before saving.
                img = Image.open(BytesIO(response.content)).convert("RGB")
                img_path = f"sample_images/image_{i+1}.jpg"
                img.save(img_path)
                image_paths.append(img_path)
                captions.append(caption)
                print(f"Downloaded image {i+1} to {img_path}")
            else:
                print(f"Failed to download image {i+1}")
        except Exception as e:
            # Deliberately best-effort: report the problem and continue with the next image.
            print(f"Error downloading image {i+1}: {e}")

    return image_paths, captions

# Initialize CLIP and Ollama
def initialize_models():
    """Create the CLIP model, its processor and the Ollama LLM client.

    Returns:
        A 4-tuple ``(clip_model, clip_processor, llm, device)`` where ``device``
        is ``"cuda"`` when torch reports a GPU and ``"cpu"`` otherwise; the CLIP
        model has been moved to that device.
    """
    checkpoint = "openai/clip-vit-base-patch32"
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    clip_model = CLIPModel.from_pretrained(checkpoint)
    clip_model = clip_model.to(device)
    clip_processor = CLIPProcessor.from_pretrained(checkpoint)
    return clip_model, clip_processor, Ollama(model="phi4-mini"), device

# Use Case 1: Image Recognition (classify image based on text labels)
def image_recognition(image_path, labels, clip_model, clip_processor, device):
    """Score an image against candidate text labels with CLIP.

    Args:
        image_path: Path of the image file to classify.
        labels: Candidate text labels to compare the image against.
        clip_model: CLIP model used to score the image/text pairs.
        clip_processor: Processor that prepares the text and image inputs.
        device: Device the prepared inputs are moved to before the forward pass.

    Returns:
        tuple: ``(predicted_label, result)`` where ``result`` maps each label to
        its softmax probability as a plain Python ``float`` and
        ``predicted_label`` is the label with the highest probability.
    """
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(text=labels, images=image, return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        outputs = clip_model(**inputs)
        logits_per_image = outputs.logits_per_image
        # Softmax across the labels; [0] selects the row of the single input image.
        probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]

    # Plain floats print cleanly and are JSON-serialisable, unlike np.float32 scalars.
    result = {label: float(prob) for label, prob in zip(labels, probs)}
    # Choose by position rather than through the dict so duplicate labels cannot
    # collapse and hide the true best score.
    predicted_label = labels[int(np.argmax(probs))]
    return predicted_label, result

# Use Case 2: Image Description (generate detailed description using Phi-4 Mini)
def image_description(image_path, clip_model, clip_processor, llm, device, labels=None):
    """Generate a text description for an image, guided by a CLIP label.

    The image is first classified against ``labels`` with ``image_recognition``;
    the winning label is inserted into a prompt that is sent to ``llm``.

    Args:
        image_path: Path of the image file to describe.
        clip_model: CLIP model used for the guiding classification.
        clip_processor: Processor that prepares the CLIP inputs.
        llm: Language model client; its ``invoke`` method receives the prompt.
        device: Device the CLIP inputs are moved to.
        labels: Optional candidate labels for the guiding classification.
            Defaults to ``["house", "cat", "beach", "car", "tree"]``.

    Returns:
        The value returned by ``llm.invoke``, or a string starting with
        ``"Error generating description: "`` if that call raises.
    """
    if labels is None:
        labels = ["house", "cat", "beach", "car", "tree"]

    # Reuse the shared classification helper instead of duplicating the CLIP forward pass.
    top_label, _ = image_recognition(image_path, labels, clip_model, clip_processor, device)

    # Create a prompt for Phi-4 Mini
    prompt_template = PromptTemplate(
        input_variables=["label"],
        template="Generate a detailed description of an image that primarily features a {label}. Include details about possible colors, setting, and context."
    )
    prompt = prompt_template.format(label=top_label)

    try:
        return llm.invoke(prompt)
    except Exception as e:
        # Best-effort: report the failure as text so the caller's loop keeps going.
        return f"Error generating description: {e}"

# Use Case 3: Image Search by Text (find image matching a text query)
def image_search_by_text(text_query, image_paths, clip_model, clip_processor, device):
    """Find the image that best matches a text query using CLIP.

    Args:
        text_query: Text to search for.
        image_paths: Paths of the candidate image files.
        clip_model: CLIP model used to score the text against the images.
        clip_processor: Processor that prepares the text and image inputs.
        device: Device the prepared inputs are moved to.

    Returns:
        tuple: ``(best_path, score)`` where ``score`` is the softmax probability
        (over the candidate images) of the best match, as a Python ``float``.

    Raises:
        ValueError: If ``image_paths`` is empty.
    """
    if not image_paths:
        raise ValueError("image_paths must contain at least one image")

    images = [Image.open(path).convert("RGB") for path in image_paths]
    inputs = clip_processor(text=[text_query], images=images, return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        outputs = clip_model(**inputs)
        # logits_per_text has one row per text and one column per image, so the
        # softmax along dim=1 ranks the images for the single query. Using
        # logits_per_image here would normalise over the one text, giving 1.0
        # for every image and always selecting the first one.
        probs = outputs.logits_per_text.softmax(dim=1).cpu().numpy()[0]

    best_image_idx = int(np.argmax(probs))
    return image_paths[best_image_idx], float(probs[best_image_idx])

# Use Case 4: Image Search by Image (find most similar image)
def image_search_by_image(query_image_path, image_paths, clip_model, clip_processor, device):
    """Find the candidate image most similar to a query image.

    Similarity is the cosine similarity between CLIP image features.

    Args:
        query_image_path: Path of the query image file.
        image_paths: Paths of the candidate image files.
        clip_model: CLIP model providing ``get_image_features``.
        clip_processor: Processor that prepares the image inputs.
        device: Device the prepared inputs are moved to.

    Returns:
        tuple: ``(best_path, similarity)`` where ``similarity`` is the cosine
        similarity of the best match as a Python ``float``.

    Raises:
        ValueError: If ``image_paths`` is empty.
    """
    if not image_paths:
        raise ValueError("image_paths must contain at least one image")

    def _encode(image):
        # Feature vector of one image, flattened to a 1-D NumPy array.
        inputs = clip_processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            features = clip_model.get_image_features(**inputs)
        return features.cpu().numpy()[0]

    query_features = _encode(Image.open(query_image_path).convert("RGB"))
    query_norm = np.linalg.norm(query_features)

    # Cosine similarity per candidate, stored as plain floats. Returning a scalar
    # rather than a (1, 1) array means callers can call float() on it without
    # NumPy's array-to-scalar deprecation warning.
    similarities = []
    for path in image_paths:
        feat = _encode(Image.open(path).convert("RGB"))
        similarities.append(
            float(np.dot(query_features, feat) / (query_norm * np.linalg.norm(feat)))
        )

    best_image_idx = int(np.argmax(similarities))
    return image_paths[best_image_idx], similarities[best_image_idx]

def main():
    """Run the four CLIP / Phi-4 Mini demos on the downloaded sample images."""
    # Download sample images
    image_paths, captions = download_sample_images()
    if not image_paths:
        # Every demo below needs at least one image; stop instead of failing later.
        print("No sample images could be downloaded; nothing to do.")
        return

    # Initialize models
    clip_model, clip_processor, llm, device = initialize_models()

    # Use Case 1: Image Recognition
    print("\n=== Image Recognition ===")
    labels = ["a house", "a cat", "a beach", "a car", "a tree"]
    for i, image_path in enumerate(image_paths):
        predicted_label, probs = image_recognition(image_path, labels, clip_model, clip_processor, device)
        print(f"Image {i+1} ({image_path}):")
        print(f"Predicted: {predicted_label}")
        print(f"Probabilities: {probs}")

    # Use Case 2: Image Description
    print("\n=== Image Description ===")
    for i, image_path in enumerate(image_paths):
        description = image_description(image_path, clip_model, clip_processor, llm, device)
        print(f"Image {i+1} ({image_path}):")
        print(f"Description: {description}")

    # Use Case 3: Image Search by Text
    print("\n=== Image Search by Text ===")
    text_query = "A tropical beach with palm trees"
    best_image, score = image_search_by_text(text_query, image_paths, clip_model, clip_processor, device)
    print(f"Text Query: '{text_query}'")
    print(f"Best Match: {best_image} (Score: {score:.4f})")

    # Use Case 4: Image Search by Image
    print("\n=== Image Search by Image ===")
    query_image_path = image_paths[0]  # Use first image as query
    # Search only the other images: if the query is among the candidates it
    # always matches itself with similarity 1.0, which demonstrates nothing.
    candidate_paths = image_paths[1:]
    if candidate_paths:
        best_match, similarity = image_search_by_image(query_image_path, candidate_paths, clip_model, clip_processor, device)
        # np.ravel accepts a plain scalar or a one-element array, so no
        # array-to-scalar conversion warning is raised either way.
        similarity_value = float(np.ravel(similarity)[0])
        print(f"Query Image: {query_image_path}")
        print(f"Best Match: {best_match} (Similarity: {similarity_value:.4f})")
    else:
        print("Need at least two images to run image-to-image search.")

if __name__ == "__main__":
    main()

Downloaded image 1 to sample_images/image_1.jpg
Downloaded image 2 to sample_images/image_2.jpg
Downloaded image 3 to sample_images/image_3.jpg

=== Image Recognition ===
Image 1 (sample_images/image_1.jpg):
Predicted: a house
Probabilities: {'a house': np.float32(0.9935417), 'a cat': np.float32(0.0001029685), 'a beach': np.float32(7.3100186e-05), 'a car': np.float32(0.00032168283), 'a tree': np.float32(0.0059605367)}
Image 2 (sample_images/image_2.jpg):
Predicted: a cat
Probabilities: {'a house': np.float32(0.00063693087), 'a cat': np.float32(0.9987239), 'a beach': np.float32(4.9661096e-05), 'a car': np.float32(0.00033760496), 'a tree': np.float32(0.00025181822)}
Image 3 (sample_images/image_3.jpg):
Predicted: a beach
Probabilities: {'a house': np.float32(0.0013105997), 'a cat': np.float32(0.000164817), 'a beach': np.float32(0.9975725), 'a car': np.float32(0.00045183016), 'a tree': np.float32(0.000500346)}

=== Image Description ===
Image 1 (sample_images/image_1.jpg):
Description: Th

  print(f"Best Match: {best_match} (Similarity: {float(similarity):.4f})")
