# Run

In [1]:
# Setup - Install Dependencies

!pip install streamlit
!pip install -q torch torchvision torchaudio
!pip install -q git+https://github.com/facebookresearch/segment-anything.git
!pip install -q opencv-python matplotlib transformers ftfy regex faiss-cpu
!pip install -q transformers faiss-cpu
!pip install -q transformers accelerate
!npm install localtunnel

Collecting streamlit
  Downloading streamlit-1.47.1-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.47.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInst

In [2]:
# Download the SAM ViT-B checkpoint and save it as sam_vit_b.pth (the file name app.py loads).
# Note: -O overwrites the target, so re-running this cell downloads the checkpoint again.
!wget -q https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth -O sam_vit_b.pth

In [3]:
%%writefile app.py

import streamlit as st
from PIL import Image
import time
import os
from io import BytesIO
import torch
import cv2
import numpy as np
import matplotlib.pyplot as plt
from segment_anything import sam_model_registry, SamPredictor
from torchvision import transforms
import requests
import re
import faiss
import pandas as pd
import spacy
from huggingface_hub import InferenceClient
from segment_anything import sam_model_registry, SamPredictor
from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM, pipeline

# Device string handed to every model below: "cuda" when PyTorch sees a GPU, else "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

def call_llm(prompt=None):
    """Send one user message to the hosted chat model and return its reply.

    Args:
        prompt: Text to send as the single user message. When omitted, the
            module-level ``llm_prompt`` variable is used, so existing
            ``call_llm()`` call sites keep working unchanged.

    Returns:
        The content of the first returned choice, with surrounding
        whitespace stripped.

    Raises:
        NameError: If no prompt is passed and ``llm_prompt`` is not defined.
    """
    if prompt is None:
        # Backward-compatible path: callers assign `llm_prompt` at module level
        # and then call call_llm() without arguments.
        prompt = globals().get("llm_prompt")
        if prompt is None:
            raise NameError("call_llm() needs a prompt argument or a module-level 'llm_prompt'")

    response = client.chat.completions.create(
        model="HuggingFaceH4/zephyr-7b-beta",
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=200
    )

    return response['choices'][0]['message']['content'].strip()

import random

def build_prompt(tags, destination):
    """Build the LLM prompt for the first recommendation after an image upload.

    Args:
        tags: Keywords describing the uploaded image; interpolated as-is.
        destination: Mapping with a ``'location'`` and a ``'description'`` entry
            for the best-matching photo.

    Returns:
        The prompt text. One of three "similar destination" phrasings is picked
        at random for the second sentence.
    """
    location = destination['location']

    # Safe fallback for unknown location: a missing, blank, non-string or
    # "Unknown" value must never be quoted back to the user as a place name.
    location_known = (
        isinstance(location, str)
        and bool(location.strip())
        and location.strip().lower() != 'unknown'
    )

    if location_known:
        alt_location_sentence = f"A similar travel destination could be '{location}'."
        similar_phrases = [
            f"A similar travel destination could be \"{location}\".",
            f"Another amazing place you might love is \"{location}\".",
            f"A great place to consider is \"{location}\"."
        ]
    else:
        location_phrase = "this stunning landscape"
        alt_location_sentence = "This scenery is truly a hidden gem waiting to be explored."
        similar_phrases = [
            f"A similar travel destination could be a place like {location_phrase}.",
            f"Another amazing place you might love is somewhere like {location_phrase}.",
            f"A great place to consider is somewhere like {location_phrase}."
        ]
    chosen_phrase = random.choice(similar_phrases)

    return (
        f"""
        The user wants a destination that matches these tags: {tags}.

        The most visually similar destination is: {destination['location']}.
        Destination description: {destination['description']}.

        If the location is unknown, suggest a real or plausible location that matches the scenery based on the tags. If you can't, describe the scenery naturally without naming a place.

        Write a short explanation in exactly 8 sentences:
        1 Start with an sentence like : "What a stunning place!"
        2 Then say: "{chosen_phrase}"
        3 Then say: {alt_location_sentence}
        4 In the next sentence, briefly introduce the place (where it is, what it is).
        5 In the next sentence, highlight what makes it special.
        6 In the next sentence, describe what makes it visually or culturally unique.
        7 In the next sentence, say why it’s famous and why someone should visit.
        8 End with a friendly invitation to visit — keep it short and warm.
        Make this one paragraph. Do not include numbers. Keep the whole explanation in 200 tokens. Do not exceed this.

        IMPORTANT:
        - Make sure the description strictly relates to the tags and factual information about the location.
        - Avoid speculation or adding information not supported by the tags or description.
        - Maintain clarity and avoid generic statements.
        - Do not repeat phrases unnecessarily.
        """
    )

def build_feedback_prompt(tags, destination):
    """Compose the LLM prompt used when replying to a follow-up text message.

    Args:
        tags: Current tag list; interpolated into the prompt as-is.
        destination: Mapping providing ``'location'`` and ``'description'``.

    Returns:
        The prompt text asking for a single 5-7 sentence message.
    """
    place_name = destination['location']
    place_details = destination['description']

    return f"""
    The user wants a destination that matches these tags: {tags}.

    The most visually similar destination is: {place_name}.
    Destination description: {place_details}.

    If the location is unknown, suggest a real or plausible location that matches the scenery based on the tags. If you can't, describe the scenery naturally without naming a place.

    Write a single, warm, descriptive message in **exactly 5 to 7 complete sentences**.
    Start with a phrase like “Oh, I see you want...” referencing the user’s interest.
    Mention the place name naturally and speak directly to the reader (do not use “we”).
    Describe what makes it special, what they can see or do there, and inspire them to visit.


    IMPORTANT:
        - Make sure the description strictly relates to the tags and factual information about the location.
        - Avoid speculation or adding information not supported by the tags or description.
        - Maintain clarity and avoid generic statements.
        - Do not repeat phrases unnecessarily.
        - Give one output suggestion only.
        - Do not include instructions — output only the final message.
        - Keep the total length under 200 tokens.

    Output only the final text.
    """


####################################### VisionAgent #######################################################

@st.cache_resource(show_spinner=False)
def _load_vision_models(device_name):
    """Load SAM, BLIP and CLIP once and reuse them across Streamlit reruns.

    Streamlit re-executes this script on every chat interaction; without the
    cache, all three models would be loaded and moved to the device each time.
    ``show_spinner=False`` is used so that loading does not draw a spinner
    element ahead of ``st.set_page_config()`` later in the script.

    NOTE(review): cached resources are shared by all sessions of the server
    process, including the stateful ``SamPredictor`` — confirm that is
    acceptable if several users are expected at once.

    Args:
        device_name: Torch device string ("cuda" or "cpu").

    Returns:
        Tuple ``(sam, predictor, blip_processor, blip_model, clip_model,
        clip_processor)``.
    """
    # Load SAM model
    sam_model = sam_model_registry["vit_b"](checkpoint="sam_vit_b.pth")
    sam_model.to(device_name)
    sam_predictor = SamPredictor(sam_model)

    # Load BLIP model
    caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device_name)

    # Load CLIP model
    embedding_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device_name)
    embedding_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    return (
        sam_model,
        sam_predictor,
        caption_processor,
        caption_model,
        embedding_model,
        embedding_processor,
    )


# Same module-level names as before, now backed by the cached loader.
sam, predictor, blip_processor, blip_model, clip_model, clip_processor = _load_vision_models(device)

# Generate caption
def generate_caption(image_pil):
    """Caption an image with BLIP using nucleus sampling.

    Args:
        image_pil: Image object handed to ``blip_processor``.

    Returns:
        The decoded caption string for the first generated sequence. Sampling
        is enabled, so repeated calls may return different captions.
    """
    sampling_options = dict(
        do_sample=True,
        top_p=0.9,               # Nucleus sampling
        temperature=0.8,         # Add variability
        max_length=100,
        repetition_penalty=1.2,
    )
    model_inputs = blip_processor(image_pil, return_tensors="pt").to(device)
    generated_ids = blip_model.generate(**model_inputs, **sampling_options)
    return blip_processor.decode(generated_ids[0], skip_special_tokens=True)

# Generate segment and caption
def segment_and_caption_image(image):
    """Run SAM on the image centre, then caption the whole image with BLIP.

    Args:
        image: PIL-style image exposing ``convert("RGB")``.

    Returns:
        The caption string produced by ``generate_caption(image)``.
    """
    # SamPredictor.set_image() treats its input as RGB unless told otherwise,
    # so pass the RGB array directly instead of converting it to BGR first.
    image_rgb = np.array(image.convert("RGB"))
    predictor.set_image(image_rgb)

    # Input point in the center of the image (x, y order)
    height, width = image_rgb.shape[:2]
    input_point = np.array([[width // 2, height // 2]])
    input_label = np.array([1])  # 1 = positive label

    # Generate segmentation masks.
    # NOTE(review): the masks are not used for captioning below; the call is
    # kept so the predictor is exercised exactly as before — confirm whether
    # the segmentation step is still wanted.
    predictor.predict(
        point_coords=input_point,
        point_labels=input_label,
        multimask_output=True
    )

    return generate_caption(image)

def extract_keywords_from_caption(caption_):
    """Split a caption into lowercase keywords, dropping common stopwords.

    Args:
        caption_: Caption text. Anything that is not a non-blank string
            yields an empty list.

    Returns:
        List of lowercase word tokens in caption order, duplicates kept.
    """
    # Use the argument itself, not a same-named module-level variable.
    if not isinstance(caption_, str) or not caption_.strip():
        return []

    keywords = re.findall(r'\b\w+\b', caption_.lower())
    stopwords = {"a", "the", "at", "on", "in", "with", "and", "of", "to", "is", "by", "an"}
    return [kw for kw in keywords if kw not in stopwords]


################################## DestinationAgent ###############################################

# Load unsplash data
# Get filenames: the FAISS index and its metadata CSV, as absolute paths under /content.
# NOTE(review): match_from_blip_tags looks up CSV rows by the positions FAISS returns,
# so row i of the CSV presumably describes vector i of the index — confirm the files match.
index_file = "/content/unsplash_clip.index"
csv_file = "/content/unsplash_clip_metadata.csv"

# Load files (both are read on every execution of this script)
faiss_index = faiss.read_index(index_file)
metadata_df = pd.read_csv(csv_file)

def get_best_location(row):
    """Pick the first usable location label for a metadata row.

    Columns are tried in order: city, country, landmark name. A value counts
    only if it is a string that is non-empty after stripping whitespace.

    Args:
        row: Mapping-like object supporting ``.get(column)``.

    Returns:
        The stripped value of the first usable column, or ``"Unknown"``.
    """
    preferred_columns = (
        "photo_location_city",
        "photo_location_country",
        "ai_primary_landmark_name",
    )
    raw_values = (row.get(column) for column in preferred_columns)
    stripped_values = (value.strip() for value in raw_values if isinstance(value, str))
    return next((value for value in stripped_values if value), "Unknown")

# Add one display location per photo (row-wise); match_from_blip_tags reads this column.
metadata_df["resolved_location"] = metadata_df.apply(get_best_location, axis=1)

#Setup Embedding Function
def get_clip_embedding(text):
    """Embed a text string with CLIP's text encoder.

    Args:
        text: Query string; it is wrapped in a one-element batch.

    Returns:
        NumPy ``float32`` array holding the text features for that batch.
    """
    encoded = clip_processor(text=[text], return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to(device)
    with torch.no_grad():
        text_features = clip_model.get_text_features(**encoded)
    return text_features.cpu().numpy().astype("float32")

#Define Matching Function with BLIP Tags
def match_from_blip_tags(blip_tags, top_k):
    """Find the photos whose embeddings are closest to the given tags.

    Args:
        blip_tags: Iterable of tag strings; joined with ", " to form the query.
        top_k: Number of neighbours requested from the FAISS index.

    Returns:
        List of dicts (``rank``, ``photo_id``, ``image_url``, ``description``,
        ``location``, ``distance``) in the order FAISS returned them. The list
        may hold fewer than ``top_k`` entries, or be empty.
    """
    query = ", ".join(blip_tags)
    query_emb = get_clip_embedding(query)

    distances, indices = faiss_index.search(query_emb, top_k)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        # FAISS pads the result with -1 when it finds fewer than top_k
        # neighbours; iloc[-1] would silently return the last metadata row.
        if idx < 0:
            continue
        row = metadata_df.iloc[int(idx)]
        results.append({
            "rank": len(results) + 1,
            "photo_id": row.get("photo_id"),
            "image_url": row.get("photo_image_url"),
            "description": row.get("ai_description"),
            "location": row.get("resolved_location", "Unknown"),
            "distance": float(distance)
        })
    return results


#################################### Feedback Agent ##################################################
# Initialize HuggingFace API client
# The access token is read from the HF_TOKEN environment variable so it is never
# written into app.py or the notebook. Set it before launching Streamlit, e.g.
# os.environ["HF_TOKEN"] = getpass.getpass() in a notebook cell.
# When the variable is unset, api_key is None and huggingface_hub falls back to
# its own token lookup (see the InferenceClient documentation).
client = InferenceClient(
    provider="featherless-ai",
    api_key=os.environ.get("HF_TOKEN"),
)

# spaCy pipeline used by extract_adjectives for part-of-speech tagging.
nlp = spacy.load("en_core_web_sm")
# Predefined map of capitalized adjectives to tags (86 entries).
# Each key appears exactly once. The values are the ones that were effectively
# in use before, when repeated keys in the literal let the last occurrence win.
# NOTE(review): a few of those effective values look accidental — "Tropical" ->
# "island", "Vibrant" -> "colorful", "Luxury" -> "high-end" — confirm intent.
adjective_map = {
    "Beautiful": "scenic", "Scenic": "scenic",
    "Cheap": "affordable", "Affordable": "affordable",
    "Crowded": "overpopulated", "Overpopulated": "overpopulated",
    "Expensive": "luxury", "Luxury": "high-end",
    "Quiet": "peaceful", "Peaceful": "peaceful",
    "Rustic": "rural", "Rural": "rural",
    "Romantic": "romantic",
    "Clean": "hygienic", "Hygienic": "hygienic",
    "Warm": "tropical", "Tropical": "island",
    "Cold": "cold", "Winter": "winter",
    "Busy": "urban", "Urban": "urban",
    "Relaxing": "tranquil", "Tranquil": "tranquil",
    "Charming": "charming", "Quaint": "quaint",
    "Historic": "cultural", "Cultural": "cultural",
    "Adventurous": "adventurous", "Adventure": "adventure",
    "Safe": "secure", "Secure": "secure",
    "Family-friendly": "family", "Family": "family",
    "Friendly": "hospitable", "Hospitable": "hospitable",
    "Lively": "lively", "Vibrant": "colorful",
    "Modern": "urban",
    "Island": "island",
    "Unique": "exotic", "Exotic": "exotic",
    "Isolated": "remote", "Remote": "remote",
    "Spacious": "expansive", "Expansive": "expansive",
    "High-end": "high-end",
    "Breathtaking": "scenic",
    "Calm": "calm",
    "Picturesque": "scenic",
    "Majestic": "grand", "Grand": "grand",
    "Aesthetic": "artistic", "Artistic": "artistic",
    "Serene": "calm",
    "Idyllic": "peaceful",
    "Dynamic": "vibrant",
    "Enchanting": "charming",
    "Mysterious": "exotic",
    "Thriving": "urban",
    "Chilly": "cold",
    "Diverse": "multicultural", "Multicultural": "multicultural",
    "Wilderness": "nature", "Nature": "nature",
    "Glistening": "shiny", "Shiny": "shiny",
    "Untouched": "pristine", "Pristine": "pristine",
    "Cozy": "comfortable", "Comfortable": "comfortable",
    "Lush": "green", "Green": "green",
    "Bright": "bright", "Sunny": "sunny",
    "Vast": "expansive",
    "Delightful": "charming",
    "Colorful": "colorful",
    "Fascinating": "intriguing", "Intriguing": "intriguing",
    "Sandy": "beach", "Beach": "beach",
    "Luminous": "bright",
    "Breezy": "refreshing", "Refreshing": "refreshing",
    "Clear": "crystal-clear", "Crystal-clear": "crystal-clear"
}

def extract_adjectives(feedback_text):
    """
    Return the adjectives found in the feedback text, in order of appearance.

    The text is run through the spaCy pipeline ``nlp``; every token tagged
    ``ADJ`` is kept and passed through ``str.capitalize()``.
    """
    found = []
    for token in nlp(feedback_text):
        if token.pos_ != "ADJ":
            continue
        found.append(token.text.capitalize())
    return found

# Handle special phrases (e.g., "not too crowded")
def process_special_phrases(feedback_text, original_tags):
    """Turn known "too X" / "not too X" phrases into tags and cut them from the text.

    Args:
        feedback_text: Raw user feedback.
        original_tags: Tag list, extended in place with any new tags.

    Returns:
        The lowercased text with every matched phrase removed, or the
        original text unchanged (case preserved) when nothing matched.
    """
    # Order matters: each "not too X" entry must precede its "too X"
    # counterpart. Otherwise the shorter phrase matches first and strips part
    # of the negated one, leaving a dangling "not " in the text.
    # NOTE(review): "not too hot" -> "tropical" and "not too cold" -> "tropical"
    # are kept as found — confirm these tags are the intended ones.
    special_phrases = {
        "not too crowded": "peaceful",
        "not too expensive": "affordable",
        "too crowded": "overpopulated",
        "too expensive": "luxury",
        "not too hot": "tropical",
        "too hot": "tropical",
        "not too cold": "tropical",
        "too cold": "winter",
        "not too far": "near",
        "not too close": "remote",
        "not too long": "short",
        "not too short": "long",
        "not too busy": "peaceful",
        "too busy": "urban",
        "not too loud": "quiet",
        "too loud": "noisy",
        "not too bright": "dim",
        "too bright": "sunny",
        "not too rainy": "dry",
        "too rainy": "wet"
    }

    lowered_text = feedback_text.lower()
    matched_any = False
    for phrase, tag in special_phrases.items():
        if phrase in lowered_text:
            if tag not in original_tags:
                original_tags.append(tag)
            lowered_text = lowered_text.replace(phrase, '')
            matched_any = True

    return lowered_text if matched_any else feedback_text



def fallback_to_llm(feedback_text: str, original_tags: list):
    """Ask the chat model to classify the text and propose tags for it.

    Args:
        feedback_text: User text that produced no tags from ``adjective_map``.
        original_tags: Current tags, shown to the model for context.

    Returns:
        ``None`` when the model answers ``null`` (with or without surrounding
        quotes, any letter case); otherwise the list of non-empty tags parsed
        from the comma-separated reply, which may be empty.
    """
    print("Fallback to LLM")
    # Fallback to LLM
    response = client.chat.completions.create(
        model="mistralai/Mistral-7B-Instruct-v0.2",
        messages=[
            {
                "role": "user",
                "content": f"""
                Classify the following text as "feedback", "query", or "assertive chat".
                If it's feedback or a query, provide the relevant tags based on the text and the original tags. Do not put any comment. Just tags please. For each relevant adjective that is not present in the original
                text provide one tag in one word that best fits the adjective you don't have to provide its similar or synonymous tags. Be very diligent in providing the tags. Like not too hot can be cold or warm. The tags has
                to be words. no characters please very mindful regarding this. The tags can also be noun like place name or sight name. You can keep the original tags if you think they are relevant.
                Return only the tags, in a comma-separated list.
                If it's assertive chat, return "null".

                Example 1:
                Input: "Can you show me something cool?"
                Original tags: ["nature", "mountain"]
                Output: "affordable, scenic, cool"

                Example 2:
                Input: "What is the weather like in Switzerland?"
                Original tags: ["travel", "weather"]
                Output: "mountain, Switzerland, adventure"

                Example 3:
                Input: "I don't care about the price, just show me something unique."
                Original tags: ["nature", "adventure"]
                Output: "unique, scenic, exotic"

                Example 4:
                Input: "Great"
                Original tags: ["nature", "mountain"]
                Output: "null"

                Example 5:
                Input: "Thank You"
                Original tags: ["nature", "mountain"]
                Output: "null"


                Input:
                "{feedback_text}"
                Original tags: {original_tags}
                """
            }
        ],
    )

    result = response['choices'][0]['message']['content'].strip()

    # The prompt's own examples show the answer wrapped in double quotes, so
    # remove wrapping quotes before testing for the "null" sentinel.
    if result.strip('"\'').strip().lower() == "null":
        return None

    cleaned_tags = (tag.strip().strip('"\'').strip() for tag in result.split(','))
    # Drop empty fragments such as those produced by a trailing comma.
    return [tag for tag in cleaned_tags if tag]

# Main processing function
def process_feedback(feedback_text: str, original_tags: list):
    """
    Update the tag list from a piece of user feedback.

    Special phrases are handled first, then adjectives are looked up in
    ``adjective_map``; if that yields nothing, the LLM fallback is asked.

    Args:
        feedback_text: The user's message.
        original_tags: Current tags. Extended in place; ``None`` is treated as
            an empty list (in which case a new list is returned).

    Returns:
        The tag list with any new tags appended. It is unchanged when neither
        the map nor the LLM produced a tag, e.g. for plain chat.
    """
    if original_tags is None:
        # No image has been analysed yet, so there are no tags to extend.
        original_tags = []

    feedback_text = process_special_phrases(feedback_text, original_tags)  # Handle special phrases

    adjectives = extract_adjectives(feedback_text)

    # Find tags from predefined map; dict.fromkeys de-duplicates while keeping
    # first-seen order, so the resulting tag order is deterministic.
    mapped = (adjective_map.get(adj) for adj in adjectives)
    tags = list(dict.fromkeys(tag for tag in mapped if tag))

    if not tags:
        # fallback_to_llm returns None for plain chat — nothing to add then.
        tags = fallback_to_llm(feedback_text, original_tags) or []

    for tag in tags:
        if tag not in original_tags:
            original_tags.append(tag)
    return original_tags


#################################### Streamlit App ###################################################

# Page setup: the first Streamlit UI command issued by this script.
st.set_page_config(page_title="ExplainTrip", layout="wide", initial_sidebar_state="collapsed")
st.title("ExplainTrip!")

# Chat transcript kept in session state; entries are {"role": ..., "content": ...} dicts.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Tags from the most recently analysed image; None until an image has been processed.
if 'image_tags' not in st.session_state:
    st.session_state['image_tags'] = None

# Replay the stored transcript. Only text is stored in `messages`, so images
# shown in earlier turns are not redrawn here.
for msg in st.session_state.messages:
    with st.chat_message(msg["role"]):
        st.write(msg["content"])

def fit_image_contain(img, target_width, target_height, color=(255, 255, 255)):
    """Letterbox an image into a fixed-size canvas without distorting it.

    The image is scaled (LANCZOS) to the largest size that fits inside
    ``target_width`` x ``target_height`` at its own aspect ratio, then pasted
    centred onto a new RGB canvas filled with ``color``.

    Args:
        img: Source image exposing ``width``, ``height`` and ``resize``.
        target_width: Canvas width in pixels.
        target_height: Canvas height in pixels.
        color: RGB fill colour for the padding (white by default).

    Returns:
        The new canvas image of size ``(target_width, target_height)``.
    """
    source_ratio = img.width / img.height
    box_ratio = target_width / target_height

    if source_ratio > box_ratio:
        # Relatively wider than the box: width is the limiting side.
        scaled_size = (target_width, int(target_width / source_ratio))
    else:
        # Relatively taller (or same ratio): height is the limiting side.
        scaled_size = (int(target_height * source_ratio), target_height)

    scaled = img.resize(scaled_size, Image.LANCZOS)

    canvas = Image.new("RGB", (target_width, target_height), color)
    offset = (
        (target_width - scaled_size[0]) // 2,
        (target_height - scaled_size[1]) // 2,
    )
    canvas.paste(scaled, offset)
    return canvas

def clean_message(llm_output: str) -> str:
    """Strip an ``Example: "..."`` wrapper from an LLM reply.

    Args:
        llm_output: Raw text returned by the model.

    Returns:
        The text between the wrapper's quotes when the reply starts with
        ``Example: "`` and ends with ``"``; otherwise the input unchanged.
    """
    prefix = 'Example: "'
    if llm_output.startswith(prefix) and llm_output.endswith('"'):
        return llm_output[len(prefix): -1]
    return llm_output  # unexpected format: pass the text through untouched

def _post_bot_message(text):
    """Append a bot message to the transcript and render it."""
    st.session_state.messages.append({"role": "bot", "content": text})
    with st.chat_message("bot"):
        st.write(text)


def _show_recommendation(reply_text, destination):
    """Record and render the bot reply, followed by the matched photo if it loads.

    A failed or slow download, or an undecodable image, produces a warning
    instead of aborting the script run.
    """
    st.session_state.messages.append({"role": "bot", "content": reply_text})
    with st.chat_message("bot"):
        st.write(reply_text)
        image_url = destination.get("image_url")
        if not isinstance(image_url, str) or not image_url:
            # The metadata row has no usable URL (e.g. a missing CSV value).
            return
        try:
            response = requests.get(image_url, timeout=15)
            response.raise_for_status()
            result_img = Image.open(BytesIO(response.content))
            fitted_result_img = fit_image_contain(result_img, 400, 300)  # Fit in 400x300 with white background
        except (requests.RequestException, OSError) as exc:
            st.warning(f"Could not load the destination photo: {exc}")
            return
        st.image(fitted_result_img, caption=f"📍 {destination['location']}", use_container_width=False)


prompt = st.chat_input(placeholder="Got a favorite place? Let’s find its match!...",accept_file=True, file_type=["jpg", "jpeg", "png"])

if prompt:
    if prompt.files:
        # An upload is handled whether or not text was typed alongside it.
        uploaded_file = prompt.files[0]
        st.session_state.messages.append({"role": "user", "content": f"Uploaded file: {uploaded_file.name}"})
        with st.chat_message("user"):
            img = Image.open(uploaded_file)
            fitted_img = fit_image_contain(img, 400, 300)  # Fit in 400x300 with white background
            st.image(fitted_img, caption=uploaded_file.name, use_container_width=False)
            if prompt.text:
                st.write(prompt.text)

        time.sleep(1)
        _post_bot_message("Thanks for the image! Let me find similar places...")

        with st.spinner("Thinking..."):
            # Caption the image and derive tags from the caption
            caption = segment_and_caption_image(img)
            blip_tags = extract_keywords_from_caption(caption)
            st.session_state['image_tags'] = blip_tags
            # Match and display
            results = match_from_blip_tags(blip_tags, top_k=25)
            if results:
                top_result = results[0]
                # call_llm() picks the prompt up from the module-level `llm_prompt`.
                llm_prompt = build_prompt(blip_tags, top_result)
                response_text = call_llm()
                _show_recommendation(response_text, top_result)
            else:
                _post_bot_message("Sorry, I could not find a matching destination for this image.")
    elif prompt.text:
        user_text = prompt.text
        st.session_state.messages.append({"role": "user", "content": user_text})
        with st.chat_message("user"):
            st.write(user_text)

        time.sleep(1)
        original_tags = st.session_state['image_tags']
        if original_tags is None:
            # No image has been analysed in this session, so there are no tags to refine.
            _post_bot_message("Please upload a photo of a place you like first, then tell me what to change.")
        else:
            with st.spinner("Thinking..."):
                feedback_text = user_text
                updated_tags = process_feedback(feedback_text, original_tags)
                # Match and display
                results = match_from_blip_tags(updated_tags, top_k=25)
                if results:
                    top_result = results[0]
                    # call_llm() picks the prompt up from the module-level `llm_prompt`.
                    llm_prompt = build_feedback_prompt(updated_tags, top_result)
                    response_text = call_llm()
                    _show_recommendation(response_text, top_result)
                else:
                    _post_bot_message("Sorry, I could not find a matching destination for that request.")


Writing app.py


In [4]:
# localtunnel asks for a password before showing the app: it is this machine's
# public IPv4 address, which the command below prints.
!curl ipv4.icanhazip.com

34.91.129.20


In [6]:
# Start the Streamlit app in the background (its output is redirected to logs.txt),
# then expose port 8501 through localtunnel; the public URL is printed below.
# NOTE(review): 8501 is presumably the port Streamlit binds here — check logs.txt if the tunnel shows nothing.
!streamlit run app.py &>./logs.txt & npx localtunnel --port 8501

[1G[0K⠙[1G[0K⠹[1G[0Kyour url is: https://eight-birds-tease.loca.lt
^C
