<a href="https://colab.research.google.com/github/TanviHarihar/PES1UG22CS646_Jenkins/blob/main/nft_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [81]:
# Install required dependencies
!pip install --upgrade pip chromadb langchain sentence-transformers pillow datasets langchain-community fsspec==2024.12.0



In [82]:
import os
import io
import torch
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image
from huggingface_hub import login
from datasets import load_dataset
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
import chromadb

In [83]:
# Secure Hugging Face login
login(os.getenv("HUGGINGFACE_TOKEN"))

# Load dataset
dataset = load_dataset("hongerzh/my-NFT-text-1")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [84]:
for i, entry in enumerate(dataset["train"][:5]):  # Print first 5 samples
    print(f"Entry {i}: {entry}")


Entry 0: image
Entry 1: text


In [85]:
print(type(dataset))  # Should be a dict or Dataset object
print(type(dataset["train"]))  # Should be a list of dicts
print(dataset["train"][:5])  # Print first 5 entries


<class 'datasets.dataset_dict.DatasetDict'>
<class 'datasets.arrow_dataset.Dataset'>
{'image': [<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=800x1000 at 0x7B552F99D090>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=616x1000 at 0x7B5554B90590>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1000x1000 at 0x7B552FADDA10>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=800x1000 at 0x7B57F443A550>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1000x1000 at 0x7B552FAF3410>], 'text': ['An image characterized by the title: "La Meije", several tags: "landscapephotography,alps,mountains,glacier", and a short description: "Part of my series "Light on Alpine Peaks"\nThis was a amazing sunset in the French Alps. This part of the Alps is not very popular, however I always find it spectacular with a lot of mesmerizing landscapes. \nAfter few hours of hiking, the big reward was to spend the night in front of the amazing mountain LA Meije and its glacier, on

In [86]:
import re
from tqdm import tqdm

# Function to extract title, tags, and description from text
def parse_text(text):
    title_match = re.search(r'title:\s*"([^"]+)"', text)
    tags_match = re.search(r'tags:\s*"([^"]+)"', text)

    # Extract everything after "short description:" as description
    description_match = re.search(r'short description:\s*"?(.*)', text, re.DOTALL)

    title = title_match.group(1) if title_match else ""
    tags = tags_match.group(1).split(',') if tags_match else []
    description = description_match.group(1).strip() if description_match else text  # Fallback to full text if not found

    return {"title": title, "tags": tags, "description": description}

# Process dataset without storing images
def process_dataset(dataset):
    for idx, text in tqdm(enumerate(dataset["train"]["text"]), total=len(dataset["train"]["text"]), desc="Processing dataset"):
        parsed = parse_text(text)
        parsed["image_id"] = idx  # Store image index instead of image
        yield parsed

# Store only text data while keeping image reference
processed_data = list(process_dataset(dataset))

# Print sample
print(processed_data[:5])  # Print first 2 entries


Processing dataset: 100%|██████████| 29339/29339 [00:00<00:00, 197050.54it/s]

[{'title': 'La Meije', 'tags': ['landscapephotography', 'alps', 'mountains', 'glacier'], 'description': 'Part of my series "Light on Alpine Peaks"\nThis was a amazing sunset in the French Alps. This part of the Alps is not very popular, however I always find it spectacular with a lot of mesmerizing landscapes. \nAfter few hours of hiking, the big reward was to spend the night in front of the amazing mountain LA Meije and its glacier, one of my favorite in the French Alps."', 'image_id': 0}, {'title': 'Sea bathing', 'tags': ['2d', 'anime', 'digital', 'illustration', 'girl'], 'description': 'Kemomimi girl\n\n2386*3871 pixel png"', 'image_id': 1}, {'title': 'NAGASAKI - MOON', 'tags': ['illustration', 'contemporaryart', 'abstract', 'japan'], 'description': 'NAGASAKI - MOON\n2021\nThe journey from Nagasaki to the Moon by train takes a long time. City pop playlist graced my journey to the moon.\n\n5906x5906 pixels\n300 dpi"', 'image_id': 2}, {'title': 'TIED LOVE', 'tags': ['silence', 'classi




In [87]:
!pip install chromadb sentence-transformers



In [88]:
from sentence_transformers import SentenceTransformer

# Load a lightweight BERT model for text embeddings
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Example: Convert text into an embedding
example_text = "Beautiful mountain landscape with a glacier."
embedding = model.encode(example_text)

print(f"Embedding shape: {embedding.shape}")  # Should be (384,)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding shape: (384,)


In [89]:
import chromadb

# Initialize a persistent ChromaDB collection
chroma_client = chromadb.PersistentClient(path="chroma_nft")
collection = chroma_client.get_or_create_collection("nft_metadata")


In [91]:
existing_ids = set(collection.get()["ids"])  # Fetch already stored IDs

print("🔄 Storing NFT metadata in ChromaDB...")
for item in tqdm(processed_data, desc="Processing NFTs", unit="NFT"):
    unique_id = str(item["image_id"])  # Convert ID to string for consistency

    if unique_id in existing_ids:
        collection.delete(ids=[unique_id])  # Remove old entry before adding new one

    # Create a structured document containing all metadata
    document_text = f"Title: {item['title']}\nTags: {', '.join(item['tags'])}\nDescription: {item['description']}"
    embedding = model.encode(document_text).tolist()

    collection.add(
        ids=[unique_id],
        embeddings=[embedding],
        metadatas=[{
            "title": item["title"],
            "tags": ", ".join(item["tags"]),
            "description": item["description"]
        }],
        documents=[document_text]  # Ensure the full text is stored
    )

    existing_ids.add(unique_id)  # Update set with new ID

print("✅ All NFT metadata stored successfully!")


🔄 Storing NFT metadata in ChromaDB...


Processing NFTs: 100%|██████████| 29339/29339 [15:12<00:00, 32.14NFT/s]

✅ All NFT metadata stored successfully!





In [92]:
# Fetch a few stored NFT metadata from ChromaDB
sample_data = collection.get(limit=5)  # Retrieve 5 entries

# Print the retrieved entries
for idx, (nft_id, metadata) in enumerate(zip(sample_data["ids"], sample_data["metadatas"])):
    print(f"\nNFT {idx+1}:")
    print(f"ID: {nft_id}")
    print(f"Title: {metadata['title']}")
    print(f"Tags: {metadata['tags']}")
    print(f"Description: {metadata['description']}")



NFT 1:
ID: 0
Title: La Meije
Tags: landscapephotography, alps, mountains, glacier
Description: Part of my series "Light on Alpine Peaks"
This was a amazing sunset in the French Alps. This part of the Alps is not very popular, however I always find it spectacular with a lot of mesmerizing landscapes. 
After few hours of hiking, the big reward was to spend the night in front of the amazing mountain LA Meije and its glacier, one of my favorite in the French Alps."

NFT 2:
ID: 1
Title: Sea bathing
Tags: 2d, anime, digital, illustration, girl
Description: Kemomimi girl

2386*3871 pixel png"

NFT 3:
ID: 2
Title: NAGASAKI - MOON
Tags: illustration, contemporaryart, abstract, japan
Description: NAGASAKI - MOON
2021
The journey from Nagasaki to the Moon by train takes a long time. City pop playlist graced my journey to the moon.

5906x5906 pixels
300 dpi"

NFT 4:
ID: 3
Title: TIED LOVE
Tags: silence, classical, painting, 3d, fashion, woman, chiaroscuro, caravaggio, digitalfashion
Description: 

In [93]:
!pip install thefuzz



In [94]:
from thefuzz import process  # Import fuzzy matching

def clean_text(text):
    """Removes unwanted symbols and ensures proper encoding."""
    return text.encode('utf-8').decode('utf-8', 'ignore').replace("\n", " ").strip()

def search_nfts(query: str, top_k: int = 5, sort_by: str = "relevance", filters: dict = None, exact_match: bool = True):
    query_embedding = model.encode(query).tolist()

    # Perform search in ChromaDB with optional filters
    results = collection.query(
    query_embeddings=[query_embedding],
    n_results=top_k,
    where=filters if filters else None
    )

    # Extract results
    metadatas = results.get("metadatas", [])
    distances = results.get("distances", [])
    if not metadatas or not distances:

      return []  # Return empty list if no results
    metadatas, distances = metadatas[0], distances[0]  # Extract first item safely


    # **Clean metadata and add similarity scores**
    for i, meta in enumerate(metadatas):
        meta["title"] = clean_text(meta.get("title", "N/A"))
        meta["tags"] = clean_text(meta.get("tags", "N/A"))
        meta["description"] = clean_text(meta.get("description", "N/A"))
        meta["similarity_score"] = round((1 - (distances[i] / 2)) * 100, 2)


    # **Exact Match Filtering**
    if exact_match:
        exact_results = [meta for meta in metadatas if query.lower() in meta.get("title", "").lower()]
        if exact_results:
            return exact_results  # Return exact matches first

    # **Fuzzy Matching on Titles and Tags**
    titles = [meta["title"] for meta in metadatas if "title" in meta]
    tags_list = [meta["tags"] for meta in metadatas if "tags" in meta]

    # Match on titles
    fuzzy_title_matches = process.extractBests(query, titles, limit=top_k)
    fuzzy_title_results = [meta for meta in metadatas if meta["title"] in [match[0] for match in fuzzy_title_matches]]

    # Match on tags
    fuzzy_tag_matches = process.extractBests(query, tags_list, limit=top_k)
    fuzzy_tag_results = [meta for meta in metadatas if meta["tags"] in [match[0] for match in fuzzy_tag_matches]]

    # Merge results from both fuzzy matches
    fuzzy_results = list({v["title"]: v for v in fuzzy_title_results + fuzzy_tag_results}.values())

    # **Sorting Logic**
    if sort_by == "popularity":
        fuzzy_results = sorted(fuzzy_results, key=lambda x: x.get("likes", 0), reverse=True)
    elif sort_by == "date":
        fuzzy_results = sorted(fuzzy_results, key=lambda x: x.get("created_at", ""), reverse=True)

    return fuzzy_results if fuzzy_results else metadatas  # Return fuzzy matches or original results


In [95]:
# Get user input
query = input("🔍 Enter NFT search query: ")
search_results = search_nfts(query)

# Display results
print("\n🔍 Search Results:")
if search_results:
    for result in search_results:
        title = result.get("title", "N/A")
        tags = result.get("tags", "N/A")
        description = result.get("description", "N/A")
        similarity = round(result.get("similarity_score", 0), 2)

        # Clean description
        description = description.replace("\n", " ").replace('"', '').strip()

        print(f"🎨 Title: {title}")
        print(f"🏷 Tags: {tags}")
        print(f"📝 Description: {description}")
        print(f"📊 Similarity Score: {similarity}%")
        print("-" * 50)
else:
    print("❌ No matching NFTs found.")


🔍 Enter NFT search query: desert safari

🔍 Search Results:
🎨 Title: Camel Fence
🏷 Tags: landscape, landscapephotography, photography, photo, nft, fnd, desert, sand
📝 Description: A fence built to keep camels in (or out) is swallowed up as it follows the contours of a desert dune alongside a highway somewhere near Dubai.  This image has won multiple awards:  2017 - Australian Photography Awards Landscape division 2nd place 2015 - South Australian Travel Photographer of the year winning portfolio 2015 - Australian Professional Photography Awards - Silver with Distinction award 2015 - South Australian Professional Photography Awards - Silver with Distinction award
📊 Similarity Score: 55.49%
--------------------------------------------------
🎨 Title: Desert Dolphin
🏷 Tags: dolphin, desert, form, photography, photo, art, abstraction, landscapephotography, landscape, offer
📝 Description: This picture was taken in the Khara Desert, Isfahan Province, Iran Shape of a dolphin with desert sand Th