# Neural Semantic Search Engine

In [2]:

!pip install sentence-transformers faiss-cpu numpy pandas
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import os


def load_and_preprocess_data(file_path):
    """
    Load a plain-text file and split it into paragraph-sized documents.

    Paragraphs are runs of text separated by one or more blank lines; a
    "blank" line may contain spaces or tabs. Each paragraph is stripped of
    leading/trailing whitespace and empty paragraphs are dropped.

    Args:
        file_path: Path (str) to a UTF-8 encoded ``.txt`` file. The
            extension check is case-insensitive.

    Returns:
        A ``(documents, metadata)`` tuple. ``documents`` is a list of
        paragraph strings; ``metadata`` is a parallel list of dicts with
        the keys ``"source"`` (file base name) and ``"chunk_id"``
        (0-based paragraph position).

    Raises:
        ValueError: If the file is not a ``.txt`` file, or if it contains
            no non-empty paragraphs.
        OSError: Propagated from ``open`` if the file cannot be read.
    """
    # Function-scope import keeps this block self-contained.
    import re

    # Fail with an accurate message instead of the generic
    # "No documents found" when the format is simply not handled.
    if not file_path.lower().endswith('.txt'):
        raise ValueError(
            f"Unsupported file type for '{file_path}': only .txt files are supported."
        )

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split on blank lines, tolerating separator lines that hold only
    # whitespace (a literal '\n\n' split would glue those paragraphs together).
    documents = [p.strip() for p in re.split(r'\n\s*\n', content) if p.strip()]

    if not documents:
        raise ValueError("No documents found in the provided file path.")

    source = os.path.basename(file_path)
    metadata = [{"source": source, "chunk_id": i} for i, _ in enumerate(documents)]

    return documents, metadata


def generate_embeddings(documents, model_name='all-MiniLM-L6-v2'):
    """
    Encode ``documents`` with a Sentence-Transformer model.

    Args:
        documents: The texts to embed, passed straight to ``model.encode``.
        model_name: Name of the Sentence-Transformer model to load.
            Defaults to 'all-MiniLM-L6-v2'; another model name (for
            example 'multi-qa-mpnet-base-dot-v1') can be supplied instead.

    Returns:
        A ``(corpus_embeddings, model)`` tuple: the embeddings requested as
        a NumPy array, and the loaded model so callers can reuse it to
        encode queries.
    """
    print(f"Loading Sentence-Transformer model: {model_name}...")
    encoder = SentenceTransformer(model_name)

    print("Generating embeddings for documents...")
    vectors = encoder.encode(documents, convert_to_numpy=True)
    print(f"Embeddings generated with shape: {vectors.shape}")

    return vectors, encoder


def create_faiss_index(embeddings):
    """
    Build a flat L2 FAISS index containing ``embeddings``.

    ``IndexFlatL2`` performs an exhaustive (brute-force) search. For very
    large collections (millions of vectors) consider an approximate index
    such as ``IndexIVFFlat`` or ``IndexHNSWFlat`` instead.

    Args:
        embeddings: Array of vectors to index; the vector dimension is
            read from ``embeddings.shape[1]``.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    dimension = embeddings.shape[1]
    print(f"Creating a FAISS index with dimension {dimension}...")

    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(embeddings)

    print(f"Index created with {flat_index.ntotal} vectors.")
    return flat_index


def semantic_search(query, model, index, documents, k=5):
    """
    Return the documents nearest to ``query`` according to ``index``.

    Args:
        query: The search text.
        model: Object whose ``encode`` method turns a list of strings into
            query vectors (the model used to embed the corpus).
        index: Object whose ``search(vectors, k)`` method returns a
            ``(scores, indices)`` pair of 2-D arrays, e.g. a FAISS index.
        documents: Sequence of texts whose positions correspond to the
            vector ids stored in ``index``.
        k: Maximum number of results to return.

    Returns:
        A list of ``{"score": ..., "document": ...}`` dicts in the order
        produced by the index. The list holds fewer than ``k`` entries when
        the index cannot supply ``k`` neighbours. With an L2 index such as
        the one built by ``create_faiss_index``, the score is a distance,
        so smaller values mean closer matches.
    """
    print(f"\nSearching for query: '{query}'")
    query_embedding = model.encode([query])
    scores, indices = index.search(query_embedding, k)

    results = []
    for score, doc_idx in zip(scores[0], indices[0]):
        # FAISS pads the result row with -1 when fewer than k vectors are
        # available; indexing documents[-1] would silently return the last
        # document as a bogus hit, so padded slots are skipped.
        if doc_idx < 0:
            continue
        results.append({
            "score": score,
            "document": documents[doc_idx]
        })
    return results


def run_semantic_search_project():
    """
    Run the whole demo pipeline: seed a sample corpus, embed it, index it,
    and print the top matches for one example query.

    A file named ``sample_corpus.txt`` is created in the current working
    directory if it does not already exist; an existing file is reused
    untouched. Results are printed rather than returned.
    """
    print("--- Running Semantic Search Project ---")

    corpus_path = "sample_corpus.txt"
    sample_paragraphs = (
        "The quick brown fox jumps over the lazy dog.\n\n",
        "A dog is a man's best friend, known for its loyalty and companionship.\n\n",
        "Machine learning is a subset of artificial intelligence.\n\n",
        "Deep learning models are a type of neural network with many layers.\n\n",
        "A fox is a clever and cunning animal, often found in forests.\n\n",
    )

    # Only seed the corpus on the first run so user edits are preserved.
    if not os.path.exists(corpus_path):
        with open(corpus_path, "w", encoding="utf-8") as corpus_file:
            corpus_file.writelines(sample_paragraphs)

    # Load -> embed -> index.
    texts, _metadata = load_and_preprocess_data(corpus_path)
    embeddings, encoder = generate_embeddings(texts)
    index = create_faiss_index(embeddings)

    # Query the index with a fixed example question.
    question = "What is artificial intelligence about?"
    hits = semantic_search(question, encoder, index, texts, k=3)

    print("\n--- Search Results ---")
    for hit in hits:
        print(f"Score: {hit['score']:.4f}")
        print(f"Document: {hit['document']}\n")

# Run the demo pipeline end to end when this cell is executed.
run_semantic_search_project()

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0
--- Running Semantic Search Project ---
Loading Sentence-Transformer model: all-MiniLM-L6-v2...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings for documents...
Embeddings generated with shape: (5, 384)
Creating a FAISS index with dimension 384...
Index created with 5 vectors.

Searching for query: 'What is artificial intelligence about?'

--- Search Results ---
Score: 0.7304
Document: Machine learning is a subset of artificial intelligence.

Score: 1.2591
Document: Deep learning models are a type of neural network with many layers.

Score: 1.7636
Document: A dog is a man's best friend, known for its loyalty and companionship.

