<a href="https://colab.research.google.com/github/Rashmi-debug43/Statathon/blob/Semantic-Search/Semantic_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ======================================
# 1. IMPORT LIBRARIES
# ======================================
import pandas as pd
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# ======================================
# 2. LOAD DATASET
# ======================================
# Read the occupation table from the notebook's /content directory.
df = pd.read_csv("/content/NSS.csv.csv")
# Trim leading/trailing whitespace from the header names so the later
# lookups of "Title", "Code" and "Description" are not broken by padding.
df.columns = df.columns.str.strip()


# ======================================
# 3. LOAD EMBEDDING MODEL
# ======================================
# Sentence-embedding model shared by the corpus encoding below and by every
# query in identify_top_k_occupations. The recorded cell output shows its
# files being fetched from the Hugging Face Hub on this run.
model = SentenceTransformer("all-MiniLM-L6-v2")


# ======================================
# 4. CREATE EMBEDDINGS
# ======================================
# Encode every occupation description once, up front, so that a search only
# has to embed the user's query.
# NOTE(review): astype(str) presumably turns a missing description into the
# literal text "nan", which would then be embedded like any other text --
# confirm the dataset has no empty Description cells.
descriptions = df["Description"].astype(str).tolist()
# Stored as a NumPy array (convert_to_numpy=True); it is later compared
# against the query embedding with cosine_similarity.
description_embeddings = model.encode(
    descriptions,
    show_progress_bar=True,
    convert_to_numpy=True
)


# ======================================
# 5. CONFIDENCE LEVEL FUNCTION
# ======================================
def confidence_label(score):
    """Translate a similarity score into a ``(level, emoji)`` pair.

    Scores of 0.75 and above are "High", scores of 0.55 and above are
    "Medium", and everything else (including NaN, which fails every
    comparison) is "Low".
    """
    # Ordered from the strictest cut-off down; the first one met wins.
    tiers = (
        (0.75, ("High", "🟢")),
        (0.55, ("Medium", "🟡")),
    )
    for cutoff, label in tiers:
        if score >= cutoff:
            return label
    return "Low", "🔴"


# ======================================
# 6. TOP-K MATCH FUNCTION
# ======================================
def identify_top_k_occupations(user_input, k):
    """Return a formatted report of the ``k`` occupations closest to a query.

    The query is embedded with the module-level ``model`` and compared by
    cosine similarity against ``description_embeddings``; the best matches
    are looked up in ``df`` and rendered as text, best match first.

    Args:
        user_input: Free-text job description. ``None``, empty, or
            whitespace-only input yields a warning message instead of a
            search.
        k: Number of matches requested. Accepts any value ``int()`` can
            convert (e.g. a float coming from a UI slider) and is clamped
            to the range ``1 .. number of descriptions``.

    Returns:
        A single string: either the warning message, or one report section
        per match concatenated together.
    """
    # Guard against None as well as blank text so a missing value does not
    # crash on .strip().
    if user_input is None or not user_input.strip():
        return "⚠️ Please enter a job description."

    query_embedding = model.encode([user_input], convert_to_numpy=True)
    similarities = cosine_similarity(query_embedding, description_embeddings)[0]

    # Slice bounds must be integers, and k <= 0 must not be used as-is:
    # [-0:] would select every row and a negative k would flip the slice.
    k = max(1, min(int(k), len(similarities)))

    # argsort is ascending, so take the last k entries and reverse them.
    top_k_indices = similarities.argsort()[-k:][::-1]

    sections = []

    for idx in top_k_indices:
        score = similarities[idx]
        level, emoji = confidence_label(score)
        row = df.iloc[idx]

        sections.append(f"""
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🎯 Confidence Score : {score:.4f}
📊 Confidence Level : {emoji} {level}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔖 KEY CLASSIFICATION
Occupational Title : {row['Title']}
NCO 2015           : {row['Code']}

📄 FULL RECORD
S No               : {idx + 1}
Occupational Title : {row['Title']}
NCO 2015           : {row['Code']}
Description        : {row['Description']}
""")

    # Join once instead of growing a string with += on every iteration.
    return "".join(sections)


# ======================================
# 7. CLEAR FUNCTION
# ======================================
def clear_all():
    """Return the reset values for the Clear button's three outputs.

    The tuple is, in order: an empty job description, the Top-K slider's
    starting value of 5, and an empty results box.
    """
    blank_text = ""
    default_top_k = 5
    return blank_text, default_top_k, blank_text


# ======================================
# 8. GRADIO UI (TEXT ONLY)
# ======================================
# Build the page. Components are created inside the Blocks context in the
# order written here: header, query box, slider, buttons, then results.
with gr.Blocks() as demo:

    # Inline stylesheet that caps the app container at 1200px wide.
    gr.HTML("""
    <style>
        .gradio-container {
            max-width: 1200px !important;
        }
    </style>
    """)

    # Page title and subtitle.
    gr.Markdown("""
    # 🔍 Top-K Semantic Occupational Mapping (MOSPI)
    ### AI-based Job → NCO 2015 Mapping
    *Sentence-BERT • Confidence Levels*
    """)

    # Free-text query; passed as the first argument of the search function.
    user_input = gr.Textbox(
        lines=5,
        label="📝 Job Description",
        placeholder="Eg: auditor, accounts executive, billing clerk..."
    )

    # Number of matches to show: 1 to 10 in steps of 1, starting at 5.
    top_k = gr.Slider(1, 10, value=5, step=1, label="🔢 Top-K Matches")

    # Action buttons placed together in one row.
    with gr.Row():
        submit_btn = gr.Button("🚀 Submit", variant="primary")
        clear_btn = gr.Button("🧹 Clear")

    # Text area that receives the formatted match report.
    output_text = gr.Textbox(
        label="📄 Occupation Classification (Detailed)",
        lines=25
    )

    # Submit: run the search on (query, k) and show the report.
    submit_btn.click(
        identify_top_k_occupations,
        inputs=[user_input, top_k],
        outputs=output_text
    )

    # Clear: takes no inputs; clear_all's three return values are written to
    # the query box, the slider and the results box, in that order.
    clear_btn.click(
        clear_all,
        outputs=[user_input, top_k, output_text]
    )


# ======================================
# 9. LAUNCH
# ======================================
# Start the app. Per the Gradio notices in this notebook's recorded output,
# debug=True keeps the Colab cell running so errors and logs stay visible,
# and share=True was enabled automatically because the notebook is hosted.
demo.launch(debug=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://2a7495da39d1291f89.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
