<a href="https://colab.research.google.com/github/Rashmi-debug43/Statathon/blob/main/Perfect_match.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# ======================================
# 1. IMPORT LIBRARIES
# ======================================
import pandas as pd
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# ======================================
# 2. LOAD NSS DATASET
# ======================================
df = pd.read_csv("NSS.csv")   # change name if needed
df.columns = df.columns.str.strip()  # guard against stray whitespace in CSV headers


# ======================================
# 3. LOAD EMBEDDING MODEL
# ======================================
model = SentenceTransformer("all-MiniLM-L6-v2")


# ======================================
# 4. CREATE EMBEDDINGS
# NSS HAS NO DESCRIPTIONS -> USE OCCUPATION
# ======================================
# The rows appear to be per-household records (the CSV has a Household_ID
# column), so the same occupation label can repeat many times. Embed each
# distinct label once instead of once per row: the best match is unchanged,
# but encoding time and the similarity matrix shrink to the number of
# distinct occupations. Missing values are dropped so that the literal
# string "nan" never becomes a match candidate.
# `occupations[i]` is the label whose vector is `occupation_embeddings[i]`.
occupations = df["Occupation"].dropna().astype(str).unique().tolist()
occupation_embeddings = model.encode(occupations)


# ======================================
# 5. SEMANTIC MATCH FUNCTION
# ======================================
def _format_mean(values, spec, prefix="", suffix=""):
    """Format the mean of *values* with *spec*, or return "N/A" if it is undefined."""
    mean = values.mean()
    if pd.isna(mean):
        # Empty selection or all-missing column: show a placeholder, not "nan".
        return "N/A"
    return f"{prefix}{mean:{spec}}{suffix}"


def identify_occupation(user_input):
    """Match a free-text job description to the closest occupation label.

    The description is embedded with the module-level ``model`` and compared,
    by cosine similarity, against ``occupation_embeddings``. The label with
    the highest similarity is reported together with summary statistics
    computed over the rows of ``df`` that carry that label.

    Args:
        user_input: Free-text job description typed by the user.

    Returns:
        A multi-line text report containing the matched occupation, average
        monthly income and expenditure, average age, most common gender, the
        three most frequent states and the similarity score. If the input is
        blank, a short prompt asking for a description is returned instead.
    """
    # A blank submission would otherwise be embedded and "matched" to an
    # arbitrary occupation with a meaningless score.
    if user_input is None or not str(user_input).strip():
        return "Please enter a job description to identify an occupation."

    # Embed input job description
    query_embedding = model.encode([user_input])

    # Similarity of the query against every candidate occupation
    similarities = cosine_similarity(query_embedding, occupation_embeddings)[0]
    best_index = int(np.argmax(similarities))
    confidence = float(similarities[best_index])

    # Matched occupation
    occupation = occupations[best_index]

    # The candidate labels were built with astype(str); compare the column the
    # same way so non-string or missing occupations still select their rows.
    subset = df[df["Occupation"].astype(str) == occupation]

    income_text = _format_mean(subset["Monthly_Income"], ".0f", prefix="₹")
    expenditure_text = _format_mean(subset["Monthly_Expenditure"], ".0f", prefix="₹")
    age_text = _format_mean(subset["Age"], ".1f", suffix=" years")

    # mode() returns an empty Series when there are no non-missing values;
    # indexing it directly would raise.
    gender_modes = subset["Gender"].mode()
    common_gender = gender_modes.iloc[0] if not gender_modes.empty else "N/A"

    # map(str, ...) keeps the join working when states are stored as codes.
    top_states = subset["State"].value_counts().head(3).index
    states_text = ", ".join(map(str, top_states)) or "N/A"

    return f"""
✅ IDENTIFIED OCCUPATION
-----------------------
{occupation}

📊 NSS SOCIO-ECONOMIC INSIGHTS
-----------------------------
• Average Monthly Income     : {income_text}
• Average Monthly Expenditure: {expenditure_text}
• Average Age                : {age_text}
• Most Common Gender         : {common_gender}
• Top States                 : {states_text}

🔍 CONFIDENCE SCORE
------------------
{confidence:.4f}
"""


# ======================================
# 6. GRADIO INTERFACE
# ======================================
# Multi-line input box where the user types the job description.
job_description_box = gr.Textbox(
    placeholder="Enter a job description (e.g., works with software, handles data, manages systems)",
    lines=7,
)

# Output box that displays the text returned by identify_occupation.
result_box = gr.Textbox(label="Occupation Match Result")

# Blurb shown with the app title.
app_description = (
    "Maps free-text job descriptions to standardized NSS occupations "
    "using semantic similarity and provides socio-economic insights."
)

# Wire the matcher function to its input and output components.
interface = gr.Interface(
    title="AI Occupation Identifier using NSS Data",
    description=app_description,
    fn=identify_occupation,
    inputs=job_description_box,
    outputs=result_box,
)


# ======================================
# 7. LAUNCH APP
# ======================================
interface.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2d8543cea36ee4a28b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [7]:
# Sanity check: show the column names of the loaded dataframe so they can be
# compared with the names the matching code indexes (e.g. "Occupation").
print(df.columns)


Index(['Household_ID', 'State', 'District', 'Age', 'Gender', 'Occupation',
       'Monthly_Income', 'Monthly_Expenditure'],
      dtype='object')


In [5]:
# Colab-only cell: prompt for a file upload into the runtime. The recorded
# output shows NSS.csv being saved, which is the file the main cell reads, so
# this cell has to run before the dataset is loaded.
from google.colab import files
# `uploaded` holds whatever files.upload() returns; it is not used elsewhere
# in the visible notebook.
uploaded = files.upload()

Saving NSS.csv to NSS.csv
