## Importing Required Packages

In [32]:
import os

import gradio as gr
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Load job postings from CSV

In [33]:
# Location of the job-postings CSV. Defaults to the original local path; set the
# JOB_POSTINGS_CSV environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = os.environ.get('JOB_POSTINGS_CSV', 'E:/Codes/Python/Project/train.csv')
df = pd.read_csv(DATA_PATH)

## Precompute TF-IDF embeddings

In [34]:
# Learn an English-stop-word-filtered TF-IDF vocabulary from the job
# descriptions and vectorise those same descriptions in one pass.
job_descriptions = df['description']
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(job_descriptions)

## Precompute Doc2Vec embeddings

In [35]:
# Whitespace-tokenise every description once; the same token lists feed both
# model training and vector inference below.
tokenized_descriptions = [text.split() for text in df['description']]

# Tag each token list with its position in the description column.
documents = [
    TaggedDocument(tokens, [position])
    for position, tokens in enumerate(tokenized_descriptions)
]
doc2vec_model = Doc2Vec(documents, vector_size=50, window=2, min_count=1, workers=4)

# Infer one vector per description with the trained model, in column order.
doc2vec_embeddings = [
    doc2vec_model.infer_vector(tokens) for tokens in tokenized_descriptions
]

## Precompute MiniLM embeddings

In [36]:
# Load the 'all-MiniLM-L6-v2' sentence-transformer and encode every job
# description up front so queries only need to encode the query text.
minilm_model = SentenceTransformer('all-MiniLM-L6-v2')
description_texts = df['description'].tolist()
minilm_embeddings = minilm_model.encode(description_texts)



## Function to perform search using TF-IDF (term-based cosine similarity)

In [37]:
def tfidf_search(query, top_n=5):
    """Rank job postings by TF-IDF cosine similarity to ``query``.

    Args:
        query: Free-text search string.
        top_n: Maximum number of postings to return.

    Returns:
        A list of dicts, highest similarity first, each with the keys
        ``title``, ``company``, ``location``, ``description`` and
        ``similarity``.
    """
    query_vector = tfidf_vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    # Attach the scores to a per-call copy of the result columns instead of
    # writing a 'similarity' column onto the shared module-level `df`: that
    # hidden mutation let one search clobber the scores of another.
    scored = df[['title', 'company', 'location', 'description']].assign(similarity=similarities)
    return scored.nlargest(top_n, 'similarity').to_dict(orient='records')

## Function to perform semantic search using Doc2Vec

In [38]:
def doc2vec_search(query, top_n=5):
    """Rank job postings by Doc2Vec cosine similarity to ``query``.

    The query is split on whitespace and a vector is inferred for it with the
    trained Doc2Vec model, then compared with the precomputed description
    vectors.

    Args:
        query: Free-text search string.
        top_n: Maximum number of postings to return.

    Returns:
        A list of dicts, highest similarity first, each with the keys
        ``title``, ``company``, ``location``, ``description`` and
        ``similarity``.
    """
    query_vector = doc2vec_model.infer_vector(query.split())
    similarities = cosine_similarity([query_vector], doc2vec_embeddings).flatten()
    # Attach the scores to a per-call copy of the result columns instead of
    # writing a 'similarity' column onto the shared module-level `df`: that
    # hidden mutation let one search clobber the scores of another.
    scored = df[['title', 'company', 'location', 'description']].assign(similarity=similarities)
    return scored.nlargest(top_n, 'similarity').to_dict(orient='records')

## Function to perform semantic search using MiniLM

In [39]:
def minilm_search(query, top_n=5):
    """Rank job postings by MiniLM embedding cosine similarity to ``query``.

    Args:
        query: Free-text search string.
        top_n: Maximum number of postings to return.

    Returns:
        A list of dicts, highest similarity first, each with the keys
        ``title``, ``company``, ``location``, ``description`` and
        ``similarity``.
    """
    query_embedding = minilm_model.encode([query])
    similarities = cosine_similarity(query_embedding, minilm_embeddings).flatten()
    # Attach the scores to a per-call copy of the result columns instead of
    # writing a 'similarity' column onto the shared module-level `df`: that
    # hidden mutation let one search clobber the scores of another.
    scored = df[['title', 'company', 'location', 'description']].assign(similarity=similarities)
    return scored.nlargest(top_n, 'similarity').to_dict(orient='records')

## Gradio callback: run the selected model and format the results

In [40]:
def search_interface(query, model):
    """Run the chosen search backend and render the hits as Markdown.

    Args:
        query: Search text entered by the user.
        model: ``'TF-IDF'`` or ``'Doc2Vec'`` to select those backends; any
            other value (including ``None``) uses MiniLM.

    Returns:
        A Markdown string with one entry per hit, or a short message when the
        query is blank or nothing was returned.
    """
    # A blank query has no content to match; without this guard the backends
    # would still return rows with meaningless scores.
    if not query or not query.strip():
        return "Please enter a search query."

    # Pick the backend; MiniLM stays the fallback for unrecognised values.
    backends = {'TF-IDF': tfidf_search, 'Doc2Vec': doc2vec_search}
    run_search = backends.get(model, minilm_search)
    results = run_search(query)

    if not results:
        return "No matching job postings found."

    return "\n\n".join(
        f"**{r['title']}** at {r['company']} in {r['location']}\n\n"
        f"{r['description']}\n\n"
        f"Similarity: {r['similarity']:.2f}"
        for r in results
    )

## Gradio UI

In [41]:
# Build the web UI: a query box and a model selector feed search_interface,
# whose Markdown string is rendered as the output.
iface = gr.Interface(
    fn=search_interface,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your search query here..."),
        # Pre-select MiniLM: search_interface uses MiniLM whenever the choice is
        # neither 'TF-IDF' nor 'Doc2Vec', so an unselected radio silently meant
        # MiniLM anyway. Showing it as the default makes that visible.
        gr.Radio(choices=['TF-IDF', 'Doc2Vec', 'MiniLM'], value='MiniLM', label="Choose the model")
    ],
    outputs="markdown",
    title="Job Postings Semantic Search",
    description="Search for job postings using different embedding models: TF-IDF, Doc2Vec, or MiniLM."
)

# Start the local Gradio server for the interface.
iface.launch()

Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.


