<a href="https://colab.research.google.com/github/NithickRoshan/githupbemc2/blob/main/Similarity_Search_Model_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# 1. Install Dependencies

!pip install PyPDF2 sentence-transformers spacy transformers
!python -m spacy download en_core_web_sm

# 2. Import Libraries

import spacy
import PyPDF2
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline


# 3. Load Models
# These module-level objects are created once and shared by the functions below.

# spaCy English pipeline; preprocess_text() uses it for lemmas, stop-word flags and NER.
nlp = spacy.load("en_core_web_sm")   # spaCy for preprocessing + NER
# Sentence-BERT encoder; get_embedding() and check_similarity() call its encode().
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")  # SBERT for embeddings
# Hugging Face feature-extraction pipeline backed by bert-base-uncased.
# NOTE(review): not referenced anywhere in the visible code — confirm it is
# still needed before paying for the model download.
bert_pipeline = pipeline("feature-extraction", model="bert-base-uncased")

# 4. Function: Extract text from PDF

def extract_pdf_text(file_path):
    """Return the text of a PDF, one newline-terminated chunk per page.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text of every page that yields non-empty text, in page
        order, each followed by a newline. Pages with no extractable text are
        skipped; the result is an empty string if no page yields text.

    Raises:
        OSError: If the file cannot be opened.
    """
    page_chunks = []
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            # Extraction is the expensive step, so run it once per page and
            # reuse the result for both the emptiness check and the output.
            page_text = page.extract_text()
            if page_text:
                page_chunks.append(page_text + "\n")
    # Join once at the end instead of growing a string with repeated +=.
    return "".join(page_chunks)


# 5. Function: Preprocess with spaCy

def preprocess_text(text):
    """Clean text with spaCy and list the named entities it contains.

    The input is lowercased and passed through the module-level ``nlp``
    pipeline. Tokens that are alphabetic and not stop words are kept and
    replaced by their lemma.

    Args:
        text: Raw input string.

    Returns:
        A ``(cleaned, entities)`` tuple. ``cleaned`` is the kept lemmas joined
        by single spaces; ``entities`` is a list of ``(entity_text, label)``
        pairs, one per entity found in the parsed document.
    """
    # NOTE(review): entities are detected on the lowercased text, which may
    # reduce recognition quality for case-sensitive models — confirm intent.
    parsed = nlp(text.lower())

    lemmas = []
    for tok in parsed:
        if tok.is_alpha and not tok.is_stop:
            lemmas.append(tok.lemma_)

    found_entities = [(span.text, span.label_) for span in parsed.ents]
    cleaned = " ".join(lemmas)
    return cleaned, found_entities

# 6. Function: Get embeddings with SBERT

def get_embedding(text):
    """Encode ``text`` with the module-level SBERT model.

    Args:
        text: Input forwarded unchanged to ``sbert_model.encode``.

    Returns:
        The encoder output, requested as a tensor (``convert_to_tensor=True``).
    """
    embedding = sbert_model.encode(text, convert_to_tensor=True)
    return embedding

# 7. Function: Compare similarity

def check_similarity(new_text, past_texts):
    """Score how similar ``new_text`` is to each entry of ``past_texts``.

    Both sides are embedded with ``get_embedding`` and compared with
    ``util.cos_sim``.

    Args:
        new_text: The text to compare.
        past_texts: The texts to compare against (typically a list of strings).

    Returns:
        A list of similarity scores, one per entry of ``past_texts`` and in
        the same order. An empty list when ``past_texts`` is an empty
        collection.
    """
    # Nothing to compare against: return early rather than handing an empty
    # batch to the encoder and cos_sim. A plain string is left alone so a
    # single past text passed as a str keeps its previous behaviour.
    if not isinstance(past_texts, str) and len(past_texts) == 0:
        return []

    new_emb = get_embedding(new_text)
    # Use the same helper for both sides so they are always encoded alike.
    past_embs = get_embedding(past_texts)
    scores = util.cos_sim(new_emb, past_embs)
    # cos_sim returns a matrix; the first row holds new_text vs. every past text.
    return scores.cpu().tolist()[0]

# 8. Run Example with Uploaded PDF

# Location of the uploaded PDF inside the Colab runtime.
file_path = "/content/Business Proposals.pdf"
pdf_text = extract_pdf_text(file_path)

# For the MVP, the opening 800 characters of the PDF stand in for the
# "new proposal", labelled as an executive summary.
summary_excerpt = pdf_text[:800]
new_proposal = "Executive Summary: " + summary_excerpt

# Run the spaCy preprocessing and show a short preview of its output.
processed_text, entities = preprocess_text(new_proposal)
preview = processed_text[:200]
print(" Preprocessed Text:", preview, "...")
print(" Named Entities:", entities)

# Reference set of earlier proposals used for the semantic comparison.
past_proposals = [
    "AI-driven loan approval system for banks.",
    "IoT-based smart agriculture monitoring system.",
    "Blockchain-enabled healthcare data sharing.",
    "Mobile app for waste management in smart cities.",
    "Executive summary: Our Hi-Tech Rovers help solve homelessness with mobile support vehicles.",
]

# Score the new proposal against every past proposal; the raw
# (un-preprocessed) proposal text is what gets embedded here.
similarities = check_similarity(new_proposal, past_proposals)

# Report the scores with 1-based proposal numbers.
print("\n Similarity Scores:")
for i, score in enumerate(similarities):
    print(f"Proposal {i+1}: {score:.4f}")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 105.2 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Device set to use cpu


 Preprocessed Text: executive summary san josé state university writing center write andrew davis business proposal spring business proposal business proposal document send potential customer persuade business business p ...
 Named Entities: [('san josé', 'GPE'), ('state university writing center', 'ORG'), ('andrew davis', 'PERSON'), ('spring 2022', 'DATE'), ('1', 'CARDINAL'), ('6', 'CARDINAL')]

 Similarity Scores:
Proposal 1: 0.2073
Proposal 2: -0.0035
Proposal 3: 0.0144
Proposal 4: 0.1149
Proposal 5: 0.0939
