In [None]:
!pip install sentence-transformers torch

In [None]:
from sentence_transformers import SentenceTransformer, util

# 1. Instantiate the embedding model from the "intfloat/e5-small-v2" checkpoint
model = SentenceTransformer(model_name_or_path="intfloat/e5-small-v2")

In [None]:
# 2. Toy corpus: four hard-coded sentences standing in for text that would
#    normally be extracted from a PDF
sentences = [
    "Machine learning is a field of artificial intelligence.",
    "Transformers are deep learning models used in NLP.",
    "Sentence embeddings capture semantic meaning of text.",
    "Vector databases are used for similarity search.",
]


In [None]:
# E5 models expect "passage:" and "query:" prefixes: tag every corpus entry as
# a passage here; the query gets its own prefix when it is embedded.
passages = [f"passage: {sentence}" for sentence in sentences]

In [None]:
# 3. Encode the prefixed passages, asking encode() for normalized vectors
passage_embeddings = model.encode(
    passages,
    normalize_embeddings=True,
)

In [None]:
# 4. Embed the search query, tagged with the "query: " prefix and normalized
#    the same way as the passages
query = "How do we represent text meaning numerically?"
query_embedding = model.encode(f"query: {query}", normalize_embeddings=True)

In [None]:
# 5. Cosine similarity of the single query against every passage; the first
#    row of the result holds one score per passage
scores = util.cos_sim(query_embedding, passage_embeddings)[0]

# 6. Report the highest-scoring passage and its score
best_idx = scores.argmax()
print("Query:", query)
print("Most relevant sentence:", sentences[best_idx])
print("Similarity score:", scores[best_idx].item())

## PDF loader

!pip install pypdf langchain install langchain_community

In [None]:
from langchain_community.document_loaders import PyPDFLoader

In [None]:
from sentence_transformers import SentenceTransformer, util

In [None]:
from sentence_transformers import SentenceTransformer, util

# 1. Load PDF
loader = PyPDFLoader("/content/RITIKA KUMARI RESUMEE.pdf")   # <-- your PDF path
docs = loader.load()

# Build the passages from the first five loaded documents, flattening newlines
# to spaces. PyPDFLoader is documented to return one Document per page, so each
# entry is a whole page of text, not a single sentence.
# NOTE(review): long pages may exceed the model's maximum input length and be
# truncated by the encoder — check model.max_seq_length before relying on this.
sentences = [
    text
    for text in (doc.page_content.strip().replace("\n", " ") for doc in docs[:5])
    if text  # skip documents with no extractable text (e.g. scanned pages)
]
if not sentences:
    # Fail early with a clear message instead of crashing later in argmax()
    # on an empty score tensor.
    raise ValueError("No extractable text found in the PDF; nothing to embed.")

# 2. Load embedding model
model = SentenceTransformer("intfloat/e5-small-v2")

# E5 requires prefixes: "passage: " for corpus entries, "query: " for the query
passages = [f"passage: {s}" for s in sentences]

# 3. Embed the PDF passages (normalized vectors)
passage_embeddings = model.encode(
    passages,
    normalize_embeddings=True
)

# 4. Query
query = "hey how are u?"
query_embedding = model.encode(
    f"query: {query}",
    normalize_embeddings=True
)

# 5. Similarity search: one cosine score per passage for the single query
scores = util.cos_sim(query_embedding, passage_embeddings)[0]
best_idx = int(scores.argmax())  # plain int so it is an ordinary list index

# 6. Result
print("Query:", query)
print("\nMost relevant sentence:")
print(sentences[best_idx])
print("\nSimilarity score:", float(scores[best_idx]))


## Increasing the document size

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from sentence_transformers import SentenceTransformer, util

# A line must be longer than this many characters (after stripping) to be kept
# as a passage; shorter lines are dropped.
MIN_LINE_CHARS = 30

# Load PDF
loader = PyPDFLoader("/content/RITIKA KUMARI RESUMEE.pdf")
docs = loader.load()

# Line-level chunks: split every loaded document on newlines, strip each line,
# and keep only the lines longer than MIN_LINE_CHARS.
sentences = [
    line
    for doc in docs
    for line in map(str.strip, doc.page_content.split("\n"))
    if len(line) > MIN_LINE_CHARS
]
if not sentences:
    # Fail early with a clear message instead of crashing later in argmax()
    # on an empty score tensor.
    raise ValueError(
        f"No line longer than {MIN_LINE_CHARS} characters was extracted "
        "from the PDF; nothing to embed."
    )

# Load model
model = SentenceTransformer("intfloat/e5-small-v2")

# Corpus entries carry the "passage: " prefix, the query the "query: " prefix
passages = [f"passage: {s}" for s in sentences]
passage_embeddings = model.encode(passages, normalize_embeddings=True)

# Query (resume-relevant)
query = "• Benchmarked in-house TTS and ASR models against state-of-the-art baselines, designing custom metrics for speech quality, intelligibility, and accuracy."
query_embedding = model.encode(f"query: {query}", normalize_embeddings=True)

# One cosine score per passage for the single query
scores = util.cos_sim(query_embedding, passage_embeddings)[0]
best_idx = int(scores.argmax())  # plain int so it is an ordinary list index

print("Query:", query)
print("\nBest match:")
print(sentences[best_idx])
print("Score:", float(scores[best_idx]))
