In [10]:
!pip install sentence_transformers scikit-learn faiss-cpu gradio pandas



In [11]:
import pandas as pd
data = [
    {"query": "What is a patent?", "answer": "A patent is an exclusive right granted for an invention, which allows the inventor to exclude others from making, using, or selling the invention for a certain period of time."},
{"query": "How long does a patent last?", "answer": "A utility patent typically lasts for 20 years from the filing date, subject to maintenance fees."},
{"query": "What is the difference between a patent and a copyright?", "answer": "A patent protects inventions, while a copyright protects original works of authorship such as books, music, and software."},
{"query": "What are the types of patents?", "answer": "The three main types of patents are utility patents, design patents, and plant patents."},
{"query": "How do you apply for a patent?", "answer": "To apply for a patent, you must file a patent application with the relevant patent office, such as the USPTO or WIPO, including claims, descriptions, and drawings."},
{"query": "What is a prior art search?", "answer": "A prior art search is an investigation to determine if an invention is novel by looking for existing patents, publications, or products that are similar."},
{"query": "What does 'patent pending' mean?", "answer": "'Patent pending' means that a patent application has been filed, but the patent has not yet been granted."},
{"query": "Can software be patented?", "answer": "Yes, software can be patented if it produces a concrete, useful, and tangible result and meets other patentability criteria."},
{"query": "What is a provisional patent application?", "answer": "A provisional patent application is a lower-cost, informal patent application that allows inventors to secure a filing date before submitting a formal utility application."},
{"query": "What is patent infringement?", "answer": "Patent infringement occurs when someone makes, uses, sells, or imports a patented invention without the permission of the patent holder."}

]
df = pd.DataFrame(data)
df.head()

Unnamed: 0,query,answer
0,What is a patent?,A patent is an exclusive right granted for an ...
1,How long does a patent last?,A utility patent typically lasts for 20 years ...
2,What is the difference between a patent and a ...,"A patent protects inventions, while a copyrigh..."
3,What are the types of patents?,The three main types of patents are utility pa...
4,How do you apply for a patent?,"To apply for a patent, you must file a patent ..."


In [12]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [13]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [14]:
doc_embeddings = model.encode(df['answer'].tolist(), show_progress_bar=True)
dimension = doc_embeddings.shape[1]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
dimension

384

In [16]:
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

In [17]:
def semantic_search(query, top_k=3):
  query_embedding = model.encode([query])
  D, I = index.search(np.array(query_embedding), k=top_k)
  return df.iloc[I[0]]['answer']

In [20]:
semantic_search("Tell me about patent?")

Unnamed: 0,answer
0,A patent is an exclusive right granted for an ...
2,"A patent protects inventions, while a copyrigh..."
9,"Patent infringement occurs when someone makes,..."


In [21]:
#fine-tuned model
from sentence_transformers import InputExample, losses
from torch.utils.data import DataLoader

In [23]:
train_examples = [
    InputExample(texts=['What is a patent?', 'A patent is an exclusive right granted for an invention, which allows the inventor to exclude others from making, using, or selling the invention for a certain period of time.'], label=1),
    InputExample(texts=['How long does a patent last?', 'Photosynthesis is a natural process'], label=0),
    InputExample(texts=['What is the difference between a patent and a copyright?', 'A patent protects inventions, while a copyright protects original works of authorship such as books, music, and software.'], label=1),
    InputExample(texts=['What are the types of patents?', 'Capital of India is New Delhi'], label=0),
]

In [24]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
train_loss = losses.CosineSimilarityLoss(model)

In [27]:
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, show_progress_bar=True)
model.save('fine-tuned-legal-sbert')

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msrinidhi762005[0m ([33msrinidhi762005-vishwakarma-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


In [28]:
model = SentenceTransformer("fine-tuned-legal-sbert")

In [29]:
doc_embeddings = model.encode(df['answer'].tolist(), show_progress_bar=True)
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(doc_embeddings)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
semantic_search("What to do after my patent expired?")

Unnamed: 0,answer
0,A patent is an exclusive right granted for an ...
6,'Patent pending' means that a patent applicati...
1,A utility patent typically lasts for 20 years ...


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['answer'].tolist())

def get_tfidf_score(query):
  query_tfidf = vectorizer.transform([query])
  return tfidf_matrix.dot(query_tfidf.T).toarray().flatten()

In [35]:
def hybrid_search(query, alpha=0.5, top_k=3):
  query_embedding = model.encode([query])
  D, I = index.search(np.array(query_embedding), k=top_k)

  sem_scores = [1 - D[0][i] for i in range(top_k)]
  keyword_scores = get_tfidf_score(query)

  results = []
  for rank, idx in enumerate(I[0]):
    final_score = alpha * sem_scores[rank] + (1-alpha) * keyword_scores[idx]
    results.append((df.iloc[idx]['answer'], final_score))

  return sorted(results, key=lambda x: x[1], reverse=True)

In [36]:
for answer, score in hybrid_search("What to do after my patent expired?"):
  print(f"{round(score, 3)} -> {answer}")

0.052 -> A utility patent typically lasts for 20 years from the filing date, subject to maintenance fees.
0.05 -> 'Patent pending' means that a patent application has been filed, but the patent has not yet been granted.
0.041 -> A patent is an exclusive right granted for an invention, which allows the inventor to exclude others from making, using, or selling the invention for a certain period of time.
