In [1]:
pip install pandas numpy faiss-cpu sentence-transformers

Note: you may need to restart the kernel to use updated packages.


# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Load LLM model

In [3]:
# Sentence-embedding model; later cells call model.encode(...) on patent
# abstracts, TRIZ entries and user queries.
model = SentenceTransformer("patent/sbert-all-MiniLM-L6-v2")

# Load patent dataset

In [4]:
# Read the patent spreadsheet; later cells use its "patent_id" and
# "patent_abstract" columns.
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists.
patent_df = pd.read_excel(r"C:\Users\israh\Desktop\Masters\THESIS\code\Patent_database\patent500.xlsx")
# Preview the first rows.
print(patent_df.head())

   patent_id                                    patent_abstract
0   10000000  A frequency modulated (coherent) laser detecti...
1   10000001  The injection molding machine includes a fixed...
2   10000002  The present invention relates to: a method for...
3   10000003  The invention relates to a method for producin...
4   10000004  The present invention relates to provides a do...


# Load TRIZ dataset

In [5]:
# Read the TRIZ effects table; later cells combine its tag, input_*, result
# and description columns into one text per row.
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists.
triz_df = pd.read_csv(r"C:\Users\israh\Desktop\Masters\THESIS\code\TRIZ_effects_database\combined_triz_dataset.csv")
# Preview the first rows.
print(triz_df.head())

        tag input_1_name input_1_value input_2_name  input_2_value  \
0  Function       action        Absorb       object  Divided Solid   
1  Function       action        Absorb       object  Divided Solid   
2  Function       action        Absorb       object  Divided Solid   
3  Function       action        Absorb       object  Divided Solid   
4  Function       action        Absorb       object  Divided Solid   

              result                                        description  
0  Activated Alumina  A manufactured form of alumina (aluminium oxid...  
1   Activated Carbon  (or activated charcoal, activated coal, carbo ...  
2           Adhesive  A compound that adheres or bonds two items tog...  
3           Aerogels  A manufactured material with the lowest bulk d...  
4        Amphiphiles  A chemical compound possessing both hydrophili...  


# Embed patent + Build FAISS Index

In [6]:
print("Embedding patent database...")
# Blank out missing abstracts (as the TRIZ cell does for its columns) so that a
# NaN cell cannot reach the encoder as a float.
patent_texts = patent_df["patent_abstract"].fillna('').astype(str).tolist()
patent_embeddings = model.encode(patent_texts, convert_to_numpy=True)
# FAISS takes C-contiguous float32 matrices; this is a no-op when the encoder
# already returns that layout.
patent_embeddings = np.ascontiguousarray(patent_embeddings, dtype="float32")
print("Embedding patent database: done")

# Build FAISS index for patents: exact (brute-force) L2 search, one vector per
# row of patent_df, so index ids are row positions in patent_df.
print("Building FAISS index for patents...")
patent_dim = patent_embeddings.shape[1]
patent_index = faiss.IndexFlatL2(patent_dim)
patent_index.add(patent_embeddings)

print("Building FAISS index: done")

Embedding patent database...
Embedding patent database: done
Building FAISS index for patents...
Building FAISS index: done


# View embeddings in a table

In [10]:
# Wrap the embedding matrix in a DataFrame (one column per embedding
# dimension) and attach the patent id and abstract next to each row.
embedding_df = pd.DataFrame(patent_embeddings).assign(
    patent_id=patent_df['patent_id'],
    abstract=patent_df['patent_abstract'],
)

# Preview the first three rows
embedding_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,376,377,378,379,380,381,382,383,patent_id,abstract
0,-0.106439,0.231193,0.028058,-0.023522,-0.426877,0.121179,-0.106983,0.105149,-0.267921,0.125781,...,-0.04963,-0.288474,0.069247,-0.018643,0.225912,-0.039725,-0.074736,0.014816,10000000,A frequency modulated (coherent) laser detecti...
1,-0.33123,0.331754,0.000149,0.156075,-0.423781,0.225393,0.040836,0.230348,-0.485862,0.271967,...,-0.032981,-0.257741,0.033413,-0.131023,0.12083,0.136021,-0.052865,0.087452,10000001,The injection molding machine includes a fixed...
2,-0.393579,0.277757,0.115055,-0.034416,-0.423899,0.083583,-0.071407,0.384509,-0.444063,0.316631,...,0.01339,-0.328412,0.125423,0.069547,0.270139,0.030074,0.081338,0.256067,10000002,The present invention relates to: a method for...


# Embed TRIZ + Build FAISS Index

In [13]:
print("Embedding TRIZ effects...")

# Concatenate the descriptive columns of every TRIZ row into one
# space-separated string; missing cells contribute an empty string.
triz_text_columns = [
    "input_1_name", "input_1_value",
    "input_2_name", "input_2_value",
    "result", "description", "tag",
]
triz_text_series = triz_df[triz_text_columns[0]].fillna('')
for column_name in triz_text_columns[1:]:
    triz_text_series = triz_text_series + " " + triz_df[column_name].fillna('')
triz_texts = triz_text_series.tolist()

triz_embeddings = model.encode(triz_texts, convert_to_numpy=True)
print("Embedding TRIZ database: done")

# Build FAISS index for TRIZ (exact L2 search over the TRIZ embeddings)
print("Building FAISS index for TRIZ...")
triz_dim = triz_embeddings.shape[1]
triz_index = faiss.IndexFlatL2(triz_dim)
triz_index.add(triz_embeddings)

print("Building FAISS index: done")

Embedding TRIZ effects...
Embedding TRIZ database: done
Building FAISS index for TRIZ...
Building FAISS index: done


# TRIZ Functions

In [18]:
def expand_query_with_triz(query, model, triz_index, triz_df, top_k=1):
    """Expand the user query with the text of its nearest TRIZ entries.

    Parameters
    ----------
    query : str
        Free-text search query.
    model : object
        Encoder exposing ``encode(list_of_str, convert_to_numpy=True)``.
    triz_index : object
        Index exposing ``search(embeddings, k)`` that returns
        ``(distances, indices)``; the returned ids are used as row positions
        in ``triz_df``.
    triz_df : pandas.DataFrame
        TRIZ table; the columns ``input_1_name``, ``input_1_value``,
        ``input_2_name``, ``input_2_value``, ``result``, ``description`` and
        ``tag`` are read when present.
    top_k : int, default 1
        Number of nearest TRIZ entries whose text is appended to the query.

    Returns
    -------
    str
        ``query`` followed by the space-joined text of each matched entry, or
        ``query`` unchanged when the index returns no usable match.
    """
    triz_fields = (
        "input_1_name", "input_1_value",
        "input_2_name", "input_2_value",
        "result", "description", "tag",
    )
    k = max(int(top_k), 1)

    query_embedding = model.encode([query], convert_to_numpy=True)
    _, indices = triz_index.search(query_embedding, k)

    descriptions = []
    for row_position in indices[0][:k]:
        row_position = int(row_position)
        # FAISS pads with -1 when it finds fewer than k neighbours; iloc would
        # silently read that as "last row", so skip anything out of range.
        if row_position < 0 or row_position >= len(triz_df):
            continue
        triz_row = triz_df.iloc[row_position]
        # Missing cells become empty strings (not the literal "nan"), matching
        # how the TRIZ texts were built before they were embedded.
        cell_values = (triz_row.get(field, "") for field in triz_fields)
        descriptions.append(
            " ".join("" if pd.isna(value) else str(value) for value in cell_values)
        )

    # Expand the query
    enhanced_query = " ".join([query] + descriptions)
    print(f"\nExpanded Query with TRIZ:\n{enhanced_query}\n")
    return enhanced_query


def search_patents_with_triz(query, top_k=5, return_expanded_query=False):
    """Search patents using the TRIZ-expanded form of ``query``.

    The query is first expanded with its single closest TRIZ entry and the
    expanded text is then passed to ``search_patents_no_triz``.

    Parameters
    ----------
    query : str
        Original user query.
    top_k : int, default 5
        Number of patents requested from the patent search.
    return_expanded_query : bool, default False
        When true, also return the expanded query string.

    Returns
    -------
    The search results, or ``(results, expanded_query)`` when
    ``return_expanded_query`` is true.
    """
    expanded = expand_query_with_triz(query, model, triz_index, triz_df, top_k=1)
    matches = search_patents_no_triz(expanded, top_k=top_k)
    return (matches, expanded) if return_expanded_query else matches

# Normal Patent Search (No TRIZ)

In [16]:
def search_patents_no_triz(query, top_k=5):
    """Search patents without TRIZ expansion.

    Encodes ``query`` with the notebook-level ``model`` and looks up its
    nearest neighbours in ``patent_index``; the returned ids are used as row
    positions in ``patent_df``.

    Parameters
    ----------
    query : str
        Text to search for.
    top_k : int, default 5
        Maximum number of patents to return.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding the ``patent_id`` and ``patent_abstract``
        columns of the matched rows, in the order returned by the index. It
        may hold fewer than ``top_k`` rows, or none.
    """
    result_columns = ["patent_id", "patent_abstract"]

    # Never request more neighbours than the index holds.
    available = getattr(patent_index, "ntotal", top_k)
    k = min(int(top_k), int(available))
    if k <= 0:
        return patent_df.iloc[[]][result_columns].copy()

    query_embedding = model.encode([query], convert_to_numpy=True)
    _, indices = patent_index.search(query_embedding, k)

    # FAISS pads missing neighbours with -1, which iloc would read as "last
    # row"; keep only positions that really exist in patent_df.
    hit_positions = [int(i) for i in indices[0] if 0 <= int(i) < len(patent_df)]

    # Copy so callers can add columns without writing into (or triggering
    # SettingWithCopyWarning on) a slice of patent_df.
    return patent_df.iloc[hit_positions][result_columns].copy()


# Query here

In [20]:
import os

output_file = "combined_patent_search_results_rows.csv"
output_columns = ["query", "source", "expanded_query", "patent_id", "patent_abstract"]

# Interactive loop: for every query, run the plain search and the
# TRIZ-expanded search, then append both result sets to output_file.
while True:
    user_query = input("\nEnter your patent search query (or type 'exit' to quit): ").strip()
    if user_query.lower() == 'exit':
        print("Exiting search.")
        break
    if not user_query:
        # A blank line would otherwise be embedded, searched and written to
        # the CSV as rows with an empty query.
        print("Empty query ignored.")
        continue

    print("\n--- Searching without TRIZ expansion ---")
    normal_results = search_patents_no_triz(user_query, top_k=5)
    print(normal_results)

    print("\n--- Searching WITH TRIZ expansion ---")
    triz_results, triz_expanded_query = search_patents_with_triz(user_query, top_k=5, return_expanded_query=True)
    print(triz_results)

    # Add metadata columns. assign() builds new frames, so nothing is written
    # into a slice of patent_df (no SettingWithCopyWarning).
    normal_results = normal_results.assign(
        query=user_query,
        source="LLM",
        expanded_query=user_query,
    )
    triz_results = triz_results.assign(
        query=user_query,
        source="TRIZ",
        expanded_query=triz_expanded_query,
    )

    # Combine and reorder
    combined = pd.concat([normal_results, triz_results], ignore_index=True)[output_columns]

    # Append to CSV; write the header only when the file is new or still empty.
    write_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    combined.to_csv(output_file, mode='a', header=write_header, index=False)

    print(f"Results for '{user_query}' saved to '{output_file}'.")



Enter your patent search query (or type 'exit' to quit): 3-D printing system

--- Searching without TRIZ expansion ---
     patent_id                                    patent_abstract
54    10000056  A screen printing apparatus includes a pair of...
10    10000010  3-D printing system include development statio...
349   10000353  A sheet flipping device includes a receiving m...
22    10000024  An apparatus for controlling 3D printing inclu...
320   10000324  In a packing bag to store a stored article Z w...

--- Searching WITH TRIZ expansion ---

Expanded Query with TRIZ:
3-D printing system operation Measure parameter Shape 3D Printing The process of creating three dimensional objects from digital data using a materials printer, in a manner similar to printing images on paper. The term is most closely associated with additive manufacturing technology, where an object is created by laying down successive layers of material. Parameter

     patent_id                                  


Enter your patent search query (or type 'exit' to quit): power supply wire

--- Searching without TRIZ expansion ---
     patent_id                                    patent_abstract
164   10000167  A wire harness includes a cable unit, and an i...
126   10000129  A vehicle includes an engine, a traction batte...
292   10000296  An example aircraft electrical system includes...
361   10000365  An improved pulley for winch is described, in ...
281   10000285  Described herein are methods and systems for d...

--- Searching WITH TRIZ expansion ---

Expanded Query with TRIZ:
power supply wire operation Measure parameter Power Ampère's Force Law Ampere's Force Law decribes the force of attraction or repulsion between two current-carrying wires. Parameter

     patent_id                                    patent_abstract
278   10000282  An aircraft landing gear assembly includes a m...
223   10000227  A cylindrical buckling prevention member is fi...
97    10000100  Systems and methods of 

# Semantic Similarity
Compute the semantic similarity score (cosine similarity of the embeddings) between each query and the abstract of each retrieved patent.

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the saved search results written by the query loop
file_path = "combined_patent_search_results_rows.csv"
df = pd.read_csv(file_path)

# Load the sentence-embedding model used to score query/abstract pairs.
# NOTE(review): this was originally labelled "PatentBERT"; the checkpoint name
# reads as an SBERT MiniLM model - confirm which model is intended.
model = SentenceTransformer("patent/sbert-all-MiniLM-L6-v2")

def compute_similarity_from_csv(df, model):
    """Compute the query/abstract cosine similarity for each row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``query`` and ``patent_abstract``. Missing
        values in either column are treated as empty strings.
    model : object
        Encoder exposing ``encode(list_of_str, convert_to_numpy=True)`` that
        returns one vector per input string.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with an added
        ``semantic_similarity`` column. Pairs where either vector has zero
        norm get a similarity of 0.
    """
    import numpy as np  # local import: this cell's own imports do not include numpy

    # Cells that were empty in the CSV come back as NaN (a float), which the
    # encoder cannot take; normalise everything to str first.
    queries = df["query"].fillna("").astype(str).tolist()
    abstracts = df["patent_abstract"].fillna("").astype(str).tolist()

    if not queries:
        df["semantic_similarity"] = np.empty(0, dtype="float32")
        return df

    # Get query and abstract embeddings
    query_embeddings = np.asarray(model.encode(queries, convert_to_numpy=True))
    abstract_embeddings = np.asarray(model.encode(abstracts, convert_to_numpy=True))

    # Row-wise cosine similarity in one vectorised pass:
    # dot(q_i, a_i) / (|q_i| * |a_i|).
    dots = np.einsum("ij,ij->i", query_embeddings, abstract_embeddings)
    norms = (
        np.linalg.norm(query_embeddings, axis=1)
        * np.linalg.norm(abstract_embeddings, axis=1)
    )
    similarities = np.divide(
        dots,
        norms,
        out=np.zeros(dots.shape, dtype=np.result_type(dots, np.float32)),
        where=norms > 0,  # zero-norm pairs keep the 0 written by `out`
    )

    df["semantic_similarity"] = similarities
    return df

# Score every saved result row, write the scored table to CSV, and preview it.
scored_df = compute_similarity_from_csv(df, model)
scored_df.to_csv("scored_patent_search_results.csv", index=False)
scored_df.head()


Unnamed: 0,query,source,expanded_query,patent_id,patent_abstract,semantic_similarity
0,3-D printing system,LLM,3-D printing system,10000056,A screen printing apparatus includes a pair of...,0.69497
1,3-D printing system,LLM,3-D printing system,10000010,3-D printing system include development statio...,0.686107
2,3-D printing system,LLM,3-D printing system,10000353,A sheet flipping device includes a receiving m...,0.670232
3,3-D printing system,LLM,3-D printing system,10000024,An apparatus for controlling 3D printing inclu...,0.659352
4,3-D printing system,LLM,3-D printing system,10000324,In a packing bag to store a stored article Z w...,0.637138
