In [None]:
import pandas as pd

In [7]:
from elasticsearch import Elasticsearch

# Initialize Elasticsearch client
client = Elasticsearch(
    hosts=f"https://es2.logic-mill.net:443",
    basic_auth=("elastic", "MTqCwEli2DWAJTFGKwIa"),
    retry_on_status=[500, 501, 502, 503, 504, 524],
    retry_on_timeout=True,
    request_timeout=60,
)

In [10]:
mapping = client.indices.get_mapping(index="docdb_cos")
print(mapping)

{'docdb_cos': {'mappings': {'properties': {'abstract': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'applicants': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'authority': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'cpcClasses': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'docDbId': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'docNumber': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'embedding': {'type': 'dense_vector', 'dims': 768, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}, 'embedding_model': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'encoded_at': {'type': 'date'}, 'familyID': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above

In [18]:
print(mapping['docdb_cos']['mappings']['properties'].keys())
#so we see we need publication date which is stored in the key publicationDate

dict_keys(['abstract', 'applicants', 'authority', 'cpcClasses', 'docDbId', 'docNumber', 'embedding', 'embedding_model', 'encoded_at', 'familyID', 'id', 'inventors', 'kind', 'publicationDate', 'title'])


In [19]:
# Fetch the embedding for the given patent ID
res = client.get(
    index='docdb_cos',
    id="EP0574583A1",
)
embedding = res["_source"]["embedding"]
k = 10
target_date = res["_source"]["publicationDate"]  #publication date of the focal patent

# Construct the search query with KNN and date filter
query = {
    "size": k,
    "query": {
        "bool": {
            "must": {
                "knn": {
                    "field": "embedding",
                    "num_candidates": min(k * 2, 10_000),
                    "query_vector": embedding
                }
            },
            "filter": {
                "range": {
                    "publicationDate": {
                        "lte": target_date  # Only include patents before this date
                    }
                }
            }
        }
    },
    "_source": False
}

# Perform the search
search = client.search(
    index="docdb_cos",
    body=query
)
# Print the search results
print(search["hits"]["hits"])
print(len(search["hits"]["hits"]))
result=search["hits"]["hits"]

[{'_index': 'docdb_cos', '_id': 'EP0574583A1', '_score': 0.9991764, '_ignored': ['abstract.keyword']}, {'_index': 'docdb_cos', '_id': 'WO9312941A1', '_score': 0.9973502, '_ignored': ['abstract.keyword']}, {'_index': 'docdb_cos', '_id': 'EP0439049A1', '_score': 0.98186874, '_ignored': ['abstract.keyword']}, {'_index': 'docdb_cos', '_id': 'EP0121935A2', '_score': 0.9772881, '_ignored': ['abstract.keyword']}, {'_index': 'docdb_cos', '_id': 'US4599288A', '_score': 0.9769945, '_ignored': ['abstract.keyword']}, {'_index': 'docdb_cos', '_id': 'EP0327897A2', '_score': 0.9763912, '_ignored': ['abstract.keyword']}, {'_index': 'docdb_cos', '_id': 'US4705719A', '_score': 0.97617257, '_ignored': ['abstract.keyword']}, {'_index': 'docdb_cos', '_id': 'US4522906A', '_score': 0.9761052, '_ignored': ['abstract.keyword']}, {'_index': 'docdb_cos', '_id': 'US4853256A', '_score': 0.9758661, '_ignored': ['abstract.keyword']}, {'_index': 'docdb_cos', '_id': 'US3865595A', '_score': 0.975675, '_ignored': ['abst

In [20]:
similar_patents_before_pubdate=[]
for i in result:
    similar_patents_before_pubdate.append(i['_id'])

In [21]:
similar_patents_before_pubdate

['EP0574583A1',
 'WO9312941A1',
 'EP0439049A1',
 'EP0121935A2',
 'US4599288A',
 'EP0327897A2',
 'US4705719A',
 'US4522906A',
 'US4853256A',
 'US3865595A']

In [22]:
#printing publication date of focal patent
res = client.get(
        index='docdb_cos',
        id="EP0574583A1",
    )
print(res["_source"]["publicationDate"])

1993-12-22T00:00:00Z


In [23]:
#printing publication date of all the similar patents (note they should be before 1993-12-22T00:00:00Z)
for i in similar_patents_before_pubdate:
    res = client.get(
        index='docdb_cos',
        id=i,
    )
    print(res["_source"]["publicationDate"])

1993-12-22T00:00:00Z
1993-07-08T00:00:00Z
1991-07-31T00:00:00Z
1984-10-17T00:00:00Z
1986-07-08T00:00:00Z
1989-08-16T00:00:00Z
1987-11-10T00:00:00Z
1985-06-11T00:00:00Z
1989-08-01T00:00:00Z
1975-02-11T00:00:00Z


In [24]:
#performing normal elastic search without considering publication date and see if the results are different(just to double check that our approach is working)

res = client.get(
    index='docdb_cos',
    id="EP0574583A1",
    _source=["embedding"],
)
embedding = res["_source"]["embedding"]
print(embedding)
k = 10
query = {
        "field": "embedding",
        "num_candidates": min(k * 2, 10_000),
        "query_vector": embedding
    }
search = client.search(
    index="docdb_cos",
    knn=query,
    source=False,
    size=k
)
result2=search["hits"]["hits"]
print(search["hits"]["hits"])


[0.9488954544067383, 0.3310294449329376, -0.2039463073015213, 1.3473436832427979, -0.49000170826911926, -0.8459027409553528, 0.9959840774536133, 0.012898440472781658, 0.23584337532520294, 0.9498447179794312, 0.32202303409576416, -0.3112526535987854, 0.026017388328909874, -1.094834566116333, -0.6812632083892822, -0.6021229028701782, -0.2808898687362671, 0.35133543610572815, 0.3486179709434509, -0.40024253726005554, 0.7051123380661011, 0.7272440195083618, -0.5039498209953308, 0.5396905541419983, 0.9435885548591614, 1.3055225610733032, -0.3592785596847534, 0.8505787253379822, -0.47281280159950256, 1.1017558574676514, -0.03214563801884651, -0.9644632935523987, 0.41567152738571167, 0.1856817901134491, -1.0777734518051147, 0.37132537364959717, 0.3344060182571411, -0.42137670516967773, -0.3382238447666168, 0.6275976300239563, -0.5520192980766296, 0.029307467862963676, 1.1798095703125, -0.8378037810325623, -0.8652032613754272, 0.8668477535247803, -1.352146863937378, 0.3279002010822296, -0.0091

In [25]:
similar_patents=[]
for i in result2:
    similar_patents.append(i["_id"])

In [26]:
similar_patents

['EP0574583A1',
 'WO9312941A1',
 'EP0439049A1',
 'EP2465692A1',
 'US4427754A',
 'EP2409851A2',
 'EP1813434A1',
 'US2008152847A1',
 'US7517833B2',
 'US2010291326A1']

In [27]:
#printing publication date of all the similar patents (note that here they can be after 1993-12-22T00:00:00Z)
for i in similar_patents:
    res = client.get(
        index='docdb_cos',
        id=i,
    )
    print(res["_source"]["publicationDate"])

1993-12-22T00:00:00Z
1993-07-08T00:00:00Z
1991-07-31T00:00:00Z
2012-06-20T00:00:00Z


KeyError: 'publicationDate'

In [96]:
#finding out why the program gave error for id US4427754A (because no publicationDate key (see output))
res = client.get(
        index='docdb_cos',
        id='US4427754A',
    )
print(res["_source"].keys())

dict_keys(['title', 'abstract', 'embedding', 'id', 'embedding_model', 'encoded_at'])


In [34]:
#printing the result together and we can see that the first difference is in the fourth entry where we first found publication date to be greater
similar_patents,similar_patents_before_pubdate

(['EP0574583A1',
  'WO9312941A1',
  'EP0439049A1',
  'EP2465692A1',
  'US4427754A',
  'EP2409851A2',
  'EP1813434A1',
  'US2008152847A1',
  'US7517833B2',
  'US2010291326A1'],
 ['EP0574583A1',
  'WO9312941A1',
  'EP0439049A1',
  'EP0121935A2',
  'US4599288A',
  'EP0327897A2',
  'US4705719A',
  'US4522906A',
  'US4853256A',
  'US3865595A'])

In [None]:
##OPTIONAL - Investigating if elastic search can be used to get familyIDs rather than making query to SQL database

In [37]:
#printing publication date of all the similar patents (note that here they can be after 1993-12-22T00:00:00Z)
for i in similar_patents:
    res = client.get(
        index='docdb_cos',
        id=i,
    )
    print(res["_source"]["familyID"])

1239749
1239749
11772668
36227807


KeyError: 'familyID'

In [80]:
#reading the data which contains the keys that need to be passed to logicmill server
data=pd.read_csv("3keys_for_logicmill.csv")
data.drop(columns='Unnamed: 0',inplace=True)

In [81]:
data.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key
0,574528.0,307102421.0,1985-04-10,EP,Y,EP,286634,A1,EP0286634A1
1,574586.0,387396319.0,1987-06-08,EP,Y,EP,320505,A1,EP0320505A1
2,574660.0,387398325.0,1986-08-04,EP,Y,EP,401206,A1,EP0401206A1
3,1226456.0,303518198.0,1991-08-29,EP,Y,EP,670744,A1,EP0670744A1
4,1226485.0,302640285.0,1992-08-12,EP,Y,EP,607158,A1,EP0607158A1


In [82]:
#storing keys in the form of list
keys=list(data['key'][0:50])

In [83]:
keys

['EP0286634A1',
 'EP0320505A1',
 'EP0401206A1',
 'EP0670744A1',
 'EP0607158A1',
 'EP0537145A1',
 'EP470080A1',
 'EP0541533A1',
 'EP0580570A1',
 'EP0597835A1',
 'EP0599834A1',
 'EP0555233A1',
 'EP0574583A1',
 'EP0607134A1',
 'EP0602033A1',
 'EP0683881A1',
 'EP0684902A1',
 'EP0688172A1',
 'EP0675992A1',
 'EP0823839A1',
 'EP0578632A1',
 'EP0555209A1',
 'EP0594850A1',
 'EP1407424A2',
 'EP1441628A1',
 'EP1180562A1',
 'EP1183973A1',
 'EP1312399A1',
 'EP0824178A2',
 'EP0891738A2',
 'EP0893202A2',
 'EP0997608A2',
 'EP0997084A2',
 'EP0223418A1',
 'EP221816A2',
 'EP0309186A1',
 'EP332113A2',
 'EP0478022A1',
 'EP0516139A1',
 'EP0660860A1',
 'EP0628277A1',
 'EP0663222A1',
 'EP0838187A1',
 'EP0783071A1',
 'EP0721026A1',
 'EP0720824A2',
 'EP0565566A1',
 'EP0873059A1',
 'EP0953085A1',
 'EP0519036A1']

In [84]:
from elasticsearch import NotFoundError
import numpy as np
d={} #to store the patent and its corresponding k similar patents as key value pairs
patents_not_found=[] #to store patents which were not found when searching in logicmill
publicationdate_not_found=[]
embeddings_not_found=[] #to store the patents which were found but for which embeddings were not found

In [85]:
data['similar_patents']=np.nan #creating the column to store similar patents referenced by their respective keys
data.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents
0,574528.0,307102421.0,1985-04-10,EP,Y,EP,286634,A1,EP0286634A1,
1,574586.0,387396319.0,1987-06-08,EP,Y,EP,320505,A1,EP0320505A1,
2,574660.0,387398325.0,1986-08-04,EP,Y,EP,401206,A1,EP0401206A1,
3,1226456.0,303518198.0,1991-08-29,EP,Y,EP,670744,A1,EP0670744A1,
4,1226485.0,302640285.0,1992-08-12,EP,Y,EP,607158,A1,EP0607158A1,


In [86]:
data['similar_patents']=data['similar_patents'].astype('object')
data.dtypes

docdb_family_id         float64
pat_publn_id            float64
earliest_filing_date     object
appln_auth               object
granted                  object
publn_auth               object
publn_nr                  int64
publn_kind               object
key                      object
similar_patents          object
dtype: object

In [87]:
h=-1
for i in keys:
    h=h+1
    try:
        res = client.get(
            index='docdb_cos',
            id=i,
        )
        if 'embedding'in res['_source'].keys():
            if 'publicationDate'in res['_source'].keys():
                    
                embedding = res["_source"]["embedding"]
                target_date = res["_source"]["publicationDate"]
                k = 50
                # Construct the search query with KNN and date filter
                query = {
                    "size": k,
                    "query": {
                        "bool": {
                            "must": {
                                "knn": {
                                    "field": "embedding",
                                    "num_candidates": min(k * 2, 10_000),
                                    "query_vector": embedding
                                }
                            },
                            "filter": {
                                "range": {
                                    "publicationDate": {
                                        "lte": target_date  # Only include patents before this date
                                    }
                                }
                            }
                        }
                    },
                    "_source": False
                }
                # Perform the search
                search = client.search(
                    index="docdb_cos",
                    body=query
                )

                #print(search["hits"]["hits"])
                l=[] # to store all the similar patents corresponding to the patent with key=i
                for j in search["hits"]["hits"]:
                    l.append(j['_id'])
                d[i]=l
                data.at[h,'similar_patents']=l
            
            else:
                publicationdate_not_found.append(i)    
        else: 
            embeddings_not_found.append(i)
    except NotFoundError:
        patents_not_found.append(i)

In [88]:
len(d)

19

In [89]:
len(embeddings_not_found)

25

In [90]:
len(patents_not_found)

5

In [91]:
len(publicationdate_not_found)

1

In [92]:
data.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents
0,574528.0,307102421.0,1985-04-10,EP,Y,EP,286634,A1,EP0286634A1,
1,574586.0,387396319.0,1987-06-08,EP,Y,EP,320505,A1,EP0320505A1,
2,574660.0,387398325.0,1986-08-04,EP,Y,EP,401206,A1,EP0401206A1,
3,1226456.0,303518198.0,1991-08-29,EP,Y,EP,670744,A1,EP0670744A1,
4,1226485.0,302640285.0,1992-08-12,EP,Y,EP,607158,A1,EP0607158A1,


In [94]:
data2=data[data['similar_patents'].notna()].reset_index() #creating the dataframe with only those patents for which similar patents were found
data2.drop(columns='index',inplace=True)

In [95]:
data2.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents
0,1239749.0,303117700.0,1991-12-25,EP,Y,EP,574583,A1,EP0574583A1,"[EP0574583A1, WO9312941A1, EP0439049A1, EP0121..."
1,3411930.0,301922022.0,1992-04-15,EP,N,EP,594850,A1,EP0594850A1,"[EP0594850A1, EP0555928A1, EP0524158A1, EP0331..."
2,3460833.0,290372405.0,2000-08-15,EP,N,EP,1180562,A1,EP1180562A1,"[EP1180562A1, EP0676509A1, EP0383740A1, EP0327..."
3,3460835.0,290339286.0,2000-08-31,EP,N,EP,1183973,A1,EP1183973A1,"[EP1183973A1, EP0331225A1, EP0341176A1, EP0457..."
4,3460912.0,287876166.0,2001-11-15,EP,N,EP,1312399,A1,EP1312399A1,"[EP1312399A1, US5779379A, EP1068947A1, WO88089..."
