In [1]:
import os
from getpass import getpass

from elasticsearch import Elasticsearch

# Connection settings are taken from the environment so that no secret is
# stored in the notebook. If ES_PASSWORD is not set, the password is asked for
# interactively instead.
# NOTE(review): a password was previously committed in this cell; it should be
# treated as leaked and rotated.
ES_HOST = os.environ.get("ES_HOST", "https://es2.logic-mill.net:443")
ES_USER = os.environ.get("ES_USER", "elastic")
ES_PASSWORD = os.environ.get("ES_PASSWORD") or getpass("Elasticsearch password: ")

# Initialize Elasticsearch client
client = Elasticsearch(
    hosts=ES_HOST,
    basic_auth=(ES_USER, ES_PASSWORD),
    # Retry requests that fail with one of these HTTP 5xx statuses or that time out.
    retry_on_status=[500, 501, 502, 503, 504, 524],
    retry_on_timeout=True,
    request_timeout=60,
)

In [2]:
# Fetch the field mapping of the "docdb_cos" index and print it, to see which
# fields the stored documents have.
mapping = client.indices.get_mapping(index="docdb_cos")
print(mapping)

{'docdb_cos': {'mappings': {'properties': {'abstract': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'applicants': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'authority': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'cpcClasses': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'docDbId': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'docNumber': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'embedding': {'type': 'dense_vector', 'dims': 768, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}, 'embedding_model': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'encoded_at': {'type': 'date'}, 'familyID': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above

In [3]:
print(mapping['docdb_cos']['mappings']['properties'].keys())
# We need the publication date and the family ID, which are stored under the
# keys publicationDate and familyID respectively.

dict_keys(['abstract', 'applicants', 'authority', 'cpcClasses', 'docDbId', 'docNumber', 'embedding', 'embedding_model', 'encoded_at', 'familyID', 'id', 'inventors', 'kind', 'publicationDate', 'title'])


In [4]:
# Fetch the focal patent: its stored embedding is used as the query vector and
# its publication date as the upper bound of the date filter.
res = client.get(
    index='docdb_cos',
    id="EP0574583A1",
)
embedding = res["_source"]["embedding"]
k = 10
target_date = res["_source"]["publicationDate"]  # publication date of the focal patent

# Construct the kNN query with the date restriction placed INSIDE the knn
# clause, where Elasticsearch applies it during the vector search (pre-filter).
# A knn clause wrapped in a bool query with a sibling "filter" is only
# post-filtered: the nearest candidates are collected first and the date filter
# is applied afterwards, which can return fewer than k hits.
query = {
    "size": k,
    "query": {
        "knn": {
            "field": "embedding",
            "query_vector": embedding,
            "num_candidates": min(k * 2, 10_000),
            "filter": {
                "range": {
                    "publicationDate": {
                        # "lte" is inclusive: patents published on or before
                        # the focal patent's date are kept, so the focal
                        # patent itself is among the hits.
                        "lte": target_date
                    }
                }
            },
        }
    },
    # Only the family ID is needed from each hit's document.
    "_source": ["familyID"],
}

# Perform the search
search = client.search(
    index="docdb_cos",
    body=query,
)

# Keep the hits for the following cell and print them together with their count.
result = search["hits"]["hits"]
print(result)
print(len(result))

[{'_index': 'docdb_cos', '_id': 'EP0574583A1', '_score': 0.9991764, '_ignored': ['abstract.keyword'], '_source': {'familyID': '1239749'}}, {'_index': 'docdb_cos', '_id': 'WO9312941A1', '_score': 0.9973502, '_ignored': ['abstract.keyword'], '_source': {'familyID': '1239749'}}, {'_index': 'docdb_cos', '_id': 'EP0439049A1', '_score': 0.98186874, '_ignored': ['abstract.keyword'], '_source': {'familyID': '11772668'}}, {'_index': 'docdb_cos', '_id': 'EP0121935A2', '_score': 0.9772881, '_ignored': ['abstract.keyword'], '_source': {'familyID': '13228411'}}, {'_index': 'docdb_cos', '_id': 'US4599288A', '_score': 0.9769945, '_ignored': ['abstract.keyword'], '_source': {'familyID': '12721521'}}, {'_index': 'docdb_cos', '_id': 'EP0327897A2', '_score': 0.9763912, '_ignored': ['abstract.keyword'], '_source': {'familyID': '26357165'}}, {'_index': 'docdb_cos', '_id': 'US4705719A', '_score': 0.97617257, '_ignored': ['abstract.keyword'], '_source': {'familyID': '26434884'}}, {'_index': 'docdb_cos', '_id

In [5]:
# Print the family ID of every hit returned by the search, one per line.
for hit in result:
    family_id = hit["_source"]["familyID"]
    print(family_id)

1239749
1239749
11772668
13228411
12721521
26357165
26434884
13228411
26866231
23178773


In [6]:
# Load the table containing the keys that are passed to the Logic Mill server.
import pandas as pd

# The "Unnamed: 0" column is not needed and is dropped right after loading.
data = pd.read_csv("3keys_for_logicmill.csv").drop(columns='Unnamed: 0')

In [7]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key
0,574528.0,307102421.0,1985-04-10,EP,Y,EP,286634,A1,EP0286634A1
1,574586.0,387396319.0,1987-06-08,EP,Y,EP,320505,A1,EP0320505A1
2,574660.0,387398325.0,1986-08-04,EP,Y,EP,401206,A1,EP0401206A1
3,1226456.0,303518198.0,1991-08-29,EP,Y,EP,670744,A1,EP0670744A1
4,1226485.0,302640285.0,1992-08-12,EP,Y,EP,607158,A1,EP0607158A1


In [8]:
# Collect the first 100 patent keys into a plain Python list.
keys = data['key'].iloc[:100].tolist()

In [9]:
# Display the selected keys.
keys

['EP0286634A1',
 'EP0320505A1',
 'EP0401206A1',
 'EP0670744A1',
 'EP0607158A1',
 'EP0537145A1',
 'EP470080A1',
 'EP0541533A1',
 'EP0580570A1',
 'EP0597835A1',
 'EP0599834A1',
 'EP0555233A1',
 'EP0574583A1',
 'EP0607134A1',
 'EP0602033A1',
 'EP0683881A1',
 'EP0684902A1',
 'EP0688172A1',
 'EP0675992A1',
 'EP0823839A1',
 'EP0578632A1',
 'EP0555209A1',
 'EP0594850A1',
 'EP1407424A2',
 'EP1441628A1',
 'EP1180562A1',
 'EP1183973A1',
 'EP1312399A1',
 'EP0824178A2',
 'EP0891738A2',
 'EP0893202A2',
 'EP0997608A2',
 'EP0997084A2',
 'EP0223418A1',
 'EP221816A2',
 'EP0309186A1',
 'EP332113A2',
 'EP0478022A1',
 'EP0516139A1',
 'EP0660860A1',
 'EP0628277A1',
 'EP0663222A1',
 'EP0838187A1',
 'EP0783071A1',
 'EP0721026A1',
 'EP0720824A2',
 'EP0565566A1',
 'EP0873059A1',
 'EP0953085A1',
 'EP0519036A1',
 'EP1020133A1',
 'EP1422376A1',
 'EP0274683A1',
 'EP0274084A1',
 'EP0853173A2',
 'EP1329555A1',
 'EP0436523A1',
 'EP1329554A1',
 'EP0853006A2',
 'EP0722013A1',
 'EP0957733A1',
 'EP0853160A2',
 'EP0116028

In [10]:
from elasticsearch import NotFoundError
import numpy as np
# Result containers filled by the lookup loop:
#   d  : patent key -> IDs of its k similar patents
#   d2 : patent key -> family IDs of those similar patents
d, d2 = {}, {}

# Keys that could not be processed, grouped by reason:
#   patents_not_found         : the patent was not found when searching in Logic Mill
#   publicationdate_not_found : the patent was found but has no publication date
#   embeddings_not_found      : the patent was found but has no embedding
patents_not_found, publicationdate_not_found, embeddings_not_found = [], [], []

In [11]:
# Add two all-NaN columns that will hold, per key, the similar patents and the
# family IDs of those similar patents.
for new_column in ('similar_patents', 'similar_patents_familyIDs'):
    data[new_column] = np.nan
data.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents,similar_patents_familyIDs
0,574528.0,307102421.0,1985-04-10,EP,Y,EP,286634,A1,EP0286634A1,,
1,574586.0,387396319.0,1987-06-08,EP,Y,EP,320505,A1,EP0320505A1,,
2,574660.0,387398325.0,1986-08-04,EP,Y,EP,401206,A1,EP0401206A1,,
3,1226456.0,303518198.0,1991-08-29,EP,Y,EP,670744,A1,EP0670744A1,,
4,1226485.0,302640285.0,1992-08-12,EP,Y,EP,607158,A1,EP0607158A1,,


In [12]:
# Cast both result columns to object dtype so that a cell can hold a list.
for list_column in ('similar_patents', 'similar_patents_familyIDs'):
    data[list_column] = data[list_column].astype('object')
data.dtypes

docdb_family_id              float64
pat_publn_id                 float64
earliest_filing_date          object
appln_auth                    object
granted                       object
publn_auth                    object
publn_nr                       int64
publn_kind                    object
key                           object
similar_patents               object
similar_patents_familyIDs     object
dtype: object

In [13]:
# For every key: fetch the patent from the index, run a date-restricted kNN
# search with its embedding, and record the IDs and family IDs of the hits.

k = 2000  # number of similar patents requested per key (same for every key)

# Empty the containers first so that re-running this cell does not append the
# same keys to the bookkeeping lists a second time.
d.clear()
d2.clear()
patents_not_found.clear()
publicationdate_not_found.clear()
embeddings_not_found.clear()

# The position of a key in `keys` is used as the row label in `data`. This
# relies on `keys` being the leading rows of data['key'] and on `data` having
# its default 0..n-1 index.
for row_pos, patent_key in enumerate(keys):
    # Only the document lookup is guarded: a NotFoundError here means the
    # patent itself is missing from the index.
    try:
        res = client.get(
            index='docdb_cos',
            id=patent_key,
        )
    except NotFoundError:
        patents_not_found.append(patent_key)
        continue

    source = res["_source"]
    if "embedding" not in source:
        embeddings_not_found.append(patent_key)
        continue
    if "publicationDate" not in source:
        publicationdate_not_found.append(patent_key)
        continue

    embedding = source["embedding"]
    target_date = source["publicationDate"]

    # The date restriction sits INSIDE the knn clause, where Elasticsearch
    # applies it during the vector search (pre-filter). A knn clause wrapped in
    # a bool query with a sibling "filter" is only post-filtered, which can
    # return fewer than k hits.
    query = {
        "size": k,
        "query": {
            "knn": {
                "field": "embedding",
                "query_vector": embedding,
                "num_candidates": min(k * 2, 10_000),
                "filter": {
                    "range": {
                        "publicationDate": {
                            # "lte" is inclusive: patents published on or
                            # before the focal patent's date are kept.
                            "lte": target_date
                        }
                    }
                },
            }
        },
        # Only the family ID is needed from each hit's document.
        "_source": ["familyID"],
    }

    # Perform the search
    search = client.search(
        index="docdb_cos",
        body=query,
    )

    hits = search["hits"]["hits"]
    similar_ids = [hit["_id"] for hit in hits]  # IDs of the similar patents
    similar_family_ids = [hit["_source"]["familyID"] for hit in hits]  # their family IDs

    d[patent_key] = similar_ids
    d2[patent_key] = similar_family_ids
    data.at[row_pos, 'similar_patents'] = similar_ids
    data.at[row_pos, 'similar_patents_familyIDs'] = similar_family_ids

In [14]:
# Number of keys for which similar patents were retrieved.
len(d)

58

In [15]:
# Number of keys whose document was found but has no embedding field.
len(embeddings_not_found)

34

In [16]:
# Number of keys for which the lookup raised NotFoundError.
len(patents_not_found)

5

In [17]:
# Number of keys whose document has an embedding but no publicationDate field.
len(publicationdate_not_found)

3

In [18]:
# Preview the table after the lookup loop; rows whose key produced no result
# keep NaN in the two result columns.
data.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents,similar_patents_familyIDs
0,574528.0,307102421.0,1985-04-10,EP,Y,EP,286634,A1,EP0286634A1,,
1,574586.0,387396319.0,1987-06-08,EP,Y,EP,320505,A1,EP0320505A1,,
2,574660.0,387398325.0,1986-08-04,EP,Y,EP,401206,A1,EP0401206A1,,
3,1226456.0,303518198.0,1991-08-29,EP,Y,EP,670744,A1,EP0670744A1,,
4,1226485.0,302640285.0,1992-08-12,EP,Y,EP,607158,A1,EP0607158A1,,


In [19]:
# Keep only the patents for which similar patents were found, and renumber the
# remaining rows from 0.
has_similar = data['similar_patents'].notna()
data2 = data[has_similar].reset_index(drop=True)

In [20]:
# Preview the patents that have similar patents attached.
data2.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents,similar_patents_familyIDs
0,1239749.0,303117700.0,1991-12-25,EP,Y,EP,574583,A1,EP0574583A1,"[EP0574583A1, WO9312941A1, EP0439049A1, EP0121...","[1239749, 1239749, 11772668, 13228411, 1272152..."
1,3411930.0,301922022.0,1992-04-15,EP,N,EP,594850,A1,EP0594850A1,"[EP0594850A1, EP0555928A1, EP0524158A1, EP0331...","[3411930, 11361954, 11349746, 11150663, 932212..."
2,3460833.0,290372405.0,2000-08-15,EP,N,EP,1180562,A1,EP1180562A1,"[EP1180562A1, EP0676509A1, EP0383740A1, EP0327...","[3460833, 9462223, 26977734, 9362924, 26977734..."
3,3460835.0,290339286.0,2000-08-31,EP,N,EP,1183973,A1,EP1183973A1,"[EP1183973A1, EP0331225A1, EP0341176A1, EP0457...","[3460835, 19851828, 9364981, 24051253, 2405125..."
4,3460912.0,287876166.0,2001-11-15,EP,N,EP,1312399,A1,EP1312399A1,"[EP1312399A1, US5779379A, EP1068947A1, WO88089...","[3460912, 25249552, 8243487, 27489383, 2670732..."


In [21]:
# Number of family IDs (repeats included) returned for the first patent in data2.
len(data2['similar_patents_familyIDs'][0])

2000

In [22]:
# Number of distinct family IDs returned for the first patent in data2.
len(np.unique(data2['similar_patents_familyIDs'][0]))

1842

In [23]:
def count_unique_elements(lst):
    """Return how many distinct values `lst` contains.

    The values are de-duplicated with ``np.unique`` and the number of
    entries in the resulting array is returned.
    """
    distinct_values = np.unique(lst)
    return distinct_values.size

In [24]:
# Store, for every patent, the number of distinct families among its similar patents.
family_id_lists = data2['similar_patents_familyIDs']
data2['num_unique_familyIDs'] = family_id_lists.apply(count_unique_elements)

In [25]:
# Preview the table with the new num_unique_familyIDs column.
data2.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents,similar_patents_familyIDs,num_unique_familyIDs
0,1239749.0,303117700.0,1991-12-25,EP,Y,EP,574583,A1,EP0574583A1,"[EP0574583A1, WO9312941A1, EP0439049A1, EP0121...","[1239749, 1239749, 11772668, 13228411, 1272152...",1842
1,3411930.0,301922022.0,1992-04-15,EP,N,EP,594850,A1,EP0594850A1,"[EP0594850A1, EP0555928A1, EP0524158A1, EP0331...","[3411930, 11361954, 11349746, 11150663, 932212...",1900
2,3460833.0,290372405.0,2000-08-15,EP,N,EP,1180562,A1,EP1180562A1,"[EP1180562A1, EP0676509A1, EP0383740A1, EP0327...","[3460833, 9462223, 26977734, 9362924, 26977734...",1922
3,3460835.0,290339286.0,2000-08-31,EP,N,EP,1183973,A1,EP1183973A1,"[EP1183973A1, EP0331225A1, EP0341176A1, EP0457...","[3460835, 19851828, 9364981, 24051253, 2405125...",1907
4,3460912.0,287876166.0,2001-11-15,EP,N,EP,1312399,A1,EP1312399A1,"[EP1312399A1, US5779379A, EP1068947A1, WO88089...","[3460912, 25249552, 8243487, 27489383, 2670732...",1746


In [26]:
# Show the patents whose similar patents cover fewer than 1000 distinct families.
data2[data2['num_unique_familyIDs']<1000]

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents,similar_patents_familyIDs,num_unique_familyIDs
