In [2]:
from elasticsearch import Elasticsearch

# Initialize Elasticsearch client
client = Elasticsearch(
    hosts=f"https://es2.logic-mill.net:443",
    basic_auth=("elastic", "MTqCwEli2DWAJTFGKwIa"),
    retry_on_status=[500, 501, 502, 503, 504, 524],
    retry_on_timeout=True,
    request_timeout=60,
)

In [3]:
mapping = client.indices.get_mapping(index="docdb_cos")
print(mapping)

{'docdb_cos': {'mappings': {'properties': {'abstract': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'applicants': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'authority': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'cpcClasses': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'docDbId': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'docNumber': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'embedding': {'type': 'dense_vector', 'dims': 768, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}, 'embedding_model': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'encoded_at': {'type': 'date'}, 'familyID': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above

In [4]:
print(mapping['docdb_cos']['mappings']['properties'].keys())
#we need publication date and family ids which is stored in the keys publicationDate and familyID respectively

dict_keys(['abstract', 'applicants', 'authority', 'cpcClasses', 'docDbId', 'docNumber', 'embedding', 'embedding_model', 'encoded_at', 'familyID', 'id', 'inventors', 'kind', 'publicationDate', 'title'])


In [5]:
# Fetch the embedding for the given patent ID
res = client.get(
    index='docdb_cos',
    id="EP0574583A1",
)
embedding = res["_source"]["embedding"]
k = 10
target_date = res["_source"]["publicationDate"]  #publication date of the focal patent

# Construct the search query with KNN and date filter
query = {
    "size": k,
    "query": {
        "bool": {
            "must": {
                "knn": {
                    "field": "embedding",
                    "num_candidates": min(k * 2, 10_000),
                    "query_vector": embedding
                }
            },
            "filter": {
                "range": {
                    "publicationDate": {
                        "lte": target_date  # Only include patents before this date
                    }
                }
            }
        }
    },
    "_source": ["familyID"]
}

# Perform the search
search = client.search(
    index="docdb_cos",
    body=query
)
# Print the search results
result=search["hits"]["hits"]
print(result)
print(len(result))

[{'_index': 'docdb_cos', '_id': 'EP0574583A1', '_score': 0.9991764, '_ignored': ['abstract.keyword'], '_source': {'familyID': '1239749'}}, {'_index': 'docdb_cos', '_id': 'WO9312941A1', '_score': 0.9973502, '_ignored': ['abstract.keyword'], '_source': {'familyID': '1239749'}}, {'_index': 'docdb_cos', '_id': 'EP0439049A1', '_score': 0.98186874, '_ignored': ['abstract.keyword'], '_source': {'familyID': '11772668'}}, {'_index': 'docdb_cos', '_id': 'EP0121935A2', '_score': 0.9772881, '_ignored': ['abstract.keyword'], '_source': {'familyID': '13228411'}}, {'_index': 'docdb_cos', '_id': 'US4599288A', '_score': 0.9769945, '_ignored': ['abstract.keyword'], '_source': {'familyID': '12721521'}}, {'_index': 'docdb_cos', '_id': 'EP0327897A2', '_score': 0.9763912, '_ignored': ['abstract.keyword'], '_source': {'familyID': '26357165'}}, {'_index': 'docdb_cos', '_id': 'US4705719A', '_score': 0.97617257, '_ignored': ['abstract.keyword'], '_source': {'familyID': '26434884'}}, {'_index': 'docdb_cos', '_id

In [6]:
for i in result:
    print(i["_source"]["familyID"])

1239749
1239749
11772668
13228411
12721521
26357165
26434884
13228411
26866231
23178773


In [None]:
#reading the data which contains the keys that need to be passed to logicmill server
import pandas as pd
data=pd.read_csv("3keys_for_logicmill.csv")
data.drop(columns='Unnamed: 0',inplace=True)

In [8]:
data.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key
0,574528.0,307102421.0,1985-04-10,EP,Y,EP,286634,A1,EP0286634A1
1,574586.0,387396319.0,1987-06-08,EP,Y,EP,320505,A1,EP0320505A1
2,574660.0,387398325.0,1986-08-04,EP,Y,EP,401206,A1,EP0401206A1
3,1226456.0,303518198.0,1991-08-29,EP,Y,EP,670744,A1,EP0670744A1
4,1226485.0,302640285.0,1992-08-12,EP,Y,EP,607158,A1,EP0607158A1


In [9]:
#storing keys in the form of list
keys=list(data['key'])

In [10]:
len(keys)

229282

In [12]:
from elasticsearch import NotFoundError
import numpy as np
d={} #to store the patent and its corresponding k similar patents as key value pairs
d2={}
patents_not_found=[] #to store patents which were not found when searching in logicmill
publicationdate_not_found=[] #to store the patents which were found but for which publication date were not found
embeddings_not_found=[] #to store the patents which were found but for which embeddings were not found

In [13]:
data['similar_patents']=np.nan #creating the column to store similar patents referenced by their respective keys
data['similar_patents_familyIDs']=np.nan #creating the column to store similar patents family ids referenced by their respective keys
data.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents,similar_patents_familyIDs
0,574528.0,307102421.0,1985-04-10,EP,Y,EP,286634,A1,EP0286634A1,,
1,574586.0,387396319.0,1987-06-08,EP,Y,EP,320505,A1,EP0320505A1,,
2,574660.0,387398325.0,1986-08-04,EP,Y,EP,401206,A1,EP0401206A1,,
3,1226456.0,303518198.0,1991-08-29,EP,Y,EP,670744,A1,EP0670744A1,,
4,1226485.0,302640285.0,1992-08-12,EP,Y,EP,607158,A1,EP0607158A1,,


In [14]:
data['similar_patents']=data['similar_patents'].astype('object')
data['similar_patents_familyIDs']=data['similar_patents_familyIDs'].astype('object')
data.dtypes

docdb_family_id              float64
pat_publn_id                 float64
earliest_filing_date          object
appln_auth                    object
granted                       object
publn_auth                    object
publn_nr                       int64
publn_kind                    object
key                           object
similar_patents               object
similar_patents_familyIDs     object
dtype: object

In [46]:
h=-1
for i in keys:
    h=h+1
    try:
        res = client.get(
            index='docdb_cos',
            id=i,
        )
        if 'embedding'in res['_source'].keys():
            if 'publicationDate'in res['_source'].keys():
                    
                embedding = res["_source"]["embedding"]
                target_date = res["_source"]["publicationDate"]
                k = 2000
                # Construct the search query with KNN and date filter
                query = {
                    "size": k,
                    "query": {
                        "bool": {
                            "must": {
                                "knn": {
                                    "field": "embedding",
                                    "num_candidates": min(k * 2, 10_000),
                                    "query_vector": embedding
                                }
                            },
                            "filter": {
                                "range": {
                                    "publicationDate": {
                                        "lte": target_date  # Only include patents before this date
                                    }
                                }
                            }
                        }
                    },
                    "_source": ["familyID"] 
                }
                # Perform the search
                search = client.search(
                    index="docdb_cos",
                    body=query
                )

                #print(search["hits"]["hits"])
                l=[] # to store all the similar patents corresponding to the patent with key=i
                l2=[] #to store all the similar patents family id corresponding to the patent with key=i
                for j in search["hits"]["hits"]:
                    l.append(j['_id'])
                    l2.append(j["_source"]["familyID"])
                d[i]=l
                d2[i]=l2
                data.at[h,'similar_patents']=l
                data.at[h,'similar_patents_familyIDs']=l2
            
            else:
                publicationdate_not_found.append(i)    
        else: 
            embeddings_not_found.append(i)
    except NotFoundError:
        patents_not_found.append(i)

In [47]:
len(d)

124310

In [48]:
len(embeddings_not_found)

85153

In [49]:
len(patents_not_found)

11487

In [50]:
len(publicationdate_not_found)

8332

In [51]:
df_embeddings_not_found = pd.DataFrame(embeddings_not_found, columns=['key'])
df_patents_not_found = pd.DataFrame(patents_not_found, columns=['key'])
df_publicationdate_not_found = pd.DataFrame(publicationdate_not_found, columns=['key'])

In [52]:
df_publicationdate_not_found.head()

Unnamed: 0,key
0,EP0838187A1
1,EP0437418A1
2,EP0664096A1
3,EP0786276A2
4,EP0864710A2


In [53]:
df_embeddings_not_found.to_csv('patents_without_embeddings.csv', index=False)
df_patents_not_found.to_csv('patents_not_found.csv', index=False)
df_publicationdate_not_found.to_csv('patents_without_publicationdate.csv', index=False)

In [54]:
data.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents,similar_patents_familyIDs
0,574528.0,307102421.0,1985-04-10,EP,Y,EP,286634,A1,EP0286634A1,,
1,574586.0,387396319.0,1987-06-08,EP,Y,EP,320505,A1,EP0320505A1,,
2,574660.0,387398325.0,1986-08-04,EP,Y,EP,401206,A1,EP0401206A1,,
3,1226456.0,303518198.0,1991-08-29,EP,Y,EP,670744,A1,EP0670744A1,,
4,1226485.0,302640285.0,1992-08-12,EP,Y,EP,607158,A1,EP0607158A1,,


In [55]:
data2=data[data['similar_patents'].notna()].reset_index() #creating the dataframe with only those patents for which similar patents were found
data2.drop(columns='index',inplace=True)

In [56]:
data2.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents,similar_patents_familyIDs
0,1239749.0,303117700.0,1991-12-25,EP,Y,EP,574583,A1,EP0574583A1,"[EP0574583A1, WO9312941A1, EP0439049A1, EP0121...","[1239749, 1239749, 11772668, 13228411, 1272152..."
1,3411930.0,301922022.0,1992-04-15,EP,N,EP,594850,A1,EP0594850A1,"[EP0594850A1, EP0555928A1, EP0524158A1, EP0331...","[3411930, 11361954, 11349746, 11150663, 932212..."
2,3460833.0,290372405.0,2000-08-15,EP,N,EP,1180562,A1,EP1180562A1,"[EP1180562A1, EP0676509A1, EP0383740A1, EP0327...","[3460833, 9462223, 26977734, 9362924, 26977734..."
3,3460835.0,290339286.0,2000-08-31,EP,N,EP,1183973,A1,EP1183973A1,"[EP1183973A1, EP0331225A1, EP0341176A1, EP0457...","[3460835, 19851828, 9364981, 24051253, 2405125..."
4,3460912.0,287876166.0,2001-11-15,EP,N,EP,1312399,A1,EP1312399A1,"[EP1312399A1, US5779379A, EP1068947A1, WO88089...","[3460912, 25249552, 8243487, 27489383, 2670732..."


In [57]:
len(data2['similar_patents_familyIDs'][0])

2000

In [58]:
len(np.unique(data2['similar_patents_familyIDs'][0]))

1842

In [25]:
def count_unique_elements(lst):
    return len(np.unique(lst))

In [60]:
#to store number of unique similar patents families
data2['num_unique_familyIDs'] = data2['similar_patents_familyIDs'].apply(count_unique_elements)

In [61]:
data2.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents,similar_patents_familyIDs,num_unique_familyIDs
0,1239749.0,303117700.0,1991-12-25,EP,Y,EP,574583,A1,EP0574583A1,"[EP0574583A1, WO9312941A1, EP0439049A1, EP0121...","[1239749, 1239749, 11772668, 13228411, 1272152...",1842
1,3411930.0,301922022.0,1992-04-15,EP,N,EP,594850,A1,EP0594850A1,"[EP0594850A1, EP0555928A1, EP0524158A1, EP0331...","[3411930, 11361954, 11349746, 11150663, 932212...",1900
2,3460833.0,290372405.0,2000-08-15,EP,N,EP,1180562,A1,EP1180562A1,"[EP1180562A1, EP0676509A1, EP0383740A1, EP0327...","[3460833, 9462223, 26977734, 9362924, 26977734...",1922
3,3460835.0,290339286.0,2000-08-31,EP,N,EP,1183973,A1,EP1183973A1,"[EP1183973A1, EP0331225A1, EP0341176A1, EP0457...","[3460835, 19851828, 9364981, 24051253, 2405125...",1907
4,3460912.0,287876166.0,2001-11-15,EP,N,EP,1312399,A1,EP1312399A1,"[EP1312399A1, US5779379A, EP1068947A1, WO88089...","[3460912, 25249552, 8243487, 27489383, 2670732...",1746


In [62]:
data2.to_csv('data_final.csv', index=False)

In [63]:
data2[data2['num_unique_familyIDs']<1000]

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents,similar_patents_familyIDs,num_unique_familyIDs
317,3492171.0,310993507.0,1979-01-24,EP,Y,EP,13978,A1,EP0013978A1,"[EP0013978A1, EP0010763A1, US3744186A, US39083...","[3492171, 6053937, 22225039, 9108277, 5727525,...",697
419,3496892.0,310576331.0,1980-05-05,EP,Y,EP,40189,A1,EP0040189A1,"[EP0040189A1, US3744159A, US4232459A, US401285...","[3496892, 11903362, 11198666, 24641423, 273045...",323
507,3501543.0,311495411.0,1978-02-08,EP,Y,EP,3715,A1,EP0003715A1,"[EP0003715A1, US3900966A, US3983669A, US407453...","[3501543, 8358862, 27159397, 25768687, 5851936...",476
654,3509044.0,315990994.0,1982-03-29,EP,N,EP,90792,A2,EP0090792A2,"[EP0090792A2, EP0089850A1, EP0044880A1, EP0007...","[3509044, 10529126, 8186735, 9210597, 8188143,...",608
780,3515298.0,310056793.0,1981-04-01,EP,N,EP,62028,A2,EP0062028A2,"[EP0062028A2, EP0042711A1, EP0046220A1, EP0026...","[3515298, 22581068, 8513685, 3586184, 27420904...",999
...,...,...,...,...,...,...,...,...,...,...,...,...
121341,46754239.0,379462562.0,2011-06-30,EP,N,EP,2541515,A1,EP2541515A1,"[EP2541515A1, US2010227677A1, US8038528B2, WO2...","[46754239, 38510144, 35428275, 38510144, 35428...",999
121888,47008324.0,405189039.0,2011-09-21,EP,Y,EP,2578276,A1,EP2578276A1,"[US2013072323A1, EP2578276A1, WO0013874A1, US6...","[47008324, 47008324, 22517910, 22517910, 40087...",994
121912,47010435.0,406580211.0,2011-11-17,EP,N,EP,2594733,A1,EP2594733A1,"[EP2594733A1, US7603265B2, WO2008103986B1, WO2...","[47010435, 34610791, 39477549, 39477549, 46383...",993
122617,47501540.0,408781793.0,2011-12-22,EP,N,EP,2626507,A1,EP2626507A1,"[WO2013101636A2, EP2626507A1, US2012119744A1, ...","[47501540, 47501540, 45048243, 45048243, 45048...",973


In [65]:
len(data2[data2['num_unique_familyIDs']<1000])

2251

In [64]:
(pd.read_csv("data_final.csv")).head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents,similar_patents_familyIDs,num_unique_familyIDs
0,1239749.0,303117700.0,1991-12-25,EP,Y,EP,574583,A1,EP0574583A1,"['EP0574583A1', 'WO9312941A1', 'EP0439049A1', ...","['1239749', '1239749', '11772668', '13228411',...",1842
1,3411930.0,301922022.0,1992-04-15,EP,N,EP,594850,A1,EP0594850A1,"['EP0594850A1', 'EP0555928A1', 'EP0524158A1', ...","['3411930', '11361954', '11349746', '11150663'...",1900
2,3460833.0,290372405.0,2000-08-15,EP,N,EP,1180562,A1,EP1180562A1,"['EP1180562A1', 'EP0676509A1', 'EP0383740A1', ...","['3460833', '9462223', '26977734', '9362924', ...",1922
3,3460835.0,290339286.0,2000-08-31,EP,N,EP,1183973,A1,EP1183973A1,"['EP1183973A1', 'EP0331225A1', 'EP0341176A1', ...","['3460835', '19851828', '9364981', '24051253',...",1907
4,3460912.0,287876166.0,2001-11-15,EP,N,EP,1312399,A1,EP1312399A1,"['EP1312399A1', 'US5779379A', 'EP1068947A1', '...","['3460912', '25249552', '8243487', '27489383',...",1746


In [69]:
len(pd.read_csv("data_final.csv"))

124310

In [None]:
#number of patents in data_final.csv =124310
#number of patents in patents_not_found.csv =11487
#number of patents in patents_without_embeddings.csv =85153
#number of patents in patents_without_publicationdate.csv =8332

In [2]:
124310+11487+85153+8332

229282

In [None]:
#number of patents with unique family ids less than 1000 = 2251

In [2]:
#making correction to avoid the representative patent itself in the list of similar patents
import pandas as pd
data_final2=pd.read_csv("data_final.csv")

In [3]:
data_final2.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents,similar_patents_familyIDs,num_unique_familyIDs
0,1239749.0,303117700.0,1991-12-25,EP,Y,EP,574583,A1,EP0574583A1,"['EP0574583A1', 'WO9312941A1', 'EP0439049A1', ...","['1239749', '1239749', '11772668', '13228411',...",1842
1,3411930.0,301922022.0,1992-04-15,EP,N,EP,594850,A1,EP0594850A1,"['EP0594850A1', 'EP0555928A1', 'EP0524158A1', ...","['3411930', '11361954', '11349746', '11150663'...",1900
2,3460833.0,290372405.0,2000-08-15,EP,N,EP,1180562,A1,EP1180562A1,"['EP1180562A1', 'EP0676509A1', 'EP0383740A1', ...","['3460833', '9462223', '26977734', '9362924', ...",1922
3,3460835.0,290339286.0,2000-08-31,EP,N,EP,1183973,A1,EP1183973A1,"['EP1183973A1', 'EP0331225A1', 'EP0341176A1', ...","['3460835', '19851828', '9364981', '24051253',...",1907
4,3460912.0,287876166.0,2001-11-15,EP,N,EP,1312399,A1,EP1312399A1,"['EP1312399A1', 'US5779379A', 'EP1068947A1', '...","['3460912', '25249552', '8243487', '27489383',...",1746


In [4]:
def skip_first(lst):
    return(lst[1:])


In [5]:
skip_first([1,2,4])

[2, 4]

In [11]:
type(data_final2['similar_patents'][0])

str

In [14]:
import ast
type(ast.literal_eval(data_final2['similar_patents'][0]))

list

In [15]:
type(data_final2['similar_patents_familyIDs'][0])

str

In [16]:
type(ast.literal_eval(data_final2['similar_patents_familyIDs'][0]))


list

In [17]:
def skip_first(lst):
    lst1=ast.literal_eval(lst)
    return(lst1[1:])

In [18]:
data_final2['similar_patents'][0:10]

0    ['EP0574583A1', 'WO9312941A1', 'EP0439049A1', ...
1    ['EP0594850A1', 'EP0555928A1', 'EP0524158A1', ...
2    ['EP1180562A1', 'EP0676509A1', 'EP0383740A1', ...
3    ['EP1183973A1', 'EP0331225A1', 'EP0341176A1', ...
4    ['EP1312399A1', 'US5779379A', 'EP1068947A1', '...
5    ['EP0824178A2', 'US4442632A', 'EP0701038A1', '...
6    ['EP0891738A2', 'EP0283468A1', 'EP0718212A1', ...
7    ['EP0893202A2', 'EP0029865A1', 'EP0114654A2', ...
8    ['EP0997608A2', 'EP0530961A1', 'EP0426884A1', ...
9    ['EP0997084A2', 'EP0188213A2', 'EP0527098A2', ...
Name: similar_patents, dtype: object

In [19]:
data_final2['similar_patents'][0:10].apply(skip_first)

0    [WO9312941A1, EP0439049A1, EP0121935A2, US4599...
1    [EP0555928A1, EP0524158A1, EP0331653A2, EP0211...
2    [EP0676509A1, EP0383740A1, EP0327465A1, WO9009...
3    [EP0331225A1, EP0341176A1, EP0457068A2, EP0627...
4    [US5779379A, EP1068947A1, WO8808962A1, WO98234...
5    [US4442632A, EP0701038A1, EP0273789A1, US47947...
6    [EP0283468A1, EP0718212A1, US5579946A, EP06417...
7    [EP0029865A1, EP0114654A2, EP0739731A1, WO8604...
8    [EP0530961A1, EP0426884A1, WO9522005A1, EP0189...
9    [EP0188213A2, EP0527098A2, EP0548785A1, EP0127...
Name: similar_patents, dtype: object

In [20]:
data_final2['similar_patents_familyIDs'][0:10]

0    ['1239749', '1239749', '11772668', '13228411',...
1    ['3411930', '11361954', '11349746', '11150663'...
2    ['3460833', '9462223', '26977734', '9362924', ...
3    ['3460835', '19851828', '9364981', '24051253',...
4    ['3460912', '25249552', '8243487', '27489383',...
5    ['3460975', '23274906', '6525156', '9341337', ...
6    ['3461011', '3495412', '23415338', '23310524',...
7    ['3461021', '8186250', '6189076', '26140671', ...
8    ['3461121', '16474957', '8202097', '20152299',...
9    ['3461123', '25828406', '9415020', '25214899',...
Name: similar_patents_familyIDs, dtype: object

In [21]:
data_final2['similar_patents_familyIDs'][0:10].apply(skip_first)

0    [1239749, 11772668, 13228411, 12721521, 263571...
1    [11361954, 11349746, 11150663, 9322120, 152369...
2    [9462223, 26977734, 9362924, 26977734, 8206602...
3    [19851828, 9364981, 24051253, 24051253, 946569...
4    [25249552, 8243487, 27489383, 26707326, 172770...
5    [23274906, 6525156, 9341337, 16329953, 8281267...
6    [3495412, 23415338, 23310524, 6496905, 6234207...
7    [8186250, 6189076, 26140671, 8089446, 6792024,...
8    [16474957, 8202097, 20152299, 3559341, 6309937...
9    [25828406, 9415020, 25214899, 8425866, 6449236...
Name: similar_patents_familyIDs, dtype: object

In [22]:
data_final2['similar_patents']=data_final2['similar_patents'].apply(skip_first)

In [23]:
data_final2['similar_patents_familyIDs']=data_final2['similar_patents_familyIDs'].apply(skip_first)

In [27]:
import numpy as np
data_final2['num_unique_familyIDs'] = data_final2['similar_patents_familyIDs'].apply(count_unique_elements)

In [None]:
data_final2.to_csv("7similar_patents_less_than_date.csv",index=False)

In [28]:
data_final2.head()

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents,similar_patents_familyIDs,num_unique_familyIDs
0,1239749.0,303117700.0,1991-12-25,EP,Y,EP,574583,A1,EP0574583A1,"[WO9312941A1, EP0439049A1, EP0121935A2, US4599...","[1239749, 11772668, 13228411, 12721521, 263571...",1842
1,3411930.0,301922022.0,1992-04-15,EP,N,EP,594850,A1,EP0594850A1,"[EP0555928A1, EP0524158A1, EP0331653A2, EP0211...","[11361954, 11349746, 11150663, 9322120, 152369...",1899
2,3460833.0,290372405.0,2000-08-15,EP,N,EP,1180562,A1,EP1180562A1,"[EP0676509A1, EP0383740A1, EP0327465A1, WO9009...","[9462223, 26977734, 9362924, 26977734, 8206602...",1921
3,3460835.0,290339286.0,2000-08-31,EP,N,EP,1183973,A1,EP1183973A1,"[EP0331225A1, EP0341176A1, EP0457068A2, EP0627...","[19851828, 9364981, 24051253, 24051253, 946569...",1906
4,3460912.0,287876166.0,2001-11-15,EP,N,EP,1312399,A1,EP1312399A1,"[US5779379A, EP1068947A1, WO8808962A1, WO98234...","[25249552, 8243487, 27489383, 26707326, 172770...",1745


In [29]:
data_final2[data_final2['num_unique_familyIDs']<1000]

Unnamed: 0,docdb_family_id,pat_publn_id,earliest_filing_date,appln_auth,granted,publn_auth,publn_nr,publn_kind,key,similar_patents,similar_patents_familyIDs,num_unique_familyIDs
317,3492171.0,310993507.0,1979-01-24,EP,Y,EP,13978,A1,EP0013978A1,"[EP0010763A1, US3744186A, US3908312A, US365625...","[6053937, 22225039, 9108277, 5727525, 13242513...",696
419,3496892.0,310576331.0,1980-05-05,EP,Y,EP,40189,A1,EP0040189A1,"[US3744159A, US4232459A, US4012854A, US3758964...","[11903362, 11198666, 24641423, 27304570, 81864...",322
507,3501543.0,311495411.0,1978-02-08,EP,Y,EP,3715,A1,EP0003715A1,"[US3900966A, US3983669A, US4074538A, US3919821...","[8358862, 27159397, 25768687, 5851936, 2239564...",475
654,3509044.0,315990994.0,1982-03-29,EP,N,EP,90792,A2,EP0090792A2,"[EP0089850A1, EP0044880A1, EP0007819A1, EP0071...","[10529126, 8186735, 9210597, 8188143, 6126033,...",607
780,3515298.0,310056793.0,1981-04-01,EP,N,EP,62028,A2,EP0062028A2,"[EP0042711A1, EP0046220A1, EP0026756A1, EP0043...","[22581068, 8513685, 3586184, 27420904, 8513901...",998
...,...,...,...,...,...,...,...,...,...,...,...,...
121341,46754239.0,379462562.0,2011-06-30,EP,N,EP,2541515,A1,EP2541515A1,"[US2010227677A1, US8038528B2, WO2007106659A3, ...","[38510144, 35428275, 38510144, 35428275, 35428...",998
121888,47008324.0,405189039.0,2011-09-21,EP,Y,EP,2578276,A1,EP2578276A1,"[EP2578276A1, WO0013874A1, US6207095B1, US2008...","[47008324, 22517910, 22517910, 40087229, 25341...",994
121912,47010435.0,406580211.0,2011-11-17,EP,N,EP,2594733,A1,EP2594733A1,"[US7603265B2, WO2008103986B1, WO2008103986A1, ...","[34610791, 39477549, 39477549, 46383461, 34610...",992
122617,47501540.0,408781793.0,2011-12-22,EP,N,EP,2626507,A1,EP2626507A1,"[EP2626507A1, US2012119744A1, WO2012067905A2, ...","[47501540, 45048243, 45048243, 45048243, 45096...",973


In [30]:
len(data_final2[data_final2['num_unique_familyIDs']<1000])

2253