Skip to content

Commit

Permalink
Update P2N-Indexer.py
Browse files Browse the repository at this point in the history
Cleaning iramuteq stuff (star line) before indexing
  • Loading branch information
Patent2net committed Jul 31, 2021
1 parent 49b6cd3 commit 71e1442
Showing 1 changed file with 20 additions and 2 deletions.
22 changes: 20 additions & 2 deletions Patent2Net/P2N-Indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ def GenereListeFichiers(rep):
except:
lstDesc, lstDesc2 = [], []
cpt = 0
def iramCleaner(texte):
    """Remove IRaMuTeQ "star lines" from a text before it is indexed.

    A star line is a line that begins with ``****`` (optionally preceded by
    whitespace or a byte-order mark).  Only those lines are removed; every
    other line is kept, in order, and lines are re-joined with ``"\\n"``.

    The previous implementation dropped the *first* line whenever ``****``
    occurred anywhere in the text.  That lost real content when the star
    line was not the first line, and calling the cleaner twice on the same
    field could remove a genuine line of text.  This version is idempotent.

    Args:
        texte: the text to clean.  Non-string values are returned unchanged
            so the function can be applied to heterogeneous document fields.

    Returns:
        The text without its star lines, or ``texte`` itself when it is not
        a string or contains no ``****`` marker.
    """
    if not isinstance(texte, str) or '****' not in texte:
        return texte
    kept = [
        line
        for line in texte.split("\n")
        # Tolerate indentation or a BOM in front of the marker.
        if not line.lstrip("\ufeff \t").startswith('****')
    ]
    return "\n".join(kept)
for bre in LstBrevet: # get patent list from request file
cpt += 1
if bre['label'] in lstAbs or bre[
Expand All @@ -121,6 +126,7 @@ def GenereListeFichiers(rep):
abstract = ''
else:
abstract = ''

if bre['label'] in lstClaims or bre['label'] in lstClaims2: # if Claims file exists
# fic = [truc for truc in os.listdir(Rep+'//Claims') if truc.split('-')[1].replace('.txt', '') == bre ['label']]
fic = []
Expand Down Expand Up @@ -172,7 +178,16 @@ def GenereListeFichiers(rep):
Description = ''
else:
Description = ''

# Strip the IRaMuTeQ header ("star") line from each text field before the
# document is built.  The work is delegated to iramCleaner (defined before
# the patent loop) instead of repeating the split/join logic three times.
# The membership guard leaves fields without the marker untouched.
if '****' in abstract:
    abstract = iramCleaner(abstract)
if '****' in Description:
    Description = iramCleaner(Description)
if '****' in Claims:
    Claims = iramCleaner(Claims)
doc = { # indexing a doc field:content
# TODO: additional fields could provide other views, e.g. citation equivalents or CIB counts.
# It is not yet clear how to do this.
Expand Down Expand Up @@ -343,7 +358,10 @@ def GenereListeFichiers(rep):
cpt[indexLang.split('-')[0]] += 1
else:
cpt[indexLang.split('-')[0]] = 1


# Final safety pass: strip IRaMuTeQ star lines from every textual field of
# the document.  Only str values are inspected: a non-iterable value (e.g.
# an int) would make the `in` test raise TypeError, and a list value would
# fail inside iramCleaner, which expects text.
for cle, valeur in doc.items():
    if isinstance(valeur, str) and "****" in valeur:
        # Re-assigning an existing key does not change the dict size, so
        # it is safe while iterating over items().
        doc[cle] = iramCleaner(valeur)
# Index the cleaned document; the id is the running per-language counter.
# NOTE(review): `es` is presumably an Elasticsearch client created earlier
# in the file - confirm.
res = es.index(index=indexLang.lower(), id=cpt[indexLang.split('-')[0]], body=doc)

for lang in cpt.keys():
Expand Down

0 comments on commit 71e1442

Please sign in to comment.