In [1]:
from utils import collection_iterator
from elasticsearch import Elasticsearch, helpers

import json
import os
import gc
import sys
import pickle

# Connection and corpus settings.
# Each value can be overridden through an environment variable so the notebook
# can be pointed at another cluster, index or corpus location without editing
# code; when the variable is unset the original hard-coded value is used.
es_host = os.environ.get("TREC_PM_ES_HOST", "http://193.136.175.98:8125")
index_name = os.environ.get("TREC_PM_INDEX_NAME", "trec-pm-2020-ncbi")
zipped_collection = os.environ.get(
    "TREC_PM_COLLECTION", "/backup/TREC-PM/Corpus/collection-json.tar.gz"
)

# Single client instance shared by every cell below.
es = Elasticsearch([es_host])

In [2]:
# Drop any previous copy of the index so the cell can be re-run from scratch;
# 400/404 responses are passed in `ignore` so a missing index is not an error.
es.indices.delete(index=index_name, ignore=[400, 404])


def _synonym_filter(synonyms_path):
    """Build a ``synonym`` token-filter definition that reads ``synonyms_path``."""
    return {"type": "synonym", "synonyms_path": synonyms_path}


def _gene_analyzer(synonym_filter_name):
    """Build the English analysis chain with one synonym filter plugged in.

    All gene analyzers share the same tokenizer and filter order; the only
    difference between them is which synonym filter sits after ``lowercase``.
    """
    return {
        "tokenizer": "standard",
        "filter": [
            "english_possessive_stemmer",
            "lowercase",
            synonym_filter_name,
            "english_stop",
            "english_keywords",
            "english_stemmer",
        ],
    }


# Token filters: the standard English pieces plus one synonym filter per
# gene-synonym file.
_token_filters = {
    "english_stop": {"type": "stop", "stopwords": "_english_"},
    "english_keywords": {"type": "keyword_marker", "keywords": []},
    "english_stemmer": {"type": "stemmer", "language": "english"},
    "english_possessive_stemmer": {
        "type": "stemmer",
        "language": "possessive_english",
    },
    "synonyms_all": _synonym_filter("analysis/gene_synonyms.txt"),
    "synonyms_symbol": _synonym_filter("analysis/gene_synonyms_symbol.txt"),
    "synonyms_NCBI": _synonym_filter("analysis/gene_synonyms_NCBI.txt"),
    "synonyms_complete_symbols": _synonym_filter(
        "analysis/gene_synonyms_complete_symbols.txt"
    ),
    "synonyms_symbol_ortho": _synonym_filter(
        "analysis/gene_synonyms_symbol_ortho.txt"
    ),
}

# Analyzers: one per synonym variant, plus a pattern analyzer that splits on
# ";" and lowercases (used below for the author and keywords fields).
_analyzers = {
    "gene_synonym_all": _gene_analyzer("synonyms_all"),
    "gene_synonym_symbol": _gene_analyzer("synonyms_symbol"),
    "gene_synonym_complete_symbols": _gene_analyzer("synonyms_complete_symbols"),
    "gene_synonym_symbol_ortho": _gene_analyzer("synonyms_symbol_ortho"),
    "gene_synonym_NCBI": _gene_analyzer("synonyms_NCBI"),
    "semicolon_analyzer": {
        "type": "pattern",
        "pattern": ";",
        "lowercase": True,
    },
}

# Field mappings. Dynamic mapping is switched off, so only the fields listed
# here are mapped. Of the gene analyzers, only "gene_synonym_NCBI" is attached
# to a field (the combined title + abstract "text" field).
_mappings = {
    "dynamic": "false",
    "properties": {
        "id": {"type": "keyword"},
        "text": {"analyzer": "gene_synonym_NCBI", "type": "text"},
        "mesh_terms": {"analyzer": "simple", "type": "text"},
        "author": {"analyzer": "semicolon_analyzer", "type": "text"},
        "keywords": {"analyzer": "semicolon_analyzer", "type": "text"},
        "title": {"type": "keyword", "store": "true"},
        "pubdate": {"type": "date", "format": "yyyy-MM||yyyy||yyyy-MM-dd"},
    },
}

# Kept as the last expression of the cell so the server response is displayed.
es.indices.create(
    index=index_name,
    body={
        "settings": {
            "analysis": {
                "filter": _token_filters,
                "analyzer": _analyzers,
            }
        },
        "mappings": _mappings,
    },
    request_timeout=300,
)

{'acknowledged': True,
 'index': 'trec-pm-2020-ncbi',
 'shards_acknowledged': True}

In [3]:
def not_empty_date(x, default="2020-12"):
    """Return ``x`` unless it is the empty string, in which case return ``default``.

    Used to fill in a publication date for articles whose ``pubdate`` is empty.
    Only the exact empty string is replaced; every other value (including
    ``None``) is returned unchanged.
    """
    return default if x == "" else x


# Ids of the articles already yielded by data_to_index_iterator().
# The set lives at module level, so it survives between calls of the generator:
# calling the generator a second time without resetting this set yields nothing
# for ids that were already seen.
unique_pmid = set()


def data_to_index_iterator():
    """Yield one bulk-index action per usable article of ``zipped_collection``.

    The collection is read in batches through ``collection_iterator``. An
    article is skipped when its abstract is the empty string or when its id
    has already been yielded (tracked in the module-level ``unique_pmid``).

    Yields:
        dict: an action carrying ``_index`` plus the document fields ``id``,
        ``text`` (title and abstract joined by a space), ``mesh_terms``,
        ``author``, ``keywords``, ``title`` and ``pubdate`` (empty dates are
        replaced through ``not_empty_date``).

    Side effects:
        Adds every yielded id to ``unique_pmid`` and prints a progress line
        every 10000 documents, followed by a final summary once the
        collection is exhausted.
    """
    index = 0       # documents yielded so far
    skipped = 0     # articles dropped because the abstract is empty
    duplicated = 0  # articles dropped because the id was already yielded

    # Batch read the collection to memory
    for article_subset in collection_iterator(zipped_collection):
        for article in article_subset:

            # skip empty abstracts
            if article["abstract"] == "":
                skipped += 1
                continue

            # skip ids that were already emitted
            if article["id"] in unique_pmid:
                duplicated += 1
                continue

            unique_pmid.add(article["id"])

            yield {
                "_index": index_name,
                "id": article["id"],
                "text": article["title"] + " " + article["abstract"],
                "mesh_terms": article["mesh_terms"],
                "author": article["author"],
                "keywords": article["keywords"],
                "title": article["title"],
                "pubdate": not_empty_date(article["pubdate"]),
            }

            # Counted after the yield, i.e. once the consumer asks for the
            # next document.
            index += 1
            if not index % 10000:
                print("{} documents indexed".format(index), end="\r")

    # Final summary: the periodic progress line only shows multiples of 10000
    # and never mentions how many articles were dropped.
    print(
        "{} documents indexed ({} skipped for empty abstract, {} skipped as duplicates)".format(
            index, skipped, duplicated
        )
    )

        

# Stream the generated documents to Elasticsearch in chunks of 500 actions.
# Left as a bare expression so the value returned by helpers.bulk is displayed
# as the cell output.
helpers.bulk(
    es,
    data_to_index_iterator(),
    chunk_size=500,
    request_timeout=300,
)

[CORPORA] Openning tar file /backup/TREC-PM/Corpus/collection-json.tar.gz
[CORPORA] Openning tar file tmp/tmpvcyemaof/TREC-PM-baseline-00000000-to-06000000
[CORPORA] Openning tar file tmp/tmpvcyemaof/TREC-PM-baseline-06000000-to-12000000
[CORPORA] Openning tar file tmp/tmpvcyemaof/TREC-PM-baseline-12000000-to-18000000
[CORPORA] Openning tar file tmp/tmpvcyemaof/TREC-PM-baseline-24000000-to-29138919
18800000 documents indexed

(18806413, [])

''

In [12]:
# Show the field names of the first article.
# NOTE(review): `articles` is not assigned in any cell shown before this one,
# and the execution count jumps from 3 to 12, so this cell relies on kernel
# state left by a cell that is no longer visible. Presumably it was one batch
# produced by collection_iterator(zipped_collection) — confirm and restore the
# defining cell so the notebook survives Restart & Run All.
articles[0].keys()

dict_keys(['title', 'author', 'id', 'affiliation', 'keywords', 'abstract', 'pubdate', 'mesh_terms'])

In [16]:
# Keep only the articles whose "keywords" field is not the empty string.
[article for article in articles if article["keywords"] != ""]

[{'abstract': 'The etiologic and pathophysiologic findings described in the first part of this paper have important consequences: The recognition of the specific etiology of diarrhea requires new laboratory methods: most of these, however, are technically easy to perform and do not require a large laboratory. A long-ranging consequence of this changed concept is a well-founded modification of therapy. The most important discovery was, that in a well balanced glucose electrolyte solution sodium and glucose enhance their absorption mutually and increase the absorption of water by solvent drag. Since in most acute diarrheas the mechanisms of absorption of glucose and electrolytes are retained this mechanism can be utilized for fast oral rehydration and reinstitution of normal intestinal homeostasis. Prompt institution of a diet consisting of the previously mentioned glucose-electrolyte solution usually prevents severe dehydration and the need for stationary treatment. The elimination of l