In [5]:
import json
import time

import requests
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Use tensorflow 1 behavior to match the Universal Sentence Encoder
# examples (https://tfhub.dev/google/universal-sentence-encoder/2).
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub

In [1]:
# Load the ontology graphs and turn them into rows that the later cells
# bulk-load into Elasticsearch.
from source.ontology_parsing.data_loading import get_all_concept_file_paths, get_graphs_from_files
from config import ONTOLOGY_CORE_DIR

# Read the ontology: collect the concept file paths under ONTOLOGY_CORE_DIR
# and load them as graphs.
files = get_all_concept_file_paths(ONTOLOGY_CORE_DIR)
graphs = get_graphs_from_files(files)

# Predicate URI -> index column name. To be defined in config.
# NOTE: this hand-written mapping is only shown for illustration; it is
# rebound below by the mapping derived from the ontology.
pred_uri_to_idx_colname = {
    'http://www.w3.org/2004/02/skos/core#prefLabel': 'label',
    'http://www.w3.org/2004/02/skos/core#closeMatch': 'closeMatch',
    'http://www.w3.org/2004/02/skos/core#related': 'related',
    'http://www.w3.org/2004/02/skos/core#broader': 'broader'
}
pred_uri_to_idx_colname


from source.ontology_parsing.graph_utils import get_uri_to_colname_dict_from_ontology

# Alternatively, derive the mapping from the ontology automatically.
# This replaces the hand-written dict above.
pred_uri_to_idx_colname = get_uri_to_colname_dict_from_ontology(graphs)
pred_uri_to_idx_colname

from source.es_index.IndexBaseline import IndexBaseline

# Build the index rows from the mapping and the graphs.
index_builder = IndexBaseline(pred_uri_to_idx_colname, graphs, include_concept_type=True)

# `rows` is reused by every bulk_load call further down.
# Presumably a list of dicts (bulk_load sets keys on each row) - TODO confirm.
rows = index_builder.build_rows()

# TEXT ANALYZER PROPERTY

In [23]:
# Index definition (JSON) for the analyzer variant: 2 shards, 1 replica,
# dynamic mapping and _source enabled. `label`, `related` and `broader` are
# text fields using the "english" analyzer; `name`, `type` and `closeMatch`
# are text fields with no analyzer specified.
template = """
{
    "settings": {
    "number_of_shards": 2,
    "number_of_replicas": 1
    },
    "mappings": {
        "dynamic": "true",
        "_source": {
        "enabled": "true"
        },
        "properties": {
            "name": {
                "type": "text"
            },
            "label": {
                "type": "text",
                "analyzer":"english"
            },
            "type": {
                "type": "text"
            },
            "closeMatch": {
                "type": "text"
            },
            "related": {
                "type": "text",
                "analyzer":"english"
            },
            "broader": {
                "type": "text",
                "analyzer":"english"
            }
        }
    }
}
"""

In [24]:
# Parse the current JSON `template` and create the `ontology_analyzer`
# index with it; the last expression shows Elasticsearch's JSON reply.
query = json.loads(template)
response = requests.put('http://localhost:9200/ontology_analyzer', json=query)
response.json()

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'ontology_analyzer'}

In [30]:
def bulk_load(rows, client, index_name):
    """Index every row into ``index_name`` through the ``bulk`` helper.

    Args:
        rows: Iterable of mapping documents (one per row to index).
        client: Client object handed to ``bulk`` unchanged.
        index_name: Target index, written to each action's ``_index``.

    Returns:
        None.

    Each row is shallow-copied before the bulk metadata (``_op_type`` and
    ``_index``) is added, so the caller's rows are not modified and the same
    rows can be loaded into several indices.
    """
    # Build new action dicts rather than writing into the caller's rows;
    # the metadata keys come last so they override same-named row keys.
    actions = [
        {**row, "_op_type": "index", "_index": index_name}
        for row in rows
    ]
    bulk(client, actions)

In [28]:
def connect_elasticsearch(host='localhost', port=9200):
    """Create an Elasticsearch client and report whether it answers a ping.

    Args:
        host: Host name of the node to connect to (default ``'localhost'``).
        port: Port of the node to connect to (default ``9200``).

    Returns:
        The client object. It is returned whether or not the ping
        succeeded; the ping result is only printed.
    """
    _es = Elasticsearch([{'host': host, 'port': port}])
    if _es.ping():
        print('Yay Connected')
    else:
        print('Awww it could not connect!')
    return _es

In [29]:
client = connect_elasticsearch()
# bulk_load needs the target index name; load into the `ontology_analyzer`
# index created for this section.
bulk_load(rows, client, 'ontology_analyzer')

Yay Connected


# KEYWORD TYPE

In [31]:
# Index definition (JSON) for the keyword variant: `label`, `related` and
# `broader` are mapped as keyword fields; `name`, `type` and `closeMatch`
# stay text. No explicit settings are given.
template = """
{
    "mappings": {
        "properties": {
            "name": {
                "type": "text"
            },
            "label": {
                "type": "keyword"
            },
            "type": {
                "type": "text"
            },
            "closeMatch": {
                "type": "text"
            },
            "related": {
                "type": "keyword"
            },
            "broader": {
                "type": "keyword"
            }
        }
    }
}
"""

In [32]:
# Parse the current JSON `template` and create the `ontology_keyword`
# index with it; the last expression shows Elasticsearch's JSON reply.
query = json.loads(template)
response = requests.put('http://localhost:9200/ontology_keyword', json=query)
response.json()

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'ontology_keyword'}

In [33]:
# Open a client and bulk-index the ontology rows into `ontology_keyword`.
client = connect_elasticsearch()
bulk_load(rows=rows, client=client, index_name='ontology_keyword')

Yay Connected


# SIMILARITY

In [None]:
# Index definition (JSON) for the custom-similarity variant: declares a DFR
# similarity named `my_similarity` in the index settings and applies it to
# the `label`, `related` and `broader` keyword fields.
# The comma between the "settings" and "mappings" objects is required;
# without it json.loads(template) raises a JSONDecodeError.
template = """
{
    "settings": {
        "index": {
        "similarity": {
            "my_similarity": {
                "type": "DFR",
                "basic_model": "g",
                "after_effect": "l",
                "normalization": "h2",
                "normalization.h2.c": "3.0"
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "name": {
                "type": "text"
            },
            "label": {
                "type": "keyword",
                "similarity" : "my_similarity"
            },
            "type": {
                "type": "text"
            },
            "closeMatch": {
                "type": "text"
            },
            "related": {
                "type": "keyword",
                "similarity" : "my_similarity"
            },
            "broader": {
                "type": "keyword",
                "similarity" : "my_similarity"
            }
        }
    }
}
"""

In [34]:
# Parse the current JSON `template` and create the `ontology_similarity`
# index with it; the last expression shows Elasticsearch's JSON reply.
query = json.loads(template)
response = requests.put('http://localhost:9200/ontology_similarity', json=query)
response.json()

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'ontology_similarity'}

In [35]:
# Open a client and bulk-index the ontology rows into `ontology_similarity`.
client = connect_elasticsearch()
bulk_load(rows=rows, client=client, index_name='ontology_similarity')

Yay Connected


# SIMILARITY TFIDF

In [36]:
# Index definition (JSON) for the scripted TF-IDF variant: one shard and a
# similarity of type "scripted" named `scripted_tfidf`, whose script returns
# query.boost * tf * idf * norm. It is applied to the `label`, `related`
# and `broader` keyword fields; `name`, `type` and `closeMatch` are text.
template = """
{
  "settings": {
    "number_of_shards": 1,
    "similarity": {
      "scripted_tfidf": {
        "type": "scripted",
        "script": {
          "source": "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;"
        }
      }
    }
  },
  "mappings": {
        "properties": {
            "name": {
                "type": "text"
            },
            "label": {
                "type": "keyword",
                "similarity" : "scripted_tfidf"
            },
            "type": {
                "type": "text"
            },
            "closeMatch": {
                "type": "text"
            },
            "related": {
                "type": "keyword",
                "similarity" : "scripted_tfidf"
            },
            "broader": {
                "type": "keyword",
                "similarity" : "scripted_tfidf"
            }
        }
    }
}
"""

In [37]:
# Parse the current JSON `template` and create the
# `ontology_similarity_tfidf` index with it; the last expression shows
# Elasticsearch's JSON reply.
query = json.loads(template)
response = requests.put('http://localhost:9200/ontology_similarity_tfidf', json=query)
response.json()

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'ontology_similarity_tfidf'}

In [38]:
# Open a client and bulk-index the ontology rows into
# `ontology_similarity_tfidf`.
client = connect_elasticsearch()
bulk_load(rows=rows, client=client, index_name='ontology_similarity_tfidf')

Yay Connected
