## Annotation ingestion notebook

Given a pair of strings containing a well-formatted list of base companies and a list
of annotated peers, respectively, this notebook parses them into a machine-readable dictionary that can be used for evaluation.

In [1]:
from collections import defaultdict
from elasticsearch import Elasticsearch, NotFoundError
from elasticsearch_dsl import Search
import numpy as np
import ssl
from elasticsearch.connection import create_ssl_context
from elasticsearch_dsl import Q
from graphene import ObjectType, String, Int, ID, Float, List, Schema
import urllib3

# Silence urllib3's InsecureRequestWarning so it does not flood the cell output.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Connection settings for the Elasticsearch instance queried by this notebook.
HOST = "host.docker.internal"
PORT = "9200"
SSL = False
INDEX = "company"

# An SSL context is only built when SSL is enabled; hostname checking and
# certificate verification are switched off on it.
ssl_context = None
if SSL:
    ssl_context = create_ssl_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

# Shared client used by the search cells below.
client = Elasticsearch(
    hosts=[dict(host=HOST, port=PORT)],
    indices=[INDEX],
    scheme="https" if SSL else "http",
    ssl_context=ssl_context,
)

In [2]:
# Currency code associated with each region; keys are lower-case region names.
region_to_currency = dict(
    [
        ("europe", "EUR"),
        ("asia pacific", "MYR"),
    ]
)

In [3]:
import os

import requests

# Headers sent with the data-services request. The API key is read from the
# ONAI_API_KEY environment variable when it is set; otherwise the original
# placeholder value is sent unchanged.
headers = {
  "X-API-Key": os.environ.get("ONAI_API_KEY", "ONAI_API_KEY"),
  "X-Stack-Host": "Development",
  "X-Request-Id": "-1",
  "X-Stack-User": "Anonymous User"
}

# GraphQL query asking for the MYR->EUR and USD->EUR rates with spotRate
# "2019-11-30".
get_conversion_rates_query = '''
query{
  currencyConversionRates(keys:
    [
      {sourceCurrency: "MYR",
        targetCurrency:"EUR",
        spotRate: "2019-11-30"
      },
      {sourceCurrency: "USD",
        targetCurrency:"EUR",
        spotRate: "2019-11-30"
      },
    ]
  )
  {
    sourceCurrency{
      code
    }
    rate
  }
}
'''

# A timeout keeps the kernel from hanging forever on an unreachable service,
# and raise_for_status() turns HTTP failures into an explicit error instead of
# an opaque KeyError/TypeError further down.
response = requests.post(
    'https://data-services.onai.cloud/api/',
    json={'query': get_conversion_rates_query},
    headers=headers,
    timeout=30,
)
response.raise_for_status()
payload = response.json()
if payload.get('data') is None:
    # No data at all: surface whatever error details the service returned.
    raise RuntimeError(
        f"currencyConversionRates query returned no data: {payload.get('errors')}"
    )

res = payload['data']['currencyConversionRates']

# Map each source currency code to the rate returned for it (target is EUR in
# the query above).
conversion_rates_to_eur = {
    el["sourceCurrency"]["code"]: el["rate"]
    for el in res
}

In [12]:
# Names of the base companies to look up, one per line. The lookup cell splits
# this string on newlines and passes each line to search_query.
# NOTE: some lines carry trailing whitespace and the string ends with a
# newline, so splitting on "\n" yields a final empty entry.
search_companies_str = '''Akatronik GmbH
Alpin Gastronomie GmbH
Backwelt Pilz GMBH
BHK Bau
Biomay AG
Compact Electric GmbH
DBP Group 
Faber GmbH
Felbermayer Fenster und Turen GmbH
Gebrüder Kofler GmbH
GESIG  Gesellschaft für Signalanlagen GmbH
HAZET Bauunternehmung GmbH
Holmes Place Wien GmbH
Innofreight Speditions GmbH
Interseroh Austria GmbH
ISTAC Promotion GmbH
Molin Industrie Inbetriebnahme & Montage GesmbH & Co KG
Mona Naturprodukte GmbH
MTRent Group
Peter Spak GmbH
Rienhoff GmbH
SAEXINGER Gesmbh
Sanatorium Liebhartstal Formanek GmbH
SPS Beteiligungs und Management GmbH
VIVO IT GmbH
WERNA Beteiligungs GmbH
Wiener Privatklinik Betriebs GmbH & Co KG
American Bank Note Company 
Cheer Pack North America, LLC
DN Tanks, Inc
Esler Companies, LLC
F. B. Packing Company, Inc
Feeney Brothers Excavation, LLC 
Forrester Research, Inc.
Rand Whitney Container, LLC
Symmons Industries, Inc.
Leong Hup (Malaysia)
Mine Logistics Sdn Bhd
PKT Logistics
Syarikat Logistik Petikemas SDN BHD
Teo Seng Cap Bhd
Laurentiussen Group
Stantons International
Creswick Pharmacy
Strategem Group
Bendigo United Friendly Society Pharmacies
Business and Wealth partners
Bowcole Pty Ltd
'''

In [5]:
def search_query(base_name, size=10):
    """Search the company index by name and return the matching documents.

    The request combines three ``multi_match`` clauses over the ``name`` and
    ``name.cleaned`` fields inside a ``bool``/``should`` query:

    * fuzziness "2", prefix length 1, no boost;
    * fuzziness "1", prefix length 1, boost 2;
    * no fuzziness, boost 4.

    Args:
        base_name: Company name to search for.
        size: Maximum number of hits requested (default 10).

    Returns:
        A list with the ``_source`` of every hit, in the order returned by
        the search.
    """
    name_fields = ["name", "name.cleaned"]

    def name_clause(fuzziness=None, boost=None):
        # Build one multi_match clause; fuzziness and boost are only included
        # when given, matching the three hand-written variants.
        clause = {"query": base_name}
        if fuzziness is not None:
            clause["fuzziness"] = fuzziness
            clause["prefix_length"] = 1
        clause["fields"] = list(name_fields)
        clause["minimum_should_match"] = "1"
        clause["type"] = "most_fields"
        if boost is not None:
            clause["boost"] = boost
        return {"multi_match": clause}

    request_body = {
        "_source": True,
        "from": 0,
        "size": size,
        "query": {
            "bool": {
                "should": [
                    name_clause(fuzziness="2"),
                    name_clause(fuzziness="1", boost=2),
                    name_clause(boost=4),
                ]
            }
        },
    }

    response = client.search(index=INDEX, body=request_body)
    sources = []
    for hit in response["hits"]["hits"]:
        sources.append(hit["_source"])
    return sources

In [15]:
# Look up every listed company and print the top hit's name together with its
# "primary_sic_node_desc" (or "Empty" when the hit has none).
# splitlines() plus strip() avoids sending the empty entry produced by the
# string's final newline, and removes stray whitespace around the names.
for line in search_companies_str.splitlines():
    line = line.strip()
    if not line:
        continue
    ret = search_query(line)
    if not ret:
        print(f"Failed to find {line}.")
    else:
        print(ret[0]["name"], "\t", ret[0].get("primary_sic_node_desc", "Empty"))
        

RKM GmbH Personaldienstleistungen 	 Employment agencies
Alpin Gastronomie GmbH 	 Eating places
Pilz Schindler Gmbh 	 Fresh fruits and vegetables
Bm Bhk Bau-Gmbh 	 Residential construction
Biomay Ag 	 Empty
Compact Verlag GmbH 	 Periodicals
Dbp 	 Empty
Faber Gmbh 	 Automobiles and other motor vehicles
Felbermayer Fenster Und Turen Erzeugungs-Gmbh 	 Millwork
Gebruder Kofler Gesellschaft M.B.H. 	 Packaged frozen foods
Gesig Gesellschaft Fur Signalanlagen Gesellschaft M.B.H. 	 Communications equipment
HAZET Bauunternehmung GmbH 	 Residential construction
Holmes Place Lifestyle Clubs GmbH 	 Miscellaneous personal services
Innofreight Speditions Gmbh 	 Equipment rental and leasing
Interseroh Austria Gmbh 	 Refuse systems
Competition Partner Promotion Gmbh 	 Advertising agencies
Molin-Industrie-Inbetriebnahme-Montage-Gesellschaft M.B.H. & Co. Kg. 	 Special trade contractors
Mona Naturprodukte GmbH 	 Groceries and related products
Matrent 	 Empty
Peter Mahnke GmbH Industrievulkanisation 	 Tire