In [7]:
from collections import defaultdict
from elasticsearch import Elasticsearch, NotFoundError
from elasticsearch_dsl import Search
import numpy as np
import ssl
from elasticsearch.connection import create_ssl_context
from elasticsearch_dsl import Q
from graphene import ObjectType, String, Int, ID, Float, List, Schema
import urllib3

# Suppress urllib3's InsecureRequestWarning; certificate verification is
# switched off below whenever SSL is enabled.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Connection settings for the Elasticsearch instance queried by this notebook.
HOST = "host.docker.internal"
PORT = "9200"
SSL = False
INDEX = "company"

# Default to no SSL context; only build one when SSL is requested.
ssl_context = None
if SSL:
    ssl_context = create_ssl_context()
    # Hostname checking and certificate verification are both disabled.
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

client = Elasticsearch(
    hosts=[{"host": HOST, "port": PORT}],
    indices=[INDEX],
    scheme="https" if SSL else "http",
    ssl_context=ssl_context,
)

number of public company labels: 215


In [36]:
def search_query(base_name, size=10):
    """Search the company index for names resembling ``base_name``.

    The request is a ``bool``/``should`` query made of three ``multi_match``
    clauses over the ``name`` and ``name.cleaned`` fields:

    * fuzziness "2", no boost
    * fuzziness "1", boost 2
    * no fuzziness, boost 4

    The query is sent through the module-level ``client`` against ``INDEX``.

    Args:
        base_name: Text to search for.
        size: Maximum number of hits requested from Elasticsearch.

    Returns:
        A list holding the ``_source`` of every returned hit, in the order
        the response lists them.
    """

    def name_clause(fuzziness=None, boost=None):
        # Keys are inserted in one fixed order for every variant; the optional
        # ones (fuzziness/prefix_length, boost) are simply left out when unset.
        match = {"query": base_name}
        if fuzziness is not None:
            match["fuzziness"] = fuzziness
            match["prefix_length"] = 1
        match["fields"] = ["name", "name.cleaned"]
        match["minimum_should_match"] = "1"
        match["type"] = "most_fields"
        if boost is not None:
            match["boost"] = boost
        return {"multi_match": match}

    request_body = {
        "_source": True,
        "from": 0,
        "size": size,
        "query": {
            "bool": {
                "should": [
                    name_clause(fuzziness="2"),
                    name_clause(fuzziness="1", boost=2),
                    name_clause(boost=4),
                ]
            }
        },
    }

    response = client.search(index=INDEX, body=request_body)

    sources = []
    for hit in response["hits"]["hits"]:
        sources.append(hit["_source"])
    return sources

In [22]:
# Label values attached to each annotated peer company.
Positive = 1
Neutral = 0
Negative = -1

In [38]:
# Hand-labelled peer sets. Each entry has:
#   "base_company": name, business description, region, a revenue range
#                   ({"min": ..., "max": ...}) and an industry string;
#   "annotations":  candidate peer name -> {"label": Positive|Neutral|Negative}.
annotated_companies = [
    {"base_company": {
        "name": "Gebrüder Kofler GmbH ML Test",
        "business_description": "Gebrüder Kofler GmbH (henceforth also referred as ‘Kofler’ or ‘Borrower’ in this report) was established in 1965 as a supplier of fresh fruits and vegetables to Tyrol’s catering industry by the second generation of Kofler family. It is currently managed by the third generation of Kofler family and the business is headquartered in Landeck (Austria). It supplies fresh and frozen food products to hotel and catering industry (also known as gastronomy sector) in the Tyrol state of Austria.",
        "region": "Europe",
        "revenue": {"min": 2080000, "max": 208000000},
        "industry": "Food and Beverage Distribution"
    },
    "annotations":{
        "Gebrüder Woerle Ges.m.b.H":{"label": Positive},
        "Gebrüder Haider Bauunternehmung GmbH":{"label": Negative},
        "Cobral Sarl":{"label": Negative},
        "Culinor NV":{"label": Negative},
        "Gastina GmbH":{"label": Negative},
        "Peckham & Rye Ltd.":{"label": Negative},
        "Bonfait B.V.":{"label": Positive},
        "Total Gas & Power Limited":{"label": Negative},
        "Reynolds Catering Supplies Limited":{"label": Positive},
        "Dominioni Punto & Pasta Srl":{"label": Negative},
        }
    },
    {"base_company": {
        "name": "HAZET Bauunternehmung GmbH ML Test",
        "business_description": "HAZET Bauunternehmung GmbH provides construction and restructuring services. The company was founded in 1997 and is based in Vienna, Austria. As of July 15, 2013, HAZET Bauunternehmung GmbH operates as a subsidiary of Bauservice-Fuhs Gesellschaft M.B.H.\\",
        "region": "Europe",
        "revenue": {"min": 9750000, "max": 975000000},
        "industry": "Construction of Residential and Non- Residential Buildings "
        },
    "annotations": {
        "HERZOG BAU GesmbH":{"label": Positive},
        "Kelly Gesellschaft m.b.H":{"label": Negative},
        "TNT Express (Austria) Gesellschaft M.B.H.":{"label": Negative},
        "IBM Osterreich Internationale Bueromaschinen Gesellschaft m.b.H":{"label": Negative},
        "Straka Bau GmbH":{"label": Positive},
        "Madaus Gesellschaft m.b.H.":{"label": Negative},
        "Verkehrsverbund Ost-Region (VOR) Gesellschaft M.B.H.":{"label": Negative},
        "Bank Austria Leasing Ikarus Immobilien Leasing Gesellschaft m.b.h.":{"label": Negative},
        "Unicredit Kfz Leasing Gmbh":{"label": Negative},
        }
    },
    {"base_company": {
        "name": "Holmes Place Wien GmbH ML Test",
        "business_description": "Headquartered in Vienna (Austria), Holmes Place Wien GmbH (“Holmes Place” or “Company”) is part of Holmes Place Group which runs international chain of health clubs and operates 80 clubs in eight countries spread across Europe and Israel. The group has patronage of ~290,000 members. Holmes Place Group is owned by Fisher & Kirsh families. ▪ Holmes Place which started with one club in December 2000, currently operates four luxury health clubs in Vienna, Austria. Clubs are located in Wipplingerstrasse, Hütteldorfer Straße, Wehlistraße & Wagramer Str. Holmes Place clubs in Vienna offer various facilities including sports, pool, fitness, spa, Yoga, aerobics, wellness services and personal training.",
        "region": "Europe",
        "revenue": {"min": 1460000, "max":146000000},
        "industry": "Fitness Centres"
    },
    "annotations": {
        "ELIXIA Austria GmbH":{"label": Positive},
        "Holmes Place Health Clubs Ltd.":{"label": Positive},
        "Holmes Place Health Clubs GmbH":{"label": Positive},
        "Vienna Airport Business Park Immobilienbesitzgesellschaft M.B.H.":{"label": Negative},
        "Unilever BCS Austria GmbH":{"label": Negative},
        "Reckitt Benckiser Austria GmbH":{"label": Negative},
        "Brenntag Austria Holding Gmbh":{"label": Negative},
        "Iglo Austria Holding GmbH":{"label": Negative},
        "Novelis Deutschland Gmbh (Austria)":{"label": Negative},
        "MFC Holding Austria GmbH":{"label": Negative},
        }
    },
    {"base_company": {
        "name": "Innofreight Speditions GmbH ML Test",
        "business_description": "Formed in 2002, Innofreight Group (“IF Group” or “the Group”) is an Austria-based integrated rail logistics systems provider. It manufactures, sells and leases wagons, containers, pallets, and unloading systems. It develops innovative wagons, containers, and unloading systems in cooperation with customers for industries like steel, energy, timber, building material, agriculture, and fluids. It moves a million container loads a year using 12,000 containers, 1,200 InnoWaggons, 150 block trains, 58 forklifts, and 6 unloading stations. The Group is a technology leader as reflected in its innovative products, such as ultra-light, multipurpose container waggon, the InnoWaggon. With its ITECCO1 program, it proposes to achieve leadership in Steel industry material transport and unloading systems.",
        "region": "Europe",
        "revenue": {"min": 7830000, "max":783000000},
        "industry": "Transport / Rail Logistics"
    },
    "annotations": {
        "HLA Rosshafen Terminal GmbH":{"label": Negative},
        "Lormafer S.A.":{"label": Negative},
        "Pegas Container, s.r.o.":{"label": Negative},
        "Titagarh Wagons AFR S.A.":{"label": Negative},
        "CHS Container Handel GmbH":{"label": Negative},
        "Cocomat Holdings Limited":{"label": Negative},
        "Torgovyi Dom Soyuz Spets Sbyt":{"label": Neutral},
        "ILAB Container AB":{"label": Neutral},
        "On Rail Gesellschaft für Eisenbahnausrüstung und Zubehör mbH":{"label": Negative},
        "UCON AG Containersysteme KG":{"label": Neutral},
        }
    },
    {"base_company": {
        "name": "Interseroh Austria GmbH ML Test",
        "business_description": "Interseroh Austria GmbH (“ISA” or “the Company”), incorporated in 1987 and based in Austria, operates waste collection and recycling system (translating to ‘Sammel und Verwertungssysteme in German’ or SVS) for packaging, electrical and electronic equipment and batteries throughout Austria on behalf of its customers. In addition, it also offers consulting solutions in waste management and undertakes waste bin product sales. ISA also acts as the holding company for the Groups non-German operations. ISA is part of the Interseroh group (Parent) which has a strong presence in Germany. Interseroh group is held by Germany based Alba group and China based Techcent group which are large players in waste management.",
        "region": "Europe",
        # Fixed: this used to be the set literal {2390000-239000000}, which
        # evaluates to {-236610000} instead of a min/max range like the
        # other entries.
        "revenue": {"min": 2390000, "max": 239000000},
        "industry": "Waste Management"
    },
    "annotations": {
        "ITERSEROH Jade-Stahl GmbH":{"label": Positive},
        "INTERSEROH Scrap and Metals Trading GmbH":{"label": Neutral},
        "INTERSEROH MAB Ost GmbH":{"label": Positive},
        "Interseroh NRW GmbH":{"label": Positive},
        "ALBA Servicios Verdes, S.L.":{"label": Neutral},
        "INTERSEROH Hansa Finance GmbH":{"label": Negative},
        "ALBA plc & Co. KGaA":{"label": Positive},
        "Hidroplasto Srl":{"label": Negative},
        "Northgate Information Solutions Limited":{"label": Negative},
        "ATON-HT S.A.":{"label": Neutral},
        }
    },
    {"base_company": {
        "name": "ISTAC Promotion GmbH",
        "business_description": "ISTAC Service GmbH (ISG), established in October 2017, is engaged in providing office supplies to banks and financial institutions. ISG is wholly owned by ISTAC Promotion GmbH (IPG). IPG undertakes orders for customized promotional products and gifts. ISG and IPG are together referred to as ISTAC Group (‘Group’ or ‘ISTAC’) and are based out of Pasching, Austria. The Group is a full-service promotional product and office supplies provider offering wide range of products with services from design to order to storage to worldwide delivery. ▪ The Group has a showroom in Pasching and an online web-shop to showcase the products on offer. It also offers its clients customized web-shops showing promotional products to employees, customers and sales partners.",
        "region": "Europe",
        "revenue": {"min": 1000000, "max":51000000},
        "industry": "Promotional Products and Office Supplies"
    },
    "annotations": {
        "ISG Pearce Limited":{"label": Negative},
        "Formula A/S":{"label": Positive},
        "Norwood Promotional Products Europe SLU":{"label": Neutral},
        "Sappi Finland Operations Oy":{"label": Negative},
        "Ocay Sverige I AB":{"label": Positive},
        "Buffetti Group SpA":{"label": Positive},
        "Whitegrove Group Limited":{"label": Positive},
        "Lyreco Finland Oy":{"label": Positive},
        "Dag Aasboe Travel As":{"label": Negative},
        "Media Partners Group B.V.":{"label": Negative},
        }
    },
    {"base_company": {
        "name": "Molin Industrie Inbetriebnahme & Montage GesmbH & Co KG ML Test",
        "business_description": "Molin-Industrie-Inbetriebnahme-Montage-Gesellschaft mbH & Co KG (“Molin”, “Company”) was established in 1983 in Linz, Austria. Ranked among the top 10 HVAC service providers in Austria, it is involved in the business of planning, installation, start-up and servicing & maintenance for building services and industrial plant engineering. Its client list includes market leaders from various sectors like BMW, Shell, BP, Opel, Spar, Aldi etc. ▪ Heard-quartered in Wels, Molin has branch offices in Vienna, Spillern and Hartberg in Austria. Majority of revenue comes from Austria (70%+) while the rest comes from overseas, primarily Germany. It had operations in other countries in Central and Eastern European, however, foreign subsidiaries have been closed in 2017 and 2018 in order to streamline the business. Further closure of subsidiaries in Germany and Romania is planned by 2019. ▪ Molin underwent major restructuring drive during 2017 and 2018. In addition to shutting down of foreign subsidiaries, certain business division and investment from one of the shareholders was merged into Molin.",
        "region": "Europe",
        "revenue": {"min": 5290000, "max":529000000},
        "industry": "Heating, Ventilation and Air Conditioning"
    },
    "annotations": {
        "SIGMA PLUS d.o.o.":{"label": Negative},
        "YIT Austria GmbH":{"label": Positive},
        "Ortner Ges.M.B.H.":{"label": Neutral},
        "Proenergy Contracting Gmbh":{"label": Positive},
        "Twinputki Oy":{"label": Negative},
        "Luzian Bouvier Haustechnik & Fliesen Gmbh":{"label": Positive},
        "KGT Gebäudetechnik GmbH":{"label": Neutral},
        "ART-RASVJETA d.o.o.":{"label": Negative},
        "Kristl, Seibt & Co. Gesellschaft M.B.H.":{"label": Neutral},
        "Daikin Airconditioning Central Europe HandelsGmbH":{"label": Positive},
        }
    }
]

In [45]:
# S3 prefix under which this notebook writes its artefacts (note the trailing slash).
model_path = (
    "s3://oaknorth-ml-dev-eu-west-1"
    "/andrei/peers/"
)

In [39]:
# Resolve every annotated peer name to the entity id of its best search hit
# and store it on the annotation as "entity_id".
for annotated_set in annotated_companies:
    for base_name, metadata in annotated_set["annotations"].items():
        # Query once and reuse the result: the name check and the entity id
        # both come from the same top hit, so a second search is not needed.
        hits = search_query(base_name)
        if not hits:
            # No results at all: report it and record the missing id
            # explicitly instead of failing with an IndexError on hits[0].
            print(f"Failed to find {base_name}, no search results returned.")
            metadata["entity_id"] = None
            print("---------")
            continue

        top_hit = hits[0]
        found_name = top_hit["name"]
        if found_name != base_name:
            # The top hit is still used; this only flags an inexact name match.
            print(f"Failed to find {base_name}, found {found_name} instead.")
        metadata["entity_id"] = top_hit["entity_id"]
        print("---------")

---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
Failed to find TNT Express (Austria) Gesellschaft M.B.H., found TNT Express (Austria) Gesellschaft m.b.H. instead.
---------
---------
---------
---------
---------
---------
---------
---------
---------
Failed to find Holmes Place Health Clubs GmbH, found Holmes Place Health Clubs  GmbH instead.
---------
---------
---------
---------
Failed to find Brenntag Austria Holding Gmbh, found Brenntag Austria Holding GmbH instead.
---------
---------
---------
---------
Failed to find HLA Rosshafen Terminal GmbH, found HHLA Rosshafen Terminal GmbH instead.
---------
---------
Failed to find Pegas Container, s.r.o., found Pegas Container S.R.O. instead.
---------
---------
---------
---------
---------
---------
---------
---------
Failed to find ITERSEROH Jade-Stahl GmbH, found INTERSEROH Jade-Stahl GmbH instead.
---------
---------
---------
---------
---------
---------


In [50]:
import smart_open
import pickle

# Persist the annotated peer sets (now carrying entity ids) as a pickle.
# model_path may already end with "/", so strip it before joining; otherwise
# the object key would contain an empty path segment ("peers//annotated_peers.pkl").
with smart_open.open(f"{model_path.rstrip('/')}/annotated_peers.pkl", "wb") as f:
    pickle.dump(annotated_companies, f)