## Annotation ingestion notebook

Given a pair of strings representing, respectively, a well-formatted list of base companies and a list
of annotated peers, this notebook parses them into a machine-readable list of dictionaries that can be used for evaluation.

In [1]:
from collections import defaultdict
from elasticsearch import Elasticsearch, NotFoundError
from elasticsearch_dsl import Search
import numpy as np
import ssl
from elasticsearch.connection import create_ssl_context
from elasticsearch_dsl import Q
from graphene import ObjectType, String, Int, ID, Float, List, Schema
import urllib3

# Silence the warning urllib3 emits for unverified HTTPS requests.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Connection settings for the Elasticsearch cluster that holds the company index.
HOST = "host.docker.internal"
PORT = "9200"
SSL = False
INDEX = "company"

# With SSL enabled, use a context that skips hostname and certificate
# verification; without SSL no context is passed at all.
ssl_context = None
if SSL:
    ssl_context = create_ssl_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

client = Elasticsearch(
    hosts=[dict(host=HOST, port=PORT)],
    indices=[INDEX],
    scheme=("https" if SSL else "http"),
    ssl_context=ssl_context,
)

In [2]:
# Lower-case region name -> currency code associated with that region.
region_to_currency = dict([
    ("europe", "EUR"),
    ("asia pacific", "MYR"),
])

In [3]:
import os

import requests

# Request headers for the data-services API. The key is read from the
# ONAI_API_KEY environment variable when it is set; the original placeholder
# string is kept as the fallback so existing setups behave as before.
headers = {
  "X-API-Key": os.environ.get("ONAI_API_KEY", "ONAI_API_KEY"),
  "X-Stack-Host": "Development",
  "X-Request-Id": "-1",
  "X-Stack-User": "Anonymous User"
}

# GraphQL query asking for the MYR->EUR and USD->EUR conversion rates, each
# keyed with "2019-11-30" (presumably the spot date — confirm against the API schema).
get_conversion_rates_query = '''
query{
  currencyConversionRates(keys:
    [
      {sourceCurrency: "MYR",
        targetCurrency:"EUR",
        spotRate: "2019-11-30"
      },
      {sourceCurrency: "USD",
        targetCurrency:"EUR",
        spotRate: "2019-11-30"
      },
    ]
  )
  {
    sourceCurrency{
      code
    }
    rate
  }
}
'''

# A timeout keeps the cell from hanging forever on an unresponsive service,
# and raise_for_status turns HTTP failures into an explicit error instead of
# a confusing KeyError/TypeError further down.
response = requests.post(
    'https://data-services.onai.cloud/api/',
    json={'query': get_conversion_rates_query},
    headers=headers,
    timeout=30,
)
response.raise_for_status()
payload = response.json()

# GraphQL reports query-level failures in an "errors" member of the body.
if payload.get("errors"):
    raise RuntimeError(f"currencyConversionRates query failed: {payload['errors']}")

res = payload['data']['currencyConversionRates']

# Source currency code -> rate, exactly as returned by the service.
conversion_rates_to_eur = {
    el["sourceCurrency"]["code"]: el["rate"]
    for el in res
}

In [4]:
# Raw input: one base company per line, with tab-separated columns (name,
# business description, region, sector, revenue) and a trailing tab per row.
# The literal begins and ends with a newline, so splitting it on "\n" yields
# an empty first element and an empty last element.
# NOTE(review): the company name on the last row is wrapped in literal double
# quotes, unlike the other rows — confirm whether that is intended.
base_companies_str = '''
Jerash Holdings (US), Inc.	Jerash Holdings (US), Inc., through its subsidiaries, manufactures and exports customized and ready-made sports and outerwear. The company offers jackets, polo shirts, crew neck shirts, pants, and shorts made from knitted fabric. It serves various brand-name retailers in the United States, Jordan, and internationally. The company was founded in 2016 and is based in Rochester, New York.	United States and Canada	Consumer Staples	USD 85mn	
Sequential Brands Group, Inc	Sequential Brands Group, Inc. owns a portfolio of consumer brands in the home, active, and fashion categories in the United States and internationally. It offers products in the apparel, footwear, eyewear, fashion accessories, home goods, food, wine, and media related assets, such as magazines, books, and other print and digital content. The company operates under the Martha Stewart, Jessica Simpson, AND1, Avia, GAIAM, Joe’s, Ellen Tracy, Emeril Lagasse, William Rast, Heelys, Caribbean Joe, DVS, The Franklin Mint, Linens N Things, SPRI, and Nevados brands. The company licenses its brands through various distribution channels to retailers, wholesalers, and distributors. Sequential Brands Group, Inc. was incorporated in 1982 and is headquartered in New York, New York.	United States and Canada	Consumer Staples	USD 170mn	
Crown Crafts, Inc.	Crown Crafts, Inc., through its subsidiaries, operates in the consumer products industry in the United States and internationally. It provides infant, toddler, and juvenile products, including infant and toddler beddings; blankets and swaddle blankets; nursery and toddler accessories; room décors; reusable and disposable bibs; burp cloths; hooded bath towels and washcloths; reusable and disposable placemats, and floor mats; disposable toilet seat covers and changing mats; developmental toys; feeding and care goods; and other infant, toddler, and juvenile soft goods. The company sells its products primarily to mass merchants, mid-tier retailers, juvenile specialty stores, value channel stores, grocery and drug stores, restaurants, Internet accounts, and wholesale clubs through a network of sales force and independent commissioned sales representatives. Crown Crafts, Inc. was founded in 1957 and is headquartered in Gonzales, Louisiana.	United States and Canada	Consumer Staples	USD 76mn	
Summer Infant, Inc.	Summer Infant, Inc., together with its subsidiaries, designs, markets, and distributes branded juvenile health, safety, and wellness products primarily worldwide. It offers audio and video monitors; safety products, including gates, bedrails, baby proofing products, potties, bath products, positioners, and infant health products; nursery products, such as specialty blankets, sleep aides and soothers, and travel accessories; and baby gear products consisting of strollers, bassinets, high chairs, and playards under the Summer, SwaddleMe, and born free brand names. The company sells its products directly to retailers through own direct sales force and a network of independent manufacturers' representatives and distributors; and through partner's Websites and its summerinfant.com Website, as well as indirectly through distributors, representatives, and international retail customers. Summer Infant, Inc. was founded in 1985 and is headquartered in Woonsocket, Rhode Island.	United States and Canada	Consumer Staples	USD 174mn	
"Tandy Leather Factory, Inc."	Tandy Leather Factory, Inc. operates as a retailer and wholesale distributor of a range of leather and leathercraft related items in North America and internationally. The company offers leather, quality tools, hardware, small machines, accessories, liquids, lace, kits, open workbenches, and teaching materials. It also manufactures leather lace and do-it-yourself kits. The company sells its products through company-owned stores; and orders generated from its Website, tandyleather.com. As of March 6, 2019, the company had 115 North American stores located in 42 states of the United States and 7 Canadian provinces; and 2 stores located in the United Kingdom and Spain. It serves individual retail customers; and wholesale, manufacturer, and institutional groups, such as horse and tack shops, Western wear, crafters, upholsterers, cobblers, auto repair, education, hospitals, prisons, and other businesses that use its products as raw materials to produce goods for resale. The company was formerly known as The Leather Factory, Inc. and changed its name to Tandy Leather Factory, Inc. in 2005. Tandy Leather Factory, Inc. was founded in 1980 and is headquartered in Fort Worth, Texas.	United States and Canada	Consumer Staples	USD 80mn	
'''

In [5]:
# Column keys derived from the tab-separated header: lower-cased, with every
# space replaced by an underscore (e.g. "company name" -> "company_name").
fields = [
    el.lower().replace(" ", "_")
    for el in (
        "company name\tBusiness Description\tRegion\tSector / or  SIC Codes\tRevenue"
        .strip("\t")
        .split("\t")
    )
]

# One record per input row: {"annotations": {}, "base_company": {...}}.
annotated_companies = []
for line in base_companies_str.split("\n"):
    # Skip blank lines. The input literal starts and ends with a newline, and
    # an empty line would otherwise be stored as a company with an empty name.
    if not line.strip():
        continue

    base_company = {}
    for i, el in enumerate(line.strip("\t").split("\t")):
        if el.lower() == "not found":
            # Missing values are left out of the record entirely.
            continue
        if fields[i] == "revenue":
            # Revenue cells look like "USD 85mn": a currency code, then an
            # amount whose trailing two-character unit suffix is dropped.
            currency, amount = el.split()
            value = float(amount[:-2])
            base_company["currency"] = currency
            base_company["revenue_range_pretty"] = f"{currency} {value/10}mn-{value*10}mn"
            continue
        if fields[i] == "converted_to_eur_mn":
            # Only reachable when the header contains a "converted to eur mn"
            # column; the header above has none, so this is currently inert.
            # Bounds are a tenth and ten times the amount, assuming the cell
            # is expressed in millions — TODO confirm.
            base_company["revenue_range"] = {
                "min": float(el)*1e5,
                "max": float(el)*1e7
            }
            continue
        base_company[fields[i]] = el

    annotated_companies.append({"annotations": {}, "base_company": base_company})

In [6]:
def search_query(base_name, size=10):
    """Search the company index by name and return the matching documents.

    The request combines three ``multi_match`` clauses over the ``name`` and
    ``name.cleaned`` fields inside a ``bool``/``should``: one with fuzziness
    "2" and no boost, one with fuzziness "1" and boost 2, and one without
    fuzziness and boost 4.

    Args:
        base_name: Company name to search for.
        size: Maximum number of hits requested from the index.

    Returns:
        A list with the ``_source`` document of every hit, in the order the
        search response lists them.
    """
    def _name_clause(fuzziness=None, boost=None):
        # Keys are inserted in the same order as a hand-written clause would
        # list them, so the serialized request body stays unchanged.
        clause = {"query": base_name}
        if fuzziness is not None:
            clause["fuzziness"] = fuzziness
            clause["prefix_length"] = 1
        clause["fields"] = ["name", "name.cleaned"]
        clause["minimum_should_match"] = "1"
        clause["type"] = "most_fields"
        if boost is not None:
            clause["boost"] = boost
        return {"multi_match": clause}

    request_body = {
        "_source": True,
        "from": 0,
        "size": size,
        "query": {
            "bool": {
                "should": [
                    _name_clause(fuzziness="2"),
                    _name_clause(fuzziness="1", boost=2),
                    _name_clause(boost=4),
                ]
            }
        },
    }

    response = client.search(index=INDEX, body=request_body)

    sources = []
    for hit in response["hits"]["hits"]:
        sources.append(hit["_source"])
    return sources

In [7]:
# Integer label codes (presumably for peer annotations — TODO confirm usage).
Positive = 1
Neutral = 0
Negative = -1

In [8]:
# S3 prefix used to build the path of the pickle written by the last cell.
model_path = "s3://oaknorth-ml-dev-eu-west-1/andrei/peers/"

In [9]:
# Spot check: names of the hits returned for a sample company name.
[company["name"] for company in search_query("Electronic4you GmbH")]

['electronic4you GmbH',
 'Electronic4you Gmbh',
 'RKM GmbH Personaldienstleistungen',
 'BANKPOWER GmbH Personaldienstleistungen',
 'Teamkompetent Gmbh Personaldienstleistungen',
 'teamkompetent GmbH Personaldienstleistungen',
 'Conexa Gmbh Präzisionsarmaturen',
 'Genopersonalconsult Gmbh',
 'Donaldson GmbH',
 'Konzentration GmbH']

In [10]:
# Resolve every annotated name to the entity id of its top search hit.
for annotated_set in annotated_companies:
    for base_name, metadata in annotated_set["annotations"].items():
        # Query once and reuse the hits for both the name check and the id;
        # two separate queries cost an extra round trip and could disagree.
        hits = search_query(base_name)
        if not hits:
            # No results at all: report it and leave the id empty rather than
            # aborting the whole run with an IndexError.
            print(f"Failed to find {base_name}, no search results.")
            metadata["entity_id"] = None
            print("---------")
            continue

        top_hit = hits[0]
        found_name = top_hit["name"]
        if found_name != base_name:
            print(f"Failed to find {base_name}, found {found_name} instead.")
        # The top hit's id is stored even when its name differs; the message
        # above flags those cases for manual review.
        metadata["entity_id"] = top_hit["entity_id"]
        print("---------")

In [11]:
import smart_open
import pickle

# Persist the parsed records as a pickle under the configured model path.
output_uri = f"{model_path}annotated_peers_batch2_5s.pkl"
with smart_open.open(output_uri, "wb") as sink:
    pickle.dump(annotated_companies, sink)