In [1]:
from flair.models import SequenceTagger

from flair.data import Sentence

import graphene
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from elasticsearch_dsl import Q, Search
import numpy as np
import pickle
import string
import sys
import timeit
import os
import smart_open

from itertools import islice
from collections import deque, defaultdict, Counter

from nltk import ChartParser
from nltk.parse.generate import generate
from nltk.grammar import CFG, Nonterminal
from nltk.tree import Tree
from semantic.numbers import NumberService

# Host name of the Elasticsearch node; "localhost" is used when the
# ES_INDEX_ADDRESS environment variable is not set.
es_index_address = os.getenv("ES_INDEX_ADDRESS", "localhost")

# Base URL of the node and the name of the index that is queried.
HOST = f"http://{es_index_address}:9200"
INDEX = "company"

# Client used for the Elasticsearch queries in the following cells.
client = Elasticsearch(HOST + "/")

### Crawl documents for keywords
* first get all keywords, lower-case them and collect them into a counter
* remove keywords that have a frequency below `min_kw_frequency`
  or appear in the descriptions of fewer than `min_kw_companies` companies
* create a grammar and pickle it as well as the keywords

In [2]:
# Thresholds for filtering the words collected from company descriptions
# (applied in a later cell).
min_kw_frequency = 5  # compared against a word's total number of occurrences
min_kw_companies = 5  # compared against the number of companies whose description contains the word

In [3]:
def get_all_field_values(document_field):
    """Return the lower-cased bucket keys of a terms aggregation on a field.

    Runs a search against ``INDEX`` through the notebook-level ``client`` with a
    single terms aggregation on ``document_field``.

    Args:
        document_field: Name of the document field to aggregate on.

    Returns:
        A set with the lower-cased key of every bucket in the response.
        Bucket keys must be strings, since ``.lower()`` is called on each.
    """
    # Largest signed 32-bit integer; presumably chosen so that no bucket is cut off.
    bucket_limit = (1 << 31) - 1

    search = Search(index=INDEX).using(client)
    # The aggregation is attached to `search` in place; the return value of
    # bucket() is not needed.
    search.aggs.bucket("bucket", "terms", field=document_field, size=bucket_limit)

    response = search.execute()
    field_values = set()
    for bucket in response.aggregations['bucket']['buckets']:
        field_values.add(bucket.key.lower())
    return field_values

In [4]:
# Translation table that maps every ASCII punctuation character except the
# apostrophe to a space. Built once here instead of on every call, because
# clean_keyword runs once per scanned document.
_PUNCTUATION_TO_SPACE = str.maketrans(
    {char: " " for char in string.punctuation if char != "'"}
)


def clean_keyword(kw):
    """Yield the lower-cased tokens of ``kw`` that are longer than one character.

    ASCII punctuation other than the apostrophe is replaced by spaces before the
    text is split on whitespace, so ``"e-commerce"`` splits into ``"e"`` and
    ``"commerce"`` while ``"children's"`` stays a single token.

    Args:
        kw: The string to tokenize.

    Yields:
        Each whitespace-separated token with more than one character, lower-cased.
    """
    # split() without arguments already ignores leading and trailing whitespace,
    # so no separate strip() is needed.
    for token in kw.translate(_PUNCTUATION_TO_SPACE).split():
        if len(token) > 1:
            yield token.lower()

In [5]:
# Lower-cased bucket keys of the terms aggregation on `industry_class_code_desc.keyword`.
industry_types = get_all_field_values("industry_class_code_desc.keyword")

In [6]:
# Tally the cleaned words of every document that has a `company_description` field.
company_description_words = Counter()  # total occurrences of each word
companies_per_words = Counter()        # number of documents whose description contains the word
ct = 0                                 # number of documents scanned; stays 0 if nothing matches

for ct, hit in enumerate(
    scan(
        client=client,
        index=INDEX,
        query={"query": {"exists": {"field": "company_description"}}},
    ),
    start=1,
):
    cleaned_kw = list(clean_keyword(hit["_source"]["company_description"]))
    company_description_words.update(cleaned_kw)
    # De-duplicate first so that each document counts a word at most once.
    companies_per_words.update(set(cleaned_kw))

In [7]:
# Count how often each cleaned token occurs across all industry type values.
industry_keywords = Counter(
    token
    for tokens in map(clean_keyword, industry_types)
    for token in tokens
)

In [8]:
# Keep only the description words that reach both minimums: at least
# `min_kw_frequency` occurrences overall and presence in the descriptions of at
# least `min_kw_companies` companies. The comparison is inclusive so that a word
# sitting exactly at a minimum is kept; only words *below* it are removed.
company_description_words = Counter({
    word: count
    for word, count in company_description_words.items()
    if count >= min_kw_frequency and companies_per_words[word] >= min_kw_companies
})
# NOTE: industry_keywords is rebound to itself plus the description words, so
# re-running this cell without first re-running the cell that builds
# industry_keywords adds the description counts a second time.
industry_keywords = industry_keywords + company_description_words

In [9]:
# Write one "word,frequency" line per keyword to the CSV object on S3.
with smart_open.open('s3://onai-ml-dev-eu-west-1/query_parsing/industry_kw.csv', 'w') as f:
    f.writelines(f"{word},{freq}\n" for word, freq in industry_keywords.items())