In [None]:
import os
# Switch the working directory two levels up; later cells use paths and imports
# relative to it (e.g. "data/metrics.jsonl", `scripts.nounchunker`).
# NOTE(review): the target is relative, so re-running this cell without a kernel
# restart climbs two more levels.
os.chdir('../..')

In [None]:
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
from elasticsearch import helpers
import boto3

# Endpoint of the search domain (host name only, no scheme) and its AWS region.
host = 'search-metrics-store-vp6epbqctqfib7zheyo5rgckkm.eu-central-1.es.amazonaws.com' # For example, my-test-domain.us-east-1.es.amazonaws.com
region = 'eu-central-1' # e.g. us-west-1

# Sign requests for the 'es' service with the credentials of the default boto3 session.
service = 'es'
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token)

# Client talking to the domain on port 443 with SSL and certificate verification enabled,
# using the requests-based connection class so `awsauth` can be passed as `http_auth`.
es = Elasticsearch(
    hosts = [{'host': host, 'port': 443}],
    http_auth = awsauth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection
)

In [None]:
# Scan the indices matching "metrics-*" (no query given), with scroll="3m" and size=10.
results = helpers.scan(
    es,
    index="metrics-*",
    scroll="3m",
    size=10,
)


In [None]:
import pandas as pd

In [None]:
# One DataFrame row per scanned hit, built from the hit's `_source` payload.
# NOTE(review): if `results` is a one-shot iterator, re-running this cell without
# re-running the scan cell above produces an empty frame — confirm.
df = pd.DataFrame([item['_source'] for item in results])

In [None]:
# Inspect which fields the scanned documents provide.
df.columns

In [None]:
# NOTE(review): this uses the *values* of the `link` column as the indexer. Unless
# `link` is a boolean column this is not a row filter; if the intent is "rows that
# have a link", something like `df[df.link.notna()]` is probably meant — confirm.
df[df.link]

In [None]:
import srsly

# Export title / description / identifier of every metric as JSON lines.
# Missing values become empty strings; "description" is written as "text"
# and "identifier" as "id".
metric_records = (
    df[["title", "description", "identifier"]]
    .fillna('')
    .rename(columns={"identifier": "id", "description": "text"})
    .to_dict('records')
)
srsly.write_jsonl("data/metrics.jsonl", metric_records)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import numpy as np

In [None]:
from scripts.nounchunker import get_nounchunks

In [None]:
# TF-IDF over 1- to 3-grams of the noun chunks extracted from the titles.
# (A fixed `vocabulary=list(noun_chunks)` was tried here before and is disabled.)
tfidf_options = dict(
    min_df=5,
    max_df=10,
    ngram_range=(1, 3),
    preprocessor=lambda x: x.lower(),
    stop_words="english",
    token_pattern=r"(?u)\b[A-Za-z]\w+\b",
    sublinear_tf=True,
    smooth_idf=True,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Only titles that are actually present are passed to the noun chunker.
present_titles = df.title[df.title.notna()]
X = vectorizer.fit_transform(get_nounchunks(present_titles))

# Report the size of the learned vocabulary, then the terms themselves.
feature_names = vectorizer.get_feature_names()
print(len(feature_names))
print(feature_names)


In [None]:
# Terms learned by the fitted vectorizer; input for the vocabulary filter below.
words = vectorizer.get_feature_names()

In [None]:
# Keep only the terms that are not contained (as a substring) in any other term,
# i.e. drop every n-gram that also occurs inside a longer one.
vocabulary = [
    term
    for term in words
    if all(term not in candidate for candidate in words if candidate != term)
]

In [None]:
# Display the filtered vocabulary.
vocabulary

In [None]:
# NOTE(review): called without arguments, so no parameter is passed here.
# Possibly an unfinished step (e.g. handing over the filtered `vocabulary`) — confirm.
vectorizer.set_params()

In [None]:
import spacy

In [None]:
# Load the `en_core_web_trf` spaCy pipeline; `nlp` is used below for noun chunks and tokens.
nlp = spacy.load('en_core_web_trf')

In [None]:
# Check the available columns before merging the text fields.
df.columns

In [None]:
# Replace missing values with empty strings so the text columns can be concatenated.
# Note: this rebinds `df`; earlier cells see the filled frame if they are re-run.
df = df.fillna('')

In [None]:
# merge text fields
text_series = df.title + '. ' + df.subtitle + '. ' + df.description + '. ' + df['Human rights issue'] + '. ' + df['SDG indicator']

In [None]:
# Preview the merged texts, leaving out missing entries.
# NOTE(review): `df` was filled with '' before merging, so presumably nothing is filtered here.
text_series[~text_series.isna()]

In [None]:
import re

# Collect normalised noun chunks from the merged text of every row.
noun_chunks = set()
for doc in nlp.pipe(text_series):
    for noun in doc.noun_chunks:
        # Skip chunks with a token that does not start with a letter followed by
        # at least one word character (numbers, punctuation, one-character tokens).
        if not all(re.match(r'[A-Za-z]\w+', tok.text) for tok in noun):
            continue

        last_text = noun[-1].text.lower()
        last_lemma = noun[-1].lemma_.lower()

        # strip morphology from single words
        if len(noun) == 1:
            text = noun.lemma_
        # strip plural from noun chunks.
        # The check compares against lemma + 's' explicitly: `str.strip(chars)`
        # removes a *set of characters* from both ends rather than a substring,
        # so `text.strip(lemma) == 's'` silently failed for every word whose
        # lemma contains an "s" (e.g. "emissions") and for capitalised words.
        elif last_text == last_lemma + 's':
            text = noun.text[:-1].lower()
        # always lower
        else:
            text = noun.text.lower()
        noun_chunks.add(text)
        # NOTE(review): only the first accepted chunk of each document is kept
        # because of this `break` — confirm this is intended and not a leftover.
        break

In [None]:
# Number of distinct noun chunks collected.
len(noun_chunks)

In [None]:
# Display the collected noun chunks.
noun_chunks

In [None]:
# Print the tokens the pipeline produces for the first merged text, one per line.
for tok in nlp(text_series.iloc[0]):
    print(tok)

In [None]:
from thinc.config import Config

In [None]:
# Read the textcat demo config file and parse its text into a thinc `Config`.
# The path is relative to the current working directory (changed in the first cell).
with open('../../textcat_demo/configs/config.cfg') as f:
    config = Config().from_str(f.read())

In [None]:
from pyairtable import Table
from pyairtable.formulas import match
# The Airtable API key is a secret and must not be stored in the notebook, so it
# is read from the environment. A key that was committed earlier should be
# treated as leaked and rotated.
api_key = os.environ.get('AIRTABLE_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the AIRTABLE_API_KEY environment variable before running this cell."
    )
# Identifier of the Airtable base that is queried below.
base_id = 'appGSJaOzNaIZ4lSm'

In [None]:
# Handle for the "Key Info" table of the configured base.
table = Table(api_key, base_id, "Key Info")
# NOTE(review): `formula` is built here, but the venture lookup in a later cell
# passes a literal formula string instead — confirm whether this is still needed.
formula = match({"Name": "Carbon Instead UG (v0.2)"})

In [None]:
# Show the first record returned for the table.
table.first()

In [None]:
# First "Key Info" record matching the formula {Assessment/Venture}='Green Fusion'.
# NOTE(review): presumably this is None when nothing matches, which would break the
# field extraction that follows — confirm against the pyairtable documentation.
venture_keyinfo_row = table.first(formula="{Assessment/Venture}='Green Fusion'")

In [None]:
# Display the fetched record.
venture_keyinfo_row

In [None]:
# Flatten the selected key-info fields of the venture into one list of values:
# list fields contribute their items, multi-line text contributes one entry per
# non-empty line (surrounding whitespace and "- " bullet characters removed),
# any other value is taken as is.
wanted_fields = {
    'Market/sector',
    'IRIS impact categories',
    'Headquarter',
    'Venture in one sentence',
    'Mission/Vision/Purpose statement',
    'Inputs - Value Chain',
    'Activities - Value Chain',
    'Outputs - Value Chain',
    'Assessment/Primary SDGs Rollup',
}

values = []
for field_name, field_value in venture_keyinfo_row['fields'].items():
    if field_name in wanted_fields:
        if type(field_value) is list:
            values.extend(field_value)
        elif '\n' in field_value:
            values.extend(
                line.strip().strip('- ')
                for line in field_value.split('\n')
                if line.strip()
            )
        else:
            values.append(field_value)

In [None]:
# Display the flattened key-info values.
values

In [None]:
# Noun chunks of the example question, as found by the spaCy pipeline.
question_nouns = list(nlp("Do you measure GHG emissions generated directly or indirectly by your organisation?").noun_chunks)

In [None]:
# Lemmatised form of each noun chunk; the same list is used as search terms in the query below.
[n.lemma_ for n in question_nouns]

In [None]:
# Two optional ("should") terms clauses on `title`:
# the lower-cased venture key-info values with boost 1.0 and the lemmas of the
# question's noun chunks with boost 10.0.
venture_clause = {
    "terms": {
        "title": [entry.lower() for entry in values],
        "boost": 1.0,
    }
}
question_clause = {
    "terms": {
        "title": [chunk.lemma_ for chunk in question_nouns],
        "boost": 10.0,
    }
}
query = {"query": {"bool": {"should": [venture_clause, question_clause]}}}

In [None]:
# Run the query against all indices matching "metrics-*".
# Body and index are passed by keyword: the positional order of `search()`
# parameters has differed between elasticsearch-py releases, so relying on
# position can silently swap the two arguments.
results = es.search(body=query, index='metrics-*')

In [None]:
# One row per hit: the document's `_source` fields plus its relevance `_score`.
hit_rows = []
for hit in results['hits']['hits']:
    hit_rows.append({**hit['_source'], '_score': hit['_score']})
df = pd.DataFrame(hit_rows)

In [None]:
# `display` and `HTML` belong to the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Render the matched titles together with their scores as an HTML table.
html = df[['title', '_score']].to_html()
display(HTML(html))