In [None]:
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
import os

In [None]:
import numpy as np
import pandas as pd

In [None]:
# AWS service code and region used to sign requests to the search domain.
service = "es"
region = "eu-central-1"

# Index the environment directly so that a missing credential fails right here
# with a KeyError naming the variable, instead of passing None on to the
# request signer.
awsauth = AWS4Auth(
    os.environ["AWS_ACCESS_KEY_ID"],
    os.environ["AWS_SECRET_ACCESS_KEY"],
    region,
    service,
)

# HTTPS endpoint of the search domain; requests are signed with `awsauth` and
# sent over TLS with certificate verification enabled.
host = "search-ix-documents-rzvvmiarxdl7rnn47lj6ynnz4i.eu-central-1.es.amazonaws.com"
es = Elasticsearch(
    hosts=[{"host": host, "port": 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

In [None]:
# Keyword lists grouped by indicator name. The evaluation loop further down
# plugs each list into the query as its `indicator_keywords`.
# TODO: Add indicators
keywords_by_indicator = {
    "Importance to stakeholders": [
        "stakeholders", "importance to stakeholders", "survey", "research on stakeholders",
    ],
    "SDG importance": ["SDG", "Sustainable Development Goals", "SDG target"],
    "Societal importance": [
        "society", "societal importance", "political institutions", "civil society", "media",
    ],
}

# Example search request; the keys mirror the fields of the QueryInput
# dataclass so the dict can be unpacked straight into its constructor.
query_input = dict(
    search_terms=["Impact investing", "social startups"],
    geographies=["EU", "Global", "Germany"],
    industries=["Business intelligence"],
    products=["ESG Assessment", "Impact assessment"],
    value_chain=["Assessment", "Measurement"],
    problems_addressed=["Complexity of impact assessment & measurement"],
    stakeholders=["Impact startups", "Startups", "Corporate ventures"],
    primary_sdgs=[9, 12],
    sdg_keywords=[
        "SDG 9",
        "SDG 12",
        "Industry",
        "Innovation",
        "Infrastructure",
        "Responsible consumption",
        "Responsible production",
    ],
    ontology_keywords=["Environment", "Social"],
)


In [254]:
from dataclasses import dataclass
from dataclasses import field
from itertools import chain


@dataclass
class QueryInput:
    """Structured search request that renders itself as an Elasticsearch query body.

    `search_terms` are mandatory: a document must phrase-match at least one of
    them in its `text` field. `geographies` and `primary_sdgs` only affect
    `to_es_query`; every other list field is flattened into optional
    phrase-match clauses on `text` by `to_es_query_baseline`.
    """

    search_terms: list[str]
    geographies: list[str]
    industries: list[str]
    products: list[str]
    value_chain: list[str]
    problems_addressed: list[str]
    stakeholders: list[str]
    primary_sdgs: list[int]
    sdg_keywords: list[str]
    ontology_keywords: list[str]
    # Optional extra keywords; defaults to a fresh empty list per instance.
    indicator_keywords: list[str] = field(default_factory=list)

    def to_es_query(self):
        """Build the boosted query: the baseline plus rank-feature and geography clauses.

        Returns:
            dict: Query body whose `should` list holds the baseline keyword
            clauses followed by a `rank_feature` clause on
            `entity_counts.REFERENCE_VALUE`, one phrase match on
            `entities.GPE.text` per geography, and one `rank_feature` clause
            on `id_counts.sdg_<n>` per primary SDG.

        Side effects:
            Prints the final query as indented JSON (debug aid, see TODO).
        """
        # `json` is not imported by any notebook cell visible here, so the
        # debug dump below raised NameError on a fresh kernel. Importing it
        # locally keeps this method self-contained.
        import json

        es_query = self.to_es_query_baseline()
        should_clauses = es_query["query"]["bool"]["should"]

        # NOTE(review): es_response_to_dataframe reads the key
        # "IX_REFERENCE_VALUE" from entity_counts while this clause targets
        # "REFERENCE_VALUE" -- confirm which field name the index uses.
        should_clauses.append(
            {
                "rank_feature": {
                    "field": "entity_counts.REFERENCE_VALUE",
                    "boost": 25.0
                }
            }
        )

        # Match GPEs
        should_clauses.extend(
            {"match_phrase": {"entities.GPE.text": {"query": gpe}}}
            for gpe in self.geographies
        )

        # Boost primary SDGs
        should_clauses.extend(
            {
                "rank_feature": {
                    "field": f"id_counts.sdg_{goal_num}",
                    "boost": 10.0
                }
            }
            for goal_num in self.primary_sdgs
        )

        # TODO remove
        print(json.dumps(es_query, indent=4))

        return es_query

    def to_es_query_baseline(self):
        """Build the plain keyword query without any rank-feature boosting.

        Returns:
            dict: Query body with one `must` clause (a nested bool whose
            `should` list phrase-matches each search term on `text`) and a
            top-level `should` list with one phrase match on `text` per
            keyword, in field order: industries, products, value_chain,
            problems_addressed, stakeholders, sdg_keywords,
            ontology_keywords, indicator_keywords.
        """
        keyword_list = list(
            chain(
                self.industries,
                self.products,
                self.value_chain,
                self.problems_addressed,
                self.stakeholders,
                self.sdg_keywords,
                self.ontology_keywords,
                self.indicator_keywords,
            )
        )

        # At least one of the search terms has to be matched
        required_terms = {
            "bool": {
                "should": [
                    {"match_phrase": {"text": {"query": search_term}}}
                    for search_term in self.search_terms
                ]
            }
        }

        # Keywords are optional: they raise the score but do not filter.
        optional_keywords = [
            {"match_phrase": {"text": {"query": keyword}}}
            for keyword in keyword_list
        ]

        return {
            "query": {
                "bool": {
                    "must": [required_terms],
                    "should": optional_keywords,
                }
            }
        }

In [None]:
def es_response_to_dataframe(response, size=10):
    """Flatten the top hits of a search response into a display DataFrame.

    Args:
        response: Mapping shaped like an Elasticsearch search response; hits
            are read from `response["hits"]["hits"]`. Missing keys are
            treated as "no hits".
        size: Maximum number of leading hits to keep. Used as a slice bound,
            so `None` keeps every hit.

    Returns:
        pandas.DataFrame with columns `Text`, `URL` (first 25 characters),
        `RefV` (the `IX_REFERENCE_VALUE` entity count, 0 when absent),
        `Gold` (True when the hit's label is "INTERESTING") and `Score`,
        indexed by 1-based rank.
    """
    rows = []
    for hit in response.get("hits", {}).get("hits", [])[:size]:
        source = hit.get("_source") or {}
        url = source.get("url")
        # A hit may carry no entity counts at all (key absent or null).
        entity_counts = source.get("entity_counts") or {}
        rows.append(
            [
                source.get("text"),
                # Shorten the URL for display; a hit without a URL stays
                # missing instead of raising TypeError on None[:25].
                url[:25] if url is not None else None,
                entity_counts.get("IX_REFERENCE_VALUE", 0),
                source.get("label") == "INTERESTING",
                hit.get("_score"),
            ]
        )

    df = pd.DataFrame(rows, columns=["Text", "URL", "RefV", "Gold", "Score"])
    # Rank the rows starting at 1 so the index reads as a result position.
    df.index = np.arange(1, len(df) + 1)
    return df


In [None]:
query_input_obj = QueryInput(**query_input)

# Rank-weighted tally over the result table: `score` sums the weights of rows
# flagged Gold, `max_score` sums the weights of all rows.
score = 0
max_score = 0

for indicator_keywords in keywords_by_indicator.values():
    query_input_obj.indicator_keywords = indicator_keywords
    es_query = query_input_obj.to_es_query()

    response = es.search(index="paragraphs-*", body=es_query, size=500)

    # NOTE(review): 500 hits are requested, but es_response_to_dataframe is
    # called with its default size, so only the leading hits are scored --
    # confirm this is intended.
    df = es_response_to_dataframe(response)

    for result in df.itertuples():
        # The index is the 1-based rank, so the first row weighs len(df) and
        # the last row weighs 1.
        score_increment = len(df) + 1 - result.Index
        if result.Gold:
            score += score_increment
        max_score += score_increment

    # TODO remove
    break

# With no hits at all max_score stays 0; report 0% instead of raising
# ZeroDivisionError.
recall = score / max_score if max_score else 0.0
print(f"Recall: {recall:.0%}")

# Evaluation

| Query | Recall |
| ----- | ------ |
| Baseline | 0% |

In [None]:
# Lift the column-width cap so long paragraph texts are shown in full.
pd.options.display.max_colwidth = None

# Left-align every cell; as the last expression of the cell, the resulting
# Styler is what gets rendered.
left_aligned = {"text-align": "left"}
df.style.set_properties(**left_aligned)