In [1]:
import boto3
from elasticsearch import Elasticsearch, RequestsHttpConnection, helpers
from requests_aws4auth import AWS4Auth

In [2]:
import os
os.chdir('../..')
!pwd

/home/oguz/Workspace/smart-evidence


In [3]:
# Endpoint of the search domain this notebook queries.
host = (
    "search-ix-documents-rzvvmiarxdl7rnn47lj6ynnz4i.eu-central-1.es.amazonaws.com"
)
region = "eu-central-1"

service = "es"

# Requests are SigV4-signed with the credentials of the default boto3 session.
credentials = boto3.Session().get_credentials()
if credentials is None:
    # boto3 returns None when no credentials are configured; fail with a clear
    # message instead of an AttributeError on `credentials.access_key`.
    raise RuntimeError(
        "No AWS credentials found; configure them before running this notebook."
    )
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    service,
    session_token=credentials.token,
)

# elasticsearch-py takes its request timeout as `timeout`, in seconds.
# `requestTimeout` / `readTimeout` (milliseconds) are option names from the
# JavaScript client; the Python client does not recognise them, so the default
# 10 s timeout was effectively in use. 60 s matches the intended 60000 ms.
es = Elasticsearch(
    hosts=[{"host": host, "port": 443}],
    http_auth=awsauth,
    timeout=60,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

In [4]:
from scripts.components.keyword_ruler import get_keywords_from_globs

In [21]:
def _load_keyword_strings(globs):
    """Return the ``'keyword'`` field of every entry found for ``globs``.

    Args:
        globs: list of glob patterns handed to ``get_keywords_from_globs``.

    Returns:
        list with one ``entry['keyword']`` value per entry, in the order
        ``get_keywords_from_globs`` produced them.
    """
    return [entry['keyword'] for entry in get_keywords_from_globs(globs)]


# One named constant per keyword family instead of rebinding a single name.
IMPACT_KEYWORD_GLOBS = [
    "assets/keywords_clean/Impact/IRIS/**/IMPACT.tsv",
    "assets/keywords_clean/Products & Activities/**/IMPACT.tsv",
]
PRODUCT_KEYWORD_GLOBS = [
    "assets/keywords_clean/Products & Activities/CPA/PRODUCT.tsv",
    "assets/keywords_clean/Products & Activities/CPA/ACTIVITY.tsv",
]
BANLIST_KEYWORD_GLOBS = [
    "assets/keywords_clean/**/BANLIST.tsv",
    "assets/keywords_clean/Products & Activities/**/BANLIST.tsv",
]

impact_keywords = _load_keyword_strings(IMPACT_KEYWORD_GLOBS)
product_keywords = _load_keyword_strings(PRODUCT_KEYWORD_GLOBS)
banlist_keywords = _load_keyword_strings(BANLIST_KEYWORD_GLOBS)

# This cell used to leave `KEYWORD_GLOBS` bound to the ban-list globs; keep
# that binding so anything reading the old name still sees the same value.
KEYWORD_GLOBS = BANLIST_KEYWORD_GLOBS


In [26]:
# Keyword-driven query: a paragraph must contain at least one impact phrase
# AND at least one product/activity phrase in `sentences`, and none of the
# ban-list phrases.
# NOTE: a later cell assigns `query` again, so this definition only takes
# effect if that cell is not run afterwards.
_impact_any = {
    "bool": {
        "should": [
            {"match_phrase": {"sentences": phrase}} for phrase in impact_keywords
        ]
    }
}
_product_any = {
    "bool": {
        "should": [
            {"match_phrase": {"sentences": phrase}} for phrase in product_keywords
        ]
    }
}
_banlist_clauses = [
    {"match_phrase": {"sentences": phrase}} for phrase in banlist_keywords
]

query = {
    "query": {
        "bool": {
            "must": [_impact_any, _product_any],
            "must_not": _banlist_clauses,
        }
    },
}

In [62]:
# Prediction-driven query: paragraphs labelled SOLUTION or PROBLEM+SOLUTION,
# or mentioning "cement" in `text`, excluding any ban-list phrase, ordered by
# `n_keywords` descending.
# NOTE(review): the "cement" phrase is one of the optional `should` clauses,
# so paragraphs matching only a sustainability label are returned as well;
# confirm this is intended.
_wanted_clauses = [
    {"term": {"predictions.sustainability_potential": "SOLUTION"}},
    {"term": {"predictions.sustainability_potential": "PROBLEM+SOLUTION"}},
    {"match_phrase": {"text": "cement"}},
]
_banlist_clauses = [
    {"match_phrase": {"sentences": phrase}} for phrase in banlist_keywords
]

query = {
    "query": {
        "bool": {
            "should": _wanted_clauses,
            "must_not": _banlist_clauses,
        }
    },
    "sort": [{"n_keywords": "desc"}],
}

In [63]:
def opensearch_iterate_all_documents(es, index, query, pagesize=250, scroll_timeout="1m", terminate_after=1000, **kwargs):
    """
    Lazily yield the hits of ``query`` on ``index`` using the scroll API.

    Args:
        es: client exposing ``search``, ``scroll`` and ``clear_scroll``.
        index: index name or pattern to search.
        query: search body as a dict; ``"size"`` is set to ``pagesize``.
        pagesize: number of hits requested per page.
        scroll_timeout: how long the scroll context is kept alive between
            pages; used for the initial search and every follow-up scroll.
        terminate_after: forwarded to the initial search, where it caps the
            number of documents collected per shard. The default of 1000 keeps
            the previous hard-coded behaviour; pass ``None`` to lift the cap
            and really iterate over all matching documents.
        **kwargs: extra keyword arguments for the initial ``es.search`` call.

    Yields:
        Each raw hit dict, page by page, until a page comes back empty.
    """
    import warnings

    search_params = dict(kwargs)
    if terminate_after is not None:
        search_params["terminate_after"] = terminate_after

    scroll_id = None
    try:
        # Initialize the scroll with the caller's timeout (previously a
        # hard-coded "1m" that ignored `scroll_timeout`).
        result = es.search(
            index=index,
            scroll=scroll_timeout,
            body={**query, "size": pagesize},
            **search_params,
        )
        while True:
            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            # Stop after no more docs
            if not hits:
                break
            # Yield each entry
            yield from hits
            # Scroll next
            result = es.scroll(body={
                "scroll_id": scroll_id,
                "scroll": scroll_timeout
            })
    finally:
        # Release the server-side scroll context instead of leaving it open
        # until it times out. This also runs when the consumer stops early.
        if scroll_id is not None:
            try:
                es.clear_scroll(scroll_id=scroll_id)
            except Exception as exc:
                # Cleanup is best-effort: the context expires on its own, so a
                # failure here must not mask the data already yielded or the
                # original error.
                warnings.warn(f"Could not clear scroll context: {exc!r}")

In [72]:
# Build the lazy hit stream for `query` over every `paragraphs-*` index.
# Nothing is fetched until the generator is iterated.
paragraphs = opensearch_iterate_all_documents(
    es,
    'paragraphs-*',
    query,
    scroll_timeout="1m",
)

In [73]:
import numpy as np
# Drain the hit stream, keeping only each hit's `_source` document.
# NOTE: `paragraphs` is a one-shot generator; re-running this cell without
# re-creating it first produces an empty list.
out_paragraphs = [hit['_source'] for hit in paragraphs]

In [74]:
import srsly
from pathlib import Path
# Export the collected paragraph documents as JSON Lines.
out_path = Path('results/summary-cluster-research/sorted_cement_paragraphs-22_02_03.jsonl')

# Make sure the target directory exists before writing.
out_dir = out_path.parent
out_dir.mkdir(parents=True, exist_ok=True)

srsly.write_jsonl(out_path, out_paragraphs)

In [75]:
# Sanity check: number of paragraph documents collected in `out_paragraphs`.
len(out_paragraphs)

9402