In [1]:
import os

# Move the working directory one level up from where the kernel was started.
# The starting directory is remembered on first execution so that re-running
# this cell always lands on the same directory instead of climbing one more
# level on every run.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [2]:
from haystack.document_stores import OpenSearchDocumentStore
from smart_evidence.helpers import opensearch_connection
from smart_evidence.components.concept_extractor import ES_MAPPING

# from smart_evidence.components.company_impact_classifier import (
#     CompanyImpactClassifier,
# )


# DocumentStore: holds all your data.
#
# Credentials are read from the environment so that no secret is stored in the
# notebook: set OPENSEARCH_PASSWORD (required) and optionally
# OPENSEARCH_USERNAME (defaults to "admin") before starting the kernel.
# Any password that was previously committed in this notebook should be
# treated as leaked and rotated.
# NOTE(review): `aws4auth` is supplied as well — confirm whether the basic-auth
# username/password are actually used by the store when AWS auth is present.
document_store = OpenSearchDocumentStore(
    username=os.environ.get("OPENSEARCH_USERNAME", "admin"),
    password=os.environ["OPENSEARCH_PASSWORD"],
    host=opensearch_connection.HOST,
    port=443,
    timeout=120,
    aws4auth=opensearch_connection.AWS_AUTH,
    verify_certs=True,
    index="haystack-paragraphs",
    label_index="haystack-paragraphs-labels",
    search_fields=["text", "title"],
    similarity="cosine",
    content_field="text",
    name_field="title",
    custom_mapping=ES_MAPPING,
    analyzer="english",
    embedding_field="embedding",
    duplicate_documents="overwrite",
    return_embedding=True,
)


  from .autonotebook import tqdm as notebook_tqdm
INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/


In [11]:
def get_sources_without_concepts(store=None, max_sources=500):
    """Return the `source_file` values of documents that have no `concepts` field.

    Runs a terms aggregation over `source_file`, restricted to documents for
    which the `concepts` field does not exist. No hits are fetched
    (`"size": 0`); only the aggregation buckets are read.

    Args:
        store: Document store to query. It must expose `.client.search(...)`
            and `.index`. Defaults to the notebook-level `document_store`.
        max_sources: Maximum number of distinct source files (aggregation
            buckets) to return.

    Returns:
        A list with the `key` of every returned aggregation bucket.
    """
    if store is None:
        store = document_store

    query = {
        "size": 0,
        "query": {"bool": {"must_not": {"exists": {"field": "concepts"}}}},
        "aggs": {
            "sources": {"terms": {"field": "source_file", "size": max_sources}}
        },
    }

    # Scope the search to the store's own index; without an explicit index the
    # request is not limited to the documents this store manages.
    response = store.client.search(body=query, index=store.index)
    buckets = response["aggregations"]["sources"]["buckets"]
    return [bucket["key"] for bucket in buckets]
    



In [12]:
# Collect the source files whose documents have no `concepts` field yet;
# the result is used as a filter when documents are re-processed.
sources_without_concepts = get_sources_without_concepts()

In [13]:
# Show the source files whose documents still lack a `concepts` field.
sources_without_concepts

['wikipedia_Category:Shipping.jsonl',
 'wikipedia_Category:Psychology.jsonl',
 'wikipedia_Category:Agriculture.jsonl',
 'wikipedia-Category:Climate change.jsonl',
 'wikipedia-Category:Economy and the environment.jsonl',
 'wikipedia_Category:Computer architecture.jsonl',
 'wikipedia_Category:Marketing.jsonl',
 'wikipedia_Category:Nanotechnology.jsonl',
 'wikipedia_Category:Deforestation.jsonl']

In [14]:
from collections import Counter
from tqdm import tqdm
import re

# Back-fill concept metadata on every document that belongs to one of the
# source files in `sources_without_concepts`.
#
# For each document the entity annotations in `meta["entities"]` are scanned:
# every entity with an "id" gets a "concept_label" (the last ";"-separated
# part of the id), and labels are counted separately for IMPACT and COMPANY
# entities. Documents that end up with no concept at all are deleted from the
# store; all others are written back in batches.

# Used both as read batch size and as the number of documents buffered before
# each bulk write.
BATCH_SIZE = 10000

document_batch = []
for document in tqdm(
    document_store.get_all_documents_generator(
        batch_size=BATCH_SIZE,
        return_embedding=True,
        filters={"source_file": sources_without_concepts},
    )
):
    impact_concept_counts = Counter()
    company_concept_counts = Counter()
    for ent in document.meta["entities"]:
        if "id" not in ent:
            continue
        # `ent` is the dict stored in the document, so this annotates in place.
        ent["concept_label"] = ent["id"].split(";")[-1]
        if ent["label"] == "IMPACT":
            impact_concept_counts.update([ent["concept_label"]])
        elif ent["label"] == "COMPANY":
            company_concept_counts.update([ent["concept_label"]])

    concept_counts = impact_concept_counts.copy()
    concept_counts.update(company_concept_counts)

    if not concept_counts:
        print('No concept:', document.id)
        # Delete by id (keyword argument, never positionally) and skip the
        # document: appending it to the batch would write it straight back
        # with duplicate_documents="overwrite" and undo the deletion.
        # NOTE(review): deletes are issued one id at a time on purpose —
        # confirm how the store treats an empty `ids` list before batching.
        document_store.delete_documents(ids=[document.id])
        continue

    document.meta["impact_concepts"] = list(impact_concept_counts.keys())
    document.meta["impact_concept_counts"] = dict(impact_concept_counts)
    document.meta["company_concepts"] = list(company_concept_counts.keys())
    document.meta["company_concept_counts"] = dict(company_concept_counts)
    document.meta["concepts"] = list(concept_counts.keys())
    document.meta["concept_counts"] = dict(concept_counts)

    document_batch.append(document)
    if len(document_batch) >= BATCH_SIZE:
        # Progress trace: last id, first document and size of the flushed batch.
        print(document.id)
        print(document_batch[0])
        print(len(document_batch))
        document_store.write_documents(document_batch, duplicate_documents="overwrite")
        document_batch = []

# Flush whatever is left after the generator is exhausted.
if document_batch:
    document_store.write_documents(document_batch, duplicate_documents="overwrite")
document_batch = []


9595it [00:12, 2734.05it/s]

5e1fdbb2021e4eaeb6443f53210e5415
<Document: id=0b3e0e1429cc76c23d3bec75cde6c7cc, content='Lund University library was established in 1668 at the same time as the university and is one of Swe ...'>
10000


19480it [01:18, 795.66it/s]

5ebed40e125276fbdb5fc9c94973359a
<Document: id=c73c69eb8b5fd6d1a276bacf1ca92490, content='The C&SF established 470,000 acres (1,900 km2) for the Everglades Agricultural Area—27 percent of th ...'>
10000


19999it [01:18, 253.35it/s]


KeyboardInterrupt: 

In [5]:
from collections import Counter
from tqdm import tqdm
import re

# Back-fill concept metadata on documents returned by a raw query that matches
# documents without a `concepts` field. Each entity carrying an "id" receives
# a "concept_label" (last ";"-separated part of the id); labels are tallied
# per entity type (IMPACT / COMPANY) and stored in the document metadata
# before the documents are written back.
document_batch = []
for document in tqdm(
    document_store.query(
        "",
        custom_query="""{"size": 3000, "query": {"bool": {"must_not": {"exists": {"field": "concepts"}}}}}""",
        top_k=4000,
    )
):
    impact_concept_counts = Counter()
    company_concept_counts = Counter()

    for ent in document.meta["entities"]:
        if "id" in ent:
            # `ent` is the very dict held in meta["entities"], so the label is
            # stored on the document itself.
            label = ent["id"].split(";")[-1]
            ent["concept_label"] = label
            kind = ent["label"]
            if kind == "IMPACT":
                impact_concept_counts[label] += 1
            elif kind == "COMPANY":
                company_concept_counts[label] += 1

    # Combined tally: impact labels first, then company labels (all counts are
    # positive, so nothing is dropped by the addition).
    concept_counts = impact_concept_counts + company_concept_counts

    document.meta.update(
        {
            "impact_concepts": list(impact_concept_counts),
            "impact_concept_counts": dict(impact_concept_counts),
            "company_concepts": list(company_concept_counts),
            "company_concept_counts": dict(company_concept_counts),
            "concepts": list(concept_counts),
            "concept_counts": dict(concept_counts),
        }
    )

    document_batch.append(document)
    if len(document_batch) == 10000:
        # Progress trace, then flush the full batch.
        print(document.id)
        print(document_batch[0])
        print(len(document_batch))
        document_store.write_documents(document_batch, duplicate_documents="overwrite")
        document_batch = []

# Write the remaining documents.
document_store.write_documents(document_batch, duplicate_documents="overwrite")
document_batch = []


100%|██████████| 1989/1989 [00:00<00:00, 45304.80it/s]


In [None]:
from collections import Counter
from tqdm import tqdm
import re

# Back-fill concept metadata on every document that belongs to one of the
# source files in `sources_without_concepts`.
#
# For each document the entity annotations in `meta["entities"]` are scanned:
# every entity with an "id" gets a "concept_label" (the last ";"-separated
# part of the id), and labels are counted separately for IMPACT and COMPANY
# entities. Documents that end up with no concept at all are deleted from the
# store; all others are written back in batches.

# Used both as read batch size and as the number of documents buffered before
# each bulk write.
BATCH_SIZE = 10000

document_batch = []
for document in tqdm(
    document_store.get_all_documents_generator(
        batch_size=BATCH_SIZE,
        return_embedding=True,
        filters={"source_file": sources_without_concepts},
    )
):
    impact_concept_counts = Counter()
    company_concept_counts = Counter()
    for ent in document.meta["entities"]:
        if "id" not in ent:
            continue
        # `ent` is the dict stored in the document, so this annotates in place.
        ent["concept_label"] = ent["id"].split(";")[-1]
        if ent["label"] == "IMPACT":
            impact_concept_counts.update([ent["concept_label"]])
        elif ent["label"] == "COMPANY":
            company_concept_counts.update([ent["concept_label"]])

    concept_counts = impact_concept_counts.copy()
    concept_counts.update(company_concept_counts)

    if not concept_counts:
        print('No concept:', document.id)
        # Delete by id (keyword argument, never positionally) and skip the
        # document: appending it to the batch would write it straight back
        # with duplicate_documents="overwrite" and undo the deletion.
        # NOTE(review): deletes are issued one id at a time on purpose —
        # confirm how the store treats an empty `ids` list before batching.
        document_store.delete_documents(ids=[document.id])
        continue

    document.meta["impact_concepts"] = list(impact_concept_counts.keys())
    document.meta["impact_concept_counts"] = dict(impact_concept_counts)
    document.meta["company_concepts"] = list(company_concept_counts.keys())
    document.meta["company_concept_counts"] = dict(company_concept_counts)
    document.meta["concepts"] = list(concept_counts.keys())
    document.meta["concept_counts"] = dict(concept_counts)

    document_batch.append(document)
    if len(document_batch) >= BATCH_SIZE:
        # Progress trace: last id, first document and size of the flushed batch.
        print(document.id)
        print(document_batch[0])
        print(len(document_batch))
        document_store.write_documents(document_batch, duplicate_documents="overwrite")
        document_batch = []

# Flush the trailing partial batch; without this the last documents processed
# by the loop would never be written back.
if document_batch:
    document_store.write_documents(document_batch, duplicate_documents="overwrite")
document_batch = []


9595it [00:12, 2734.05it/s]

5e1fdbb2021e4eaeb6443f53210e5415
<Document: id=0b3e0e1429cc76c23d3bec75cde6c7cc, content='Lund University library was established in 1668 at the same time as the university and is one of Swe ...'>
10000


19480it [01:18, 795.66it/s]

5ebed40e125276fbdb5fc9c94973359a
<Document: id=c73c69eb8b5fd6d1a276bacf1ca92490, content='The C&SF established 470,000 acres (1,900 km2) for the Everglades Agricultural Area—27 percent of th ...'>
10000


19999it [01:18, 253.35it/s]


KeyboardInterrupt: 

In [28]:
# Fetch documents for which the `concepts` field does not exist, using a raw
# query body passed through `custom_query`.
result = document_store.query(
    "",
    custom_query="""{"size": 3000, "query": {"bool": {"must_not": {"exists": {"field": "concepts"}}}}}""",
    top_k=4000,
)

In [29]:
# Number of documents returned by the missing-`concepts` query.
len(result)

2824