In [None]:
import boto3
from requests_aws4auth import AWS4Auth

In [None]:
import os
os.chdir('../..')
!pwd

In [None]:
# Search body sent to the paragraph indices.
agg_query = {
    "query": {
        "bool": {
            # One `term` clause per accepted sustainability-potential label.
            "should": [
                {"term": {"predictions.sustainability_potential": label}}
                for label in ("SOLUTION", "PROBLEM+SOLUTION")
            ],
        }
    },
    "size": 1000,
    "aggs": {
        # Bucket by scraper, then by document id (two document buckets each).
        "sources": {
            "terms": {"field": "scraper"},
            "aggs": {
                "documents": {
                    "terms": {"field": "document_id", "size": 2},
                }
            },
        },
    },
}


In [None]:
from scripts import opensearch_connection

# Run the query against every index matching `paragraphs-*`.
# NOTE(review): the helper's return type is not visible here; the next cell
# wraps the result in `list(...)`, so it is presumably a lazy iterator.
paragraphs = opensearch_connection.opensearch_iterate_all_documents(
    "paragraphs-*",
    agg_query,
    scroll_timeout="1m",
)

In [None]:
# Pull every result into memory so it can be counted and iterated repeatedly.
paragraphs = [*paragraphs]

In [None]:
# Number of results materialized in `paragraphs`.
len(paragraphs)

In [None]:
import pandas as pd

# One row per hit: the stored `_source` fields plus the hit's `_id`.
# The key has to be the *string* "id". A bare `id` is Python's builtin function,
# which would create a column keyed by that function object instead of a column
# named "id" -- and the later `drop_duplicates('id')` / `row.id` need the latter.
df = pd.DataFrame([{**par["_source"], "id": par["_id"]} for par in paragraphs])

In [None]:
# Keyword count per token.
df["n_key_tok"] = df["n_keywords"] / df["n_tokens"]
# Number of entities per token.
df["n_ent_tok"] = df["entities"].apply(len) / df["n_tokens"]

In [None]:
# Keep only the first row seen for each `id` value.
df = df.drop_duplicates(subset="id")

In [None]:
from pathlib import Path

# Destination of the labeling sheet, relative to the current working directory.
OUT_PATH = Path("datasets/intermediate/labeling/X_paragraph_sentences.tsv")

# Make sure the whole folder chain exists; an existing folder is not an error.
OUT_PATH.parents[0].mkdir(exist_ok=True, parents=True)

# Entity labels that each get their own KEYWORDS_<label> column.
KEYWORD_LABELS = ["IX_IMPACT", "IX_PRODUCT", "IX_BANLIST"]
# Prediction heads; each yields a PRED_<CLS> column and an empty <CLS> label column.
PREDICTION_CLASSES = ["domain", "sustainability_potential", "financial_tone"]


def _keyword_columns(entities, sent_ind=None):
    """Build the KEYWORDS_<label> cells from a list of entity dicts.

    Args:
        entities: iterable of dicts with at least "text" and "label" keys
            (and "sent_ind" when ``sent_ind`` is given).
        sent_ind: when not None, only entities whose "sent_ind" equals this
            value are used; when None, all entities are used.

    Returns:
        dict mapping ``KEYWORDS_<label>`` to a comma-joined string of the
        distinct lower-cased entity texts carrying that label.
    """
    columns = {}
    for label in KEYWORD_LABELS:
        keywords = {
            ent["text"].lower()
            for ent in entities
            if ent["label"] == label
            and (sent_ind is None or ent["sent_ind"] == sent_ind)
        }
        # Sorted so the cell content is the same on every run; iterating a set
        # of strings directly gives an order that varies between interpreter runs.
        columns[f"KEYWORDS_{label}"] = ",".join(sorted(keywords))
    return columns


def _label_columns(predictions):
    """Build the PRED_<CLS> cells plus the empty <CLS> cells to be filled in.

    Args:
        predictions: dict mapping a prediction class name to its predicted value.

    Returns:
        dict with ``PRED_<CLS>`` set to the prediction ("" when the class is
        absent) and ``<CLS>`` set to "" for every class in PREDICTION_CLASSES.
    """
    columns = {
        f"PRED_{cls.upper()}": predictions.get(cls, "") for cls in PREDICTION_CLASSES
    }
    # The blank label cells use the same names for paragraph and sentence rows
    # (DOMAIN, SUSTAINABILITY_POTENTIAL, FINANCIAL_TONE) so none are dropped
    # when the rows are combined into one table.
    columns |= {cls.upper(): "" for cls in PREDICTION_CLASSES}
    return columns


# Flat list of row dicts: for every paragraph one paragraph-level row
# (SENT_IND == -1) followed by one row per sentence of that paragraph.
rows = []
for row in df.itertuples(index=False):
    rows.append(
        {
            "SCRAPER": row.scraper,
            "TITLE": row.title,
            "URL": row.url,
            "ID": row.id,
            "PAR_IND": row.par_ind,
            "N_KEYWORDS/TOKEN": row.n_key_tok,
            "SENT_IND": -1,
            **_keyword_columns(row.entities),
            **_label_columns(row.predictions),
        }
    )

    # Sentences and their predictions are paired by position; zip stops at the
    # shorter of the two sequences.
    for i, (predictions, sentence) in enumerate(
        zip(row.sentence_predictions, row.sentences)
    ):
        rows.append(
            {
                "ID": row.id,
                "N_KEYWORDS/TOKEN": row.n_key_tok,
                "PAR_IND": row.par_ind,
                "SENT_IND": i,
                "SENT": sentence,
                **_keyword_columns(row.entities, sent_ind=i),
                **_label_columns(predictions),
            }
        )


In [None]:
# Column order of the labeling sheet, grouped by role.
COLUMNS = (
    # Where the row comes from.
    ["SCRAPER", "ID", "PAR_IND", "N_KEYWORDS/TOKEN", "SENT_IND"]
    # Keywords found, one column per entity label.
    + ["KEYWORDS_IX_IMPACT", "KEYWORDS_IX_PRODUCT", "KEYWORDS_IX_BANLIST"]
    # Model predictions.
    + ["PRED_DOMAIN", "PRED_SUSTAINABILITY_POTENTIAL", "PRED_FINANCIAL_TONE"]
    # Cells left blank for the labels to be entered.
    + ["DOMAIN", "SUSTAINABILITY_POTENTIAL", "FINANCIAL_TONE"]
    # Sentence text.
    + ["SENT"]
)

In [None]:
# Assemble the row dicts into a table restricted to, and ordered by, COLUMNS;
# keys missing from a row become empty cells.
labeling_df = pd.DataFrame(data=rows, columns=COLUMNS)

# Tab-separated export of the full sheet.
labeling_df.to_csv(OUT_PATH, sep="\t")

In [None]:
# Write the first 100 rows, ordered by keyword density (highest first), then by
# ID and sentence index, to a comma-separated file next to OUT_PATH.
(
    labeling_df
    .sort_values(
        by=["N_KEYWORDS/TOKEN", "ID", "SENT_IND"],
        ascending=[False, True, True],
    )
    .head(100)
    .to_csv(OUT_PATH.with_name(f"{OUT_PATH.stem}_sorted.csv"), sep=",")
)

In [None]:
# Show the labeling table as the cell output.
labeling_df