In [None]:
import os
# Work from two levels above the directory the kernel started in, so that the
# relative paths used further down ('corpus/...', 'assets/...') resolve.
# NOTE(review): presumably this is the repository root - confirm.
# The starting directory is remembered on the first run so that re-executing
# this cell does not climb two more levels each time.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '../..'))

In [None]:
import srsly
from glob import glob

In [None]:
def paragraphs_generator(glob_path='corpus/new-paragraphs/*.jsonl'):
    """Yield every record from the JSONL files matching ``glob_path``.

    Files are visited in sorted path order. ``glob`` itself returns matches in
    arbitrary, filesystem-dependent order, which would make the order of the
    yielded records differ between machines and runs.

    NOTE: a later cell defines another ``paragraphs_generator`` that takes an
    index name; once that cell has run, this definition is shadowed.

    Args:
        glob_path: Glob pattern selecting the ``.jsonl`` files to read.

    Yields:
        Whatever ``srsly.read_jsonl`` produces for each matching file, file by
        file.
    """
    for file_path in sorted(glob(glob_path)):
        yield from srsly.read_jsonl(file_path)

In [None]:
# Maps a predicted relation label to the label used when building the
# co-occurrence vocabulary; relations whose label has no entry here are
# skipped by create_cooccurrence_matrix.
# The *_CONTRADICTION labels are currently disabled. To fold them into their
# base label, add:
#     POSITIVE_CONTRADICTION="POSITIVE",
#     NEGATIVE_CONTRADICTION="NEGATIVE",
LABEL_TRANSLATOR = dict(
    POSITIVE="POSITIVE",
    NEGATIVE="NEGATIVE",
)

In [None]:
from haystack.document_stores import OpenSearchDocumentStore
from smart_evidence.helpers import opensearch_connection
from opensearchpy.helpers import scan

def paragraphs_generator(index="paragraphs"):
    """Yield every document of an OpenSearch index as a plain dict.

    NOTE: this redefines the ``paragraphs_generator`` from the earlier cell
    (which reads local JSONL files); after this cell runs, the name refers to
    this OpenSearch-backed version.

    Credentials are read from the environment instead of being committed to
    the notebook:

    - ``OPENSEARCH_PASSWORD`` (required)
    - ``OPENSEARCH_USERNAME`` (optional, defaults to ``"admin"``)

    NOTE(review): the password that used to be hardcoded here is in the
    notebook history and should be rotated.

    Args:
        index: Name of the OpenSearch index to read documents from.

    Yields:
        ``Document.to_dict()`` for each document returned by
        ``document_store.get_all_documents_generator()``.

    Raises:
        RuntimeError: If ``OPENSEARCH_PASSWORD`` is not set. Because this is a
            generator, the error surfaces when iteration starts.
    """
    password = os.environ.get("OPENSEARCH_PASSWORD")
    if not password:
        raise RuntimeError(
            "Set the OPENSEARCH_PASSWORD environment variable before reading "
            "paragraphs from OpenSearch."
        )

    document_store = OpenSearchDocumentStore(
        username=os.environ.get("OPENSEARCH_USERNAME", "admin"),
        password=password,
        host=opensearch_connection.HOST,
        port=443,
        timeout=60,
        aws4auth=opensearch_connection.AWS_AUTH,
        verify_certs=True,
        index=index,
        label_index="haystack-paragraphs-labels",
        search_fields=["text", "title"],
        similarity="cosine",
        content_field="text",
        name_field="title",
        analyzer="english",
        duplicate_documents="skip",
    )

    # NOTE(review): an earlier revision also built a scroll query selecting
    # documents WITHOUT `concept_relations.LABEL`, but its result was never
    # consumed - all documents were yielded, exactly as below. The unused query
    # has been removed; if server-side filtering is wanted, it has to be
    # (re)introduced deliberately, and would need the opposite condition for
    # the co-occurrence use case.
    yield from (d.to_dict() for d in document_store.get_all_documents_generator())

In [None]:
from smart_evidence.helpers.concept_patterns import get_concepts_from_yamls

def create_cooccurrence_matrix(paragraphs):
    """Count company-concept / impact-concept co-occurrences in annotated paragraphs.

    Each entry of ``paragraph['meta']['concept_relations']`` contributes a
    count of 1 when all of the following hold:

    - its ``LABEL`` is a key of ``LABEL_TRANSLATOR``;
    - it has both a ``company_concept`` and an ``impact_concept``;
    - both concept labels are listed in the COMPANY / IMPACT keyword YAML files.

    Entries failing any of these checks are skipped.

    Args:
        paragraphs: Iterable of dicts with a ``meta`` mapping. A missing or
            empty ``concept_relations`` is treated as "no relations".

    Returns:
        A tuple ``(company_vocabulary, impact_vocabulary, cooccurrence_matrix_sparse)``:

        - company_vocabulary: dict mapping ``"<company concept id>|<label>"``
          to its row index
        - impact_vocabulary: dict mapping impact concept id to its column index
        - cooccurrence_matrix_sparse: ``scipy.sparse.coo_matrix`` of shape
          ``(len(company_vocabulary), len(impact_vocabulary))``. Each
          co-occurrence is stored as its own entry of 1; repeated (row, col)
          pairs are summed when the matrix is converted, e.g. by ``.todense()``.

    Example:
    ===========
    company_vocabs, impact_vocabs, co_occ = create_cooccurrence_matrix(paragraphs)

    df_co_occ = pd.DataFrame(co_occ.todense(),
                             index=company_vocabs.keys(),
                             columns=impact_vocabs.keys())

    df_co_occ = df_co_occ.sort_index()[sorted(impact_vocabs.keys())]

    df_co_occ.style.applymap(lambda x: 'color: red' if x>0 else '')

    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.sparse` has been loaded.
    from scipy import sparse

    impact_concepts = {
        concept['concept_label']: concept['id']
        for concept in get_concepts_from_yamls(['assets/keywords_clean/IMPACT.yaml'])
    }
    company_concepts = {
        concept['concept_label']: concept['id']
        for concept in get_concepts_from_yamls(['assets/keywords_clean/COMPANY.yaml'])
    }

    company_vocabulary = {}
    impact_vocabulary = {}
    row = []
    col = []

    for paragraph in paragraphs:
        # `or []` also covers a stored value of None.
        relations = paragraph['meta'].get('concept_relations') or []
        for relation in relations:
            raw_label = relation.get('LABEL')
            if raw_label not in LABEL_TRANSLATOR:
                continue
            if 'company_concept' not in relation or 'impact_concept' not in relation:
                continue

            company_concept_label = relation['company_concept']['concept_label']
            if company_concept_label not in company_concepts:
                continue
            impact_concept_label = relation['impact_concept']['concept_label']
            if impact_concept_label not in impact_concepts:
                continue

            # The label is part of the row key, so each company concept gets
            # one row per translated label.
            label = LABEL_TRANSLATOR[raw_label]
            company_concept_id = company_concepts[company_concept_label] + f"|{label}"
            impact_concept_id = impact_concepts[impact_concept_label]

            # Vocabulary entries are only created once the whole relation has
            # been accepted, so every index is used at least once.
            row.append(company_vocabulary.setdefault(company_concept_id, len(company_vocabulary)))
            col.append(impact_vocabulary.setdefault(impact_concept_id, len(impact_vocabulary)))

    # Pass the shape explicitly: coo_matrix cannot infer it from empty index
    # arrays and would raise when no relation was accepted.
    cooccurrence_matrix_sparse = sparse.coo_matrix(
        ([1] * len(row), (row, col)),
        shape=(len(company_vocabulary), len(impact_vocabulary)),
        dtype=int,
    )
    return company_vocabulary, impact_vocabulary, cooccurrence_matrix_sparse

In [None]:
# Build both vocabularies and the sparse count matrix from the
# 'paragraphs_v4' paragraphs.
(
    company_vocabulary,
    impact_vocabulary,
    cooccurrence_matrix_sparse,
) = create_cooccurrence_matrix(paragraphs_generator('paragraphs_v4'))

In [None]:
# Summary of the fresh build: vocabulary sizes, matrix shape, total count.
(
    len(company_vocabulary),
    len(impact_vocabulary),
    cooccurrence_matrix_sparse.shape,
    cooccurrence_matrix_sparse.sum(),
)

``` py
# 4.4.2022
len(company_vocabulary), len(impact_vocabulary), cooccurrence_matrix_sparse.shape
> (1157, 211, (1157, 211))
```

``` py
# 9.5.2022
len(company_vocabulary), len(impact_vocabulary), cooccurrence_matrix_sparse.shape
> (3945, 236, (3945, 236))
```

``` py
# 17.5.2022
len(company_vocabulary), len(impact_vocabulary), cooccurrence_matrix_sparse.shape
> (3431, 223, (3431, 223))
```

``` py
# 24.5.2022
len(company_vocabulary), len(impact_vocabulary), cooccurrence_matrix_sparse.shape
> (3305, 375, (3305, 375))
```

``` py
# 25.5.2022
len(company_vocabulary), len(impact_vocabulary), cooccurrence_matrix_sparse.shape, cooccurrence_matrix_sparse.sum()
> (3060, 360, (3060, 360), 309127)
```

``` py
# 02.06.2022
len(company_vocabulary), len(impact_vocabulary), cooccurrence_matrix_sparse.shape, cooccurrence_matrix_sparse.sum()
> (3235, 361, (3235, 361), 425382)
```

In [None]:
import pickle
from datetime import date
# Snapshot both vocabularies together with the densified count matrix into a
# pickle named after today's date (YYYY-MM-DD).
with open(f'cooccurance_{date.today().strftime("%Y-%m-%d")}.pickle', 'bw') as snapshot_file:
    pickle.dump(
        (
            company_vocabulary,
            impact_vocabulary,
            cooccurrence_matrix_sparse.todense(),
        ),
        snapshot_file,
    )

In [None]:
import pickle
# Reload the newest snapshot. The dump cell writes the date as YYYY-MM-DD, so
# the lexicographically last path is the most recent one.
# Note: the dump cell pickles the dense form of the matrix, so after this
# cell `cooccurrence_matrix_sparse` holds that dense matrix despite its name.
snapshot_paths = glob('cooccurance_*.pickle')
snapshot_paths.sort()
latest_file = snapshot_paths[-1]
with open(latest_file, 'br') as snapshot_file:
    (
        company_vocabulary,
        impact_vocabulary,
        cooccurrence_matrix_sparse,
    ) = pickle.load(snapshot_file)

In [None]:
# Case-insensitive lookup of company vocabulary entries mentioning "isenheim".
# The needle must be lower-case because it is compared against the
# lower-cased id; a capitalised needle can never match.
[(vocab, ind) for vocab, ind in company_vocabulary.items() if 'isenheim' in vocab.lower()]

In [None]:
# Sanity check after reloading: vocabulary sizes and matrix shape.
(
    len(company_vocabulary),
    len(impact_vocabulary),
    cooccurrence_matrix_sparse.shape,
)

In [None]:
import re
import numpy as np
import pandas as pd

def create_cooccurance_df(file_path="cooccurance-2022-05-09.pickle") -> pd.DataFrame:
    """Load a pickled co-occurrence snapshot into a labelled DataFrame.

    The pickle must hold a ``(company_vocabulary, impact_vocabulary, matrix)``
    tuple. Rows are labelled with the company vocabulary keys and columns with
    the impact vocabulary keys, then:

    1. Every row/column id containing more than one ";" is read as
       ``part;part;...;concept_label`` and replaced by one copy per leading
       part, labelled ``"<part>;<concept_label>"``.
    2. Ids are split on "|" and "/" into MultiIndex levels. For row ids with
       fewer than six parts, ``6 - len(parts)`` empty levels are inserted
       before each "COMPANY" part.
    3. Columns are ordered by descending column total.

    NOTE(review): the default ``file_path`` has a hyphen after "cooccurance",
    whereas the dump cell in this notebook writes ``cooccurance_<date>.pickle``
    - confirm the default still points at a real file.

    Args:
        file_path: Path of the pickle snapshot to load.

    Returns:
        DataFrame of co-occurrence counts with MultiIndex rows (company
        concepts) and MultiIndex columns (impact concepts).
    """
    # SECURITY: pickle.load can execute arbitrary code - only open snapshots
    # that were produced by this notebook / a trusted source.
    with open(file_path, "br") as f:
        company_vocabulary, impact_vocabulary, counts = pickle.load(f)

    # The dump cell stores the dense form, but tolerate a snapshot that still
    # contains a scipy sparse matrix.
    if hasattr(counts, "toarray"):
        counts = counts.toarray()

    df = pd.DataFrame(
        counts,
        index=list(company_vocabulary),
        columns=list(impact_vocabulary),
    )

    # Expand multi-part column ids into one column per leading part.
    for column_id in list(df.columns):
        if column_id.count(";") > 1:
            *taxonomies, concept_label = column_id.split(";")
            for taxonomy in taxonomies:
                df[f"{taxonomy};{concept_label}"] = df[column_id]
            df = df.drop(columns=column_id)

    # Same expansion for the row ids.
    for row_id in list(df.index):
        if row_id.count(";") > 1:
            *taxonomies, concept_label = row_id.split(";")
            for taxonomy in taxonomies:
                df.loc[f"{taxonomy};{concept_label}"] = df.loc[row_id]
            df = df.drop(index=row_id)

    def turn_string_to_multi(index):
        """Split a row id on "|" / "/" and pad it with empty levels before "COMPANY"."""
        parts = re.split(r'[\|\/]', index)
        indices = []
        for part in parts:
            if len(parts) != 6 and part == "COMPANY":
                # A negative count yields no padding, so longer ids pass through.
                indices += [""] * (6 - len(parts))
            indices.append(part.strip())
        return tuple(indices)

    df.columns = pd.MultiIndex.from_tuples(
        [tuple(re.split(r'[\|\/]', col)) for col in df.columns]
    )

    df.index = pd.MultiIndex.from_tuples(
        [turn_string_to_multi(ind) for ind in df.index]
    )

    # Largest column totals first.
    df = df.iloc[:, np.argsort(-df.sum(0).values)]

    return df

In [None]:
# Build the labelled co-occurrence table from the 2022-06-02 snapshot.
df = create_cooccurance_df(file_path='cooccurance_2022-06-02.pickle')

In [None]:
# Grand total of all counts in the table (column totals, then their sum).
df.sum(axis=0).sum()