In [None]:
import os

# Switch the working directory two levels up so the `app` package imported by
# the following cells is found relative to the current directory.
# The move is guarded because an unconditional os.chdir('../..') is not
# idempotent: every re-run of this cell would climb two more directories.
# NOTE(review): assumes `app/` is a directory at the repository root — confirm.
if not os.path.isdir('app'):
    os.chdir('../..')

In [None]:
from app.models.annotation import AnnotatedInsight

In [None]:
import logging
import os
import json
from typing import Dict, Tuple

from opensearchpy import OpenSearch, RequestsHttpConnection
from opensearchpy.helpers import scan
from pyairtable import Table
from pyairtable.formulas import match
from requests_aws4auth import AWS4Auth

from app.models.annotation import (
    AnnotatedInsight,
    AnnotatedInsightInput,
    AnnotationResponse,
    AnnotationType,
    InsightAnnotationInput,
    InsightLabelingRequest,
    InsightAnnotations
)
from app.models.concepts import CompanyConcept, ImpactConcept

# --- OpenSearch / AWS connection settings ---------------------------------
service = "es"
region = "eu-central-1"
host = "search-ix-documents-rzvvmiarxdl7rnn47lj6ynnz4i.eu-central-1.es.amazonaws.com"

# Request signer built from credentials read from the environment.
# `.get` yields None instead of raising when a variable is not set.
awsauth = AWS4Auth(
    os.environ.get("AWS_ACCESS_KEY"), os.environ.get("AWS_SECRET_ACCESS_KEY"), region, service
)

# Client used by the scan below; talks HTTPS on port 443 with certificate checks.
opensearch = OpenSearch(
    connection_class=RequestsHttpConnection,
    hosts=[dict(host=host, port=443)],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
)

# Index that is scanned for annotated insights.
INSIGHT_INDEX = "insights_v2"
# Raises KeyError right here if the variable is missing from the environment.
AIRTABLE_API_KEY = os.environ["AIRTABLE_API_KEY"]

In [None]:
# The two annotators whose annotations are compared; the values are matched
# against the `created_by` field of the stored annotations.
# ANNOTATOR_B is always part of the query filter, while ANNOTATOR_A is only
# added to the filter when its value contains an "@".
ANNOTATOR_A = "marie-sophie.litz@impactnexus.io"
ANNOTATOR_B = "andreas.schalm@impactnexus.io"

In [None]:
def _created_by_clause(annotator):
    """Build a nested clause matching concept annotations created by `annotator`."""
    return {
        "nested": {
            "path": "annotations.concepts",
            "query": {
                "term": {"annotations.concepts.created_by": annotator}
            },
        }
    }


# ANNOTATOR_B is always required; ANNOTATOR_A only when its value contains "@".
_must_clauses = [_created_by_clause(ANNOTATOR_B)]
if "@" in ANNOTATOR_A:
    _must_clauses.append(_created_by_clause(ANNOTATOR_A))

# Every clause in `must` has to match for a document to be returned.
query_body = {"query": {"bool": {"must": _must_clauses}}}

In [None]:
# Number of hits requested per scroll page.
BATCH_SIZE = 1000

# scan() hands back a one-shot generator. Materialise it so that cells that
# iterate over `items` can be re-run without silently getting an empty result
# from an already exhausted generator.
items = list(
    scan(
        opensearch,
        query=query_body,
        index=INSIGHT_INDEX,
        size=BATCH_SIZE,
        scroll="1h",
    )
)

In [None]:
def _to_annotated_insight(hit):
    """Build an AnnotatedInsight from one scan hit, unpacked as keyword arguments."""
    return AnnotatedInsight.from_elasticsearch(**hit)


# One AnnotatedInsight per hit, in the order the scan yields them.
documents = list(map(_to_annotated_insight, items))

In [None]:
# Display how many insights were loaded into `documents`.
len(documents)

In [None]:
def get_annoation_for(annotations, created_by = None, type = None):
    """Return the first annotation that satisfies every supplied filter.

    Args:
        annotations: iterable of objects; `created_by` and/or `type` attributes
            are read from each element depending on which filters are given.
        created_by: when given, only annotations whose `created_by` equals it match.
        type: when given, only annotations whose `type` equals it match.
            (The name shadows the builtin but is kept for keyword callers.)

    Returns:
        The first matching annotation in iteration order, or None when nothing
        matches or when neither filter is supplied.
    """
    if created_by is None and type is None:
        # No filter at all: keep returning None instead of an arbitrary element.
        return None
    for annotation in annotations:
        # A filter left as None is not checked, so its attribute is never read.
        author_ok = created_by is None or annotation.created_by == created_by
        type_ok = type is None or annotation.type == type
        # Both filters must hold when both are supplied; previously that
        # combination could never match and always fell through to None.
        if author_ok and type_ok:
            return annotation
    return None

In [None]:
def get_company_annotations(annotations: InsightAnnotations, created_by):
    """Collect the company-concept labels annotated by `created_by`.

    Looks up the entry of `created_by` in `annotations.concepts` and returns
    the `label` of each of its company concepts; returns an empty list when
    that annotator has no entry.
    """
    found = get_annoation_for(annotations.concepts, created_by)
    labels = []
    if found is not None:
        for concept in found.annotation.company_concepts:
            labels.append(concept.label)
    return labels

def get_impact_annotations(annotations: InsightAnnotations, created_by):
    """Collect the impact-concept labels annotated by `created_by`.

    Looks up the entry of `created_by` in `annotations.concepts` and returns
    the `label` of each of its impact concepts; returns an empty list when
    that annotator has no entry.
    """
    found = get_annoation_for(annotations.concepts, created_by)
    labels = []
    if found is not None:
        for concept in found.annotation.impact_concepts:
            labels.append(concept.label)
    return labels

In [None]:
import json
def get_relation_annotations(annotations: InsightAnnotations, created_by):
    """Return the relation annotation of `created_by` as plain JSON-decoded data.

    The matching entry of `annotations.relations` is serialised with its
    `.json()` method, parsed back, and the value under the 'annotation' key is
    returned. An empty list is returned when the annotator has no entry.
    """
    found = get_annoation_for(annotations.relations, created_by)
    if found is not None:
        serialized = found.json()
        return json.loads(serialized)['annotation']
    return []

In [None]:
import numpy as np
from sklearn.metrics import jaccard_score


def compute_jaccard_for_concept_annotation(annotation_a, annotation_b):
    """Jaccard similarity between two annotators' concept labels.

    Both inputs are treated as sets (duplicates are ignored) and the score is
    |A ∩ B| / |A ∪ B|, computed directly with set arithmetic instead of
    building indicator vectors through repeated `list.index` lookups.

    Args:
        annotation_a: iterable of hashable labels from the first annotator.
        annotation_b: iterable of hashable labels from the second annotator.

    Returns:
        A float in [0, 1]. When both inputs are empty the ratio is undefined
        and 0.0 is returned.
        NOTE(review): 0.0 is presumably what sklearn's `jaccard_score` gives
        for this case under its default `zero_division` setting — confirm that
        "no labels from either annotator" should not count as full agreement.
    """
    labels_a = set(annotation_a)
    labels_b = set(annotation_b)

    union_size = len(labels_a | labels_b)
    if union_size == 0:
        # Neither annotator selected anything: avoid dividing by zero.
        return 0.0
    return len(labels_a & labels_b) / union_size



In [None]:
def get_relation_triple(relation_annotation):
    """Reduce a relation annotation mapping to a flat 3-tuple.

    Returns:
        (company concept label, impact concept label, relation value), read
        from the 'company_concept', 'impact_concept' and 'relation' keys.
    """
    company = relation_annotation['company_concept']
    impact = relation_annotation['impact_concept']
    return (company['label'], impact['label'], relation_annotation['relation'])

In [None]:
def compute_jaccard_for_relation_annotation(annotation_a, annotation_b):
    """Compare two annotators' relation annotations.

    Each annotation is reduced via `get_relation_triple` to a
    (company label, impact label) pair plus its relation. Agreement on which
    pairs were annotated is measured with the Jaccard index over the pairs;
    pairs annotated by both but with different relations are counted
    separately.

    Args:
        annotation_a: iterable of relation annotation mappings (first annotator).
        annotation_b: iterable of relation annotation mappings (second annotator).

    Returns:
        Tuple `(jaccard, disagreements)`: `jaccard` is
        |pairs_a ∩ pairs_b| / |pairs_a ∪ pairs_b| as a float (0.0 when neither
        annotator annotated any pair, where the ratio is undefined), and
        `disagreements` is the number of shared pairs whose relation differs.

    Raises:
        KeyError: if a relation value is not a key of RELATION_TO_LABEL.
    """
    RELATION_TO_LABEL = {"POSITIVE": 2, "NEGATIVE": 1}

    def _labels_by_pair(annotation):
        # Map each (company, impact) pair to its relation label. A later entry
        # for the same pair overwrites an earlier one.
        labels = {}
        for company, impact, relation in map(get_relation_triple, annotation):
            labels[(company, impact)] = RELATION_TO_LABEL[relation]
        return labels

    relations_a = _labels_by_pair(annotation_a)
    relations_b = _labels_by_pair(annotation_b)

    shared_pairs = relations_a.keys() & relations_b.keys()
    n_pairs = len(relations_a.keys() | relations_b.keys())

    # Only pairs annotated by both annotators can disagree on the relation.
    disagreements = sum(
        1 for pair in shared_pairs if relations_a[pair] != relations_b[pair]
    )

    # Dict lookups replace the per-item `list.index` scans, and the empty
    # case is handled explicitly instead of dividing by zero.
    score = len(shared_pairs) / n_pairs if n_pairs else 0.0
    return score, disagreements

In [None]:
def build_rows(documents, annotation_getter, scoring_function, annotator_a=None, annotator_b=None):
    """Yield one comparison record per document for two annotators.

    Args:
        documents: iterable of objects exposing `id`, `text` and `annotations`.
        annotation_getter: callable `(annotations, created_by)` returning one
            annotator's annotation for a document.
        scoring_function: callable `(annotation_a, annotation_b)`; its return
            value is stored unchanged under 'jaccard_score'.
        annotator_a: first annotator; defaults to the module-level ANNOTATOR_A.
        annotator_b: second annotator; defaults to the module-level ANNOTATOR_B.

    Yields:
        dict with the keys 'id', 'text', 'annotator_a', 'annotator_b',
        'annotation_a', 'annotation_b' and 'jaccard_score'.
    """
    # Fall back to the notebook-wide annotators so existing three-argument
    # calls behave as before; the globals are only read when needed.
    if annotator_a is None:
        annotator_a = ANNOTATOR_A
    if annotator_b is None:
        annotator_b = ANNOTATOR_B

    for document in documents:
        annotation_a = annotation_getter(document.annotations, annotator_a)
        annotation_b = annotation_getter(document.annotations, annotator_b)
        yield {
            'id': document.id,
            'text': document.text,
            "annotator_a": annotator_a,
            "annotator_b": annotator_b,
            'annotation_a': annotation_a,
            'annotation_b': annotation_b,
            'jaccard_score': scoring_function(annotation_a, annotation_b),
        }

In [None]:
import pandas as pd
# Per-document comparison of the company-concept annotations, indexed by
# document id. Note that `company_df` ends up bound to the captioned Styler,
# not to the underlying DataFrame.
company_df = (
    pd.DataFrame.from_records(
        build_rows(
            documents,
            get_company_annotations,
            compute_jaccard_for_concept_annotation,
        ),
        index='id',
    )
    .style
    .set_caption('Company concept annotation')
)
company_df

In [None]:
import pandas as pd
# Per-document comparison of the impact-concept annotations, indexed by
# document id. Note that `impact_df` ends up bound to the captioned Styler,
# not to the underlying DataFrame.
impact_df = (
    pd.DataFrame.from_records(
        build_rows(
            documents,
            get_impact_annotations,
            compute_jaccard_for_concept_annotation,
        ),
        index='id',
    )
    .style
    .set_caption('Impact concept annotation')
)
impact_df

In [None]:
import pandas as pd
# Per-document comparison of the relation annotations. The 'jaccard_score'
# column holds whatever compute_jaccard_for_relation_annotation returns for
# each document. `relation_df` ends up bound to the captioned Styler.
# NOTE(review): unlike the company/impact tables no index='id' is passed here,
# so `id` stays a regular column — confirm this is intended.
relation_df = (
    pd.DataFrame.from_records(
        build_rows(
            documents,
            get_relation_annotations,
            compute_jaccard_for_relation_annotation,
        )
    )
    .style
    .set_caption('Relation annotation')
)
relation_df