In [1]:
import os

# Remember the directory the kernel started in the first time this cell runs.
# A bare relative chdir climbs two more levels on every re-run of the cell;
# anchoring to the remembered directory makes the cell idempotent.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()

# Work from two levels above the notebook so that relative paths used later
# (e.g. 'data/impact_polarity.jsonl') resolve — presumably the repository root.
os.chdir(os.path.join(NOTEBOOK_DIR, '../..'))

In [2]:
from smart_evidence.components.company_impact_classifier import CompanyImpactClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Select the pure-Python protobuf implementation.
# NOTE(review): protobuf reads this variable when it is first imported, and the
# smart_evidence import runs in an earlier cell — confirm nothing has imported
# protobuf by this point, otherwise the setting may have no effect.
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

In [4]:
# Instantiate the classifier with its default arguments.
# NOTE(review): the recorded output shows a download during construction, so
# this presumably needs network access or a warm cache — confirm.
clf = CompanyImpactClassifier()

Downloading: 100%|██████████| 2.95k/2.95k [00:00<00:00, 1.40MB/s]


In [5]:
import srsly

# Read every record of the JSONL file into memory as a list.
documents = [record for record in srsly.read_jsonl('data/impact_polarity.jsonl')]

In [6]:
def translate_concept(concept):
    """Return a copy of ``concept`` with ``concept_label`` renamed to ``label``.

    Args:
        concept: Mapping describing a concept; must contain ``concept_label``.

    Returns:
        A new dict holding every other key of ``concept`` in its original
        order, plus ``label`` set to the former ``concept_label`` value.

    Raises:
        KeyError: If ``concept`` has no ``concept_label`` key.
    """
    label = concept['concept_label']
    # Build a fresh dict instead of popping from the caller's dict, so the
    # input record stays intact and the function can be applied to the same
    # record more than once.
    renamed = {key: value for key, value in concept.items() if key != 'concept_label'}
    # Assigning last mirrors ``{**concept, 'label': label}``: a pre-existing
    # 'label' key keeps its position but takes the 'concept_label' value.
    renamed['label'] = label
    return renamed

In [7]:
def _to_classifier_input(record):
    """Reshape one raw record into the nested layout later passed to ``clf.run``."""
    text = record['text']
    relation = record['concept_relation']
    company_concepts = [translate_concept(relation['company_concept'])]
    impact_concepts = [translate_concept(relation['impact_concept'])]
    concept_annotation = {
        'company_concepts': company_concepts,
        "impact_concepts": impact_concepts,
    }
    return {
        'text': text,
        'meta': {'predictions': {'concepts': {'annotation': concept_annotation}}},
        # The accepted options are concatenated into a single label string.
        'annotation': ''.join(record['accept']),
    }


# Keep only records whose answer is 'accept' and reshape each of them.
documents = [
    _to_classifier_input(record)
    for record in documents
    if record['answer'] == 'accept'
]

In [8]:
# Run the classifier over the reshaped documents and keep what it returns.
annotated_documents = clf.run(documents)

100%|██████████| 110/110 [00:15<00:00,  6.89it/s]


In [9]:
# Inspect the relation predictions attached to the first annotated document.
annotated_documents[0]['meta']['predictions']['relations']['annotation']

[{'company_concept': {'label': 'Cement',
   'id': 'http://dbpedia.org/resource/Cement'},
  'impact_concept': {'label': 'Recycling',
   'id': 'http://dbpedia.org/resource/Recycling'},
  'label': <ImpactPolarity.positive: 'POSITIVE'>}]

In [10]:
import pandas as pd

# Maps each predicted label to the label set used for evaluation: the three
# base labels map to themselves, every *CONTRADICTION label to NOT_RELATED.
LABEL_TRANSLATION = {
    "NEGATIVE": "NEGATIVE",
    "POSITIVE": "POSITIVE",
    "NOT_RELATED": "NOT_RELATED",
    "POSITIVE_CONTRADICTION": "NOT_RELATED",
    "NEGATIVE_CONTRADICTION": "NOT_RELATED",
    "CONTRADICTION": "NOT_RELATED",
}


def _evaluation_row(document):
    """Flatten one annotated document into a row of the evaluation table."""
    text = document["text"]
    # Only the first predicted relation of each document is used.
    relation = document["meta"]['predictions']['relations']['annotation'][0]
    company_label = relation["company_concept"]["label"]
    impact_label = relation["impact_concept"]["label"]
    prediction = LABEL_TRANSLATION[relation["label"].value]
    return text, company_label, impact_label, prediction, document["annotation"]


df = pd.DataFrame(
    list(map(_evaluation_row, annotated_documents)),
    columns=["text", "company_concept", "impact_concept", "prediction", "annotation"],
)


In [11]:
# Show the first document's text from character offset 548 onwards.
# NOTE(review): 548 is a hard-coded offset — presumably where the passage of
# interest starts; confirm and consider naming it.
documents[0]['text'][548:]

'There is a medium level of evidence and agreement on the benefit of recycling of construction minerals, with high agreement that existing recycling as aggregates reduces the energy demand associated with aggregate production, but limited evidence for the benefit of recycling cement or concrete to anything but aggregate. There is insufficient evidence to evaluate the suitability of recycling of construction minerals and plastics under future conditions of a more stringent emissions control policy.'

In [12]:
# Keep the first reshaped document for a single-sample classifier call.
sample = documents[0]

In [13]:
# Classify only the tail of the sample text (from offset 548) against the
# sample's own company and impact concepts.
concept_annotation = sample['meta']['predictions']['concepts']['annotation']
clf.process_result_to_meta(
    sample['text'][548:],
    concept_annotation['company_concepts'],
    concept_annotation['impact_concepts'],
)



[{'company_concept': {'id': 'http://dbpedia.org/resource/Cement',
   'label': 'Cement'},
  'impact_concept': {'id': 'http://dbpedia.org/resource/Recycling',
   'label': 'Recycling'},
  'label': 'POSITIVE'}]

In [14]:
# Flag the rows where the translated prediction matches the annotated label.
df['is_correct'] = df['annotation'].eq(df['prediction'])

In [15]:
# Take the first misclassified row. The filtered frame keeps the original
# index labels, so label-based .loc[0] only works when row 0 itself is
# misclassified; positional .iloc[0] always returns the first remaining row.
sample = df[~df['is_correct']].iloc[0]

In [16]:
# Display the selected row as a column-name -> value dict.
dict(sample)

{'text': 'There is a medium level of evidence and a high level of agreement that the recycling of metals from buildings and vehicles already contributes to substantial emission reductions, while the recycling of EEE addresses other environmental concerns but contributes little to overall GHG mitigation. There is a limited level of evidence but agreement that further emission reductions can be achieved by sorting metals according to alloys to avoid the contamination of metal flows and allow for recycling even when metal stocks are no longer increasing. There is a medium level of evidence and agreement on the benefit of recycling of construction minerals, with high agreement that existing recycling as aggregates reduces the energy demand associated with aggregate production, but limited evidence for the benefit of recycling cement or concrete to anything but aggregate. There is insufficient evidence to evaluate the suitability of recycling of construction minerals and plastics under futu

In [17]:
# Label distribution of the annotated labels.
df['annotation'].value_counts()

NOT_RELATED    58
NEGATIVE       30
POSITIVE       22
Name: annotation, dtype: int64

In [18]:
# Label distribution of the translated predictions.
df['prediction'].value_counts()

NOT_RELATED    55
POSITIVE       30
NEGATIVE       25
Name: prediction, dtype: int64

In [22]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions against the annotations.
evaluation_report = classification_report(
    y_true=df['annotation'],
    y_pred=df['prediction'],
)
print(evaluation_report)

              precision    recall  f1-score   support

    NEGATIVE       0.40      0.33      0.36        30
 NOT_RELATED       0.53      0.50      0.51        58
    POSITIVE       0.43      0.59      0.50        22

    accuracy                           0.47       110
   macro avg       0.45      0.47      0.46       110
weighted avg       0.47      0.47      0.47       110



In [21]:
                        precision    recall  f1-score   support

              NEGATIVE       0.32      0.27      0.29        30
NEGATIVE_CONTRADICTION       0.00      0.00      0.00         0
           NOT_RELATED       0.46      0.40      0.43        58
              POSITIVE       0.45      0.68      0.55        22

              accuracy                           0.42       110
             macro avg       0.31      0.34      0.32       110
          weighted avg       0.42      0.42      0.41       110


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 9)

In [None]:
# NOTE(review): this cell repeats the classification-report cell that appears
# earlier in the notebook and has no recorded output — consider removing it.
from sklearn.metrics import classification_report
print(classification_report(df['annotation'], df['prediction']))

In [None]:
# NOTE(review): second verbatim repeat of the classification-report cell, with
# no recorded output — consider removing it.
from sklearn.metrics import classification_report
print(classification_report(df['annotation'], df['prediction']))

In [None]:
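# NOTE(review): the figures below look like a classification report pasted from
# a different run (support totals 124 rather than 110) — confirm provenance.
# Columns, presumably: precision, recall, f1-score, support.
# NEGATIVE       0.44       0.81      0.57        36
# NOT_RELATED    0.00       0.00      0.00        59
# POSITIVE       0.48       0.97      0.64        29

# accuracy                            0.46       124
# macro avg       0.31      0.59      0.40       124
# weighted avg    0.24      0.46      0.32       124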