In [None]:
import os
# Move the working directory two levels up; presumably this is the project root,
# so that the `scripts.*` imports and relative data paths below resolve — TODO confirm.
# NOTE(review): the path is relative, so re-running this cell climbs two more levels.
os.chdir('../..')
# IPython shell escape: print the resulting working directory.
!pwd

In [None]:
import spacy
from scripts.components.component_keyword import add_entity_ruler
import scripts.component_lowercase_lemmas

# for ._.domain_label extension!
from scripts.components import predict_domain_from_keywords
from scripts.data_helpers import extract_sentence_entities

# Start from spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")
# Insert the "lower_case_lemmas" component directly after the lemmatizer
# (presumably registered by the scripts.component_lowercase_lemmas import — verify).
nlp.add_pipe("lower_case_lemmas", after="lemmatizer")
# Remove the stock "ner" pipe before the project's entity ruler is attached.
nlp.remove_pipe("ner")
# Project helper; presumably adds the keyword entity ruler to `nlp` and returns it — TODO confirm.
ruler = add_entity_ruler(nlp)
# Append the "domain_classifier" component (presumably registered by the
# predict_domain_from_keywords import above — verify).
nlp.add_pipe("domain_classifier")


In [None]:
from scripts.components import financial_tone_classifier
from scripts.components import sustainability_potential_classifier

# Append the two classifier components to the end of the pipeline, in this order.
# NOTE(review): the factories are presumably registered by the two imports above — confirm.
# Re-running this cell on the same `nlp` adds the same names a second time.
nlp.add_pipe("financial_tone_classifier")
nlp.add_pipe("sustainability_potential_classifier")

In [None]:
# Smoke test: run one hand-picked paragraph through the full pipeline.
doc = nlp('Building operations are responsible for 41.7% of U.S. energy consumption, with building construction and materials accounting for an additional 5.9% of consumption, as shown in Fig. 1. Over the past decades, designers have become more aware of the need to conserve natural resources, reduce energy use, and minimise carbon pollution. The strategies have primarily focused on reducing energy use from carbon emitting sources during the operation of buildings, but this is only part of the carbon emissions story.')
# Per sentence: the (entity, label) pairs found in it.
print([[(ent, ent.label_) for ent in sent.ents] for sent in doc.sents])
# Per sentence: the custom `labels` extension value.
print([sent._.labels for sent in doc.sents])
# Document-level `labels` extension value.
print(doc._.labels)
print()
# Output of the project helper extract_sentence_entities for the same doc.
print(extract_sentence_entities(doc))

In [None]:
import pandas as pd
from typing import List


In [None]:
# Fresh, independent accumulators for collected paragraph records.
paragraphs, banter_paragraphs = [], []

In [None]:
import srsly
from tqdm.notebook import tqdm
from scripts.components.component_keyword import ID_TO_LABEL

items = srsly.read_jsonl("corpus/paragraphs/paragraphs.jsonl")

# Labels the keyword entity ruler can emit. A paragraph is only kept when at
# least one of its entities carries one of these labels.
keyword_labels = set(nlp.pipe_labels["entity_ruler"])

# Feed (text, item) pairs lazily so the JSONL file is streamed through the
# pipeline instead of being loaded into one big list up front.
pbar = tqdm(
    enumerate(
        nlp.pipe(((item["text"], item) for item in items), as_tuples=True)
    )
)
for par_i, (doc, item) in pbar:
    sentences = list(doc.sents)
    n_sentences = len(sentences)
    # Ignore paragraphs with fewer than three sentences.
    if n_sentences < 3:
        continue

    # Skip paragraphs without any keyword entity. The previous check negated a
    # generator object, which is always truthy, so it never skipped anything.
    if not any(ent.label_ in keyword_labels for ent in doc.ents):
        continue

    paragraph_domain = doc._.labels['domain']

    # Original record plus pipeline results (dict union; needs Python 3.9+).
    paragraph = item | {
        "index": par_i,
        "domain": paragraph_domain,
        "sents": [sent.text for sent in sentences],
    }

    if paragraph_domain == "ENV":
        paragraphs.append(paragraph)
        # Stop at the first ENV paragraph: later cells reuse the `doc`, `item`
        # and `par_i` left behind by this iteration.
        break

In [None]:
# Display the pipeline's components in processing order.
nlp.pipeline

In [None]:
# Display the document-level `labels` extension of the most recently assigned `doc`.
doc._.labels

In [None]:

def to_json(doc, index=None):
    """Convert a processed document into a plain dict record.

    Args:
        doc: A document that went through the pipeline; it must expose
            ``doc.sents`` and the ``doc._.labels`` / ``sent._.labels``
            extension mappings.
        index: Paragraph index to store under ``"index"``. When omitted, the
            notebook-level ``par_i`` is used, as before; passing it explicitly
            avoids depending on whichever loop iteration ran last.

    Returns:
        dict with the index, every document-level label, the sentence texts
        (``"sents"``), the result of ``extract_sentence_entities(doc)``
        (``"entities"``) and one ``{"sent_ind": i, **labels}`` entry per
        sentence (``"sent_labels"``).
    """
    sentences = list(doc.sents)

    # Key order matters: document labels may override "index", and the keys
    # listed after the unpacking override any label with the same name.
    return {
        "index": par_i if index is None else index,
        **doc._.labels,
        "sents": [sent.text for sent in sentences],
        "entities": extract_sentence_entities(doc),
        "sent_labels": [
            {'sent_ind': i, **sent._.labels} for i, sent in enumerate(sentences)
        ],
    }



In [None]:
# Pipeline components whose labels are treated as keyword categories.
keyword_components = ["entity_ruler"]
# One comma-joined string of distinct entity texts per keyword label in `doc`.
[
    ','.join(set(ent.text for ent in doc.ents if ent.label_ == label))
    for comp in keyword_components
    for label in nlp.pipe_labels[comp]
]

In [None]:
# Flat list of every label exposed by the keyword components, in component order.
[label for comp in keyword_components for label in nlp.pipe_labels[comp]]

In [None]:
from spacy.tokens import Doc
from typing import List, Iterable


# Characters that would split a TSV field or row are mapped to a plain space.
_TSV_UNSAFE = str.maketrans({"\t": " ", "\r": " ", "\n": " "})


def _tsv_cell(value):
    """Render one value as a single TSV field: None -> "", no tabs or line breaks."""
    return "" if value is None else str(value).translate(_TSV_UNSAFE)


def _keyword_cells(span, labels):
    """Return one comma-joined cell of distinct, sorted entity texts per label."""
    return [
        ",".join(sorted({ent.text for ent in span.ents if ent.label_ == label}))
        for label in labels
    ]


def _write_row(file, cells):
    """Write the cells as one tab-separated, newline-terminated row."""
    file.write("\t".join(_tsv_cell(cell) for cell in cells) + "\n")


def documents_to_tsv(
    out_path: str,
    docs_items: "Iterable[tuple[Doc, dict]]",
    keyword_components=["entity_ruler"],
    classifiers=["financial_tone"],
    pipeline=None,
):
    """Write documents to a TSV file and pass them through unchanged.

    This is a generator: nothing is written (not even the header) until it is
    iterated, and the file is complete only once it has been exhausted.

    Layout per document: one paragraph row (metadata, keyword cells,
    document-level classifier labels), then one row per sentence (four empty
    metadata cells, keyword cells, sentence-level classifier labels, sentence
    text), then an empty line.

    Args:
        out_path: Destination file; written as UTF-8 and overwritten if present.
        docs_items: Iterable of ``(doc, item)`` pairs. ``item`` must provide the
            keys ``"scraper"``, ``"title"``, ``"url"`` and ``"par_index"``.
        keyword_components: Pipeline component names whose labels each get a
            ``KEYWORDS_<label>`` column. Never mutated.
        classifiers: Keys looked up in ``doc._.labels`` / ``sent._.labels``;
            a missing key yields an empty cell. Never mutated.
        pipeline: Object exposing ``pipe_labels``; defaults to the
            notebook-level ``nlp``.

    Yields:
        Each ``(doc, item)`` pair, after its rows have been written.
    """
    if pipeline is None:
        pipeline = nlp

    # The label list is the same for every row, so resolve it once.
    keyword_labels = [
        label for comp in keyword_components for label in pipeline.pipe_labels[comp]
    ]
    header = [
        "SCRAPER",
        "TITLE",
        "URL",
        "PARAGRAPH_INDEX",
        *["KEYWORDS_" + label for label in keyword_labels],
        *classifiers,
        "SENTENCE",
    ]

    # Explicit encoding: the platform default may not be able to encode the text.
    with open(out_path, "wt", encoding="utf-8") as file:
        _write_row(file, header)
        for doc, item in docs_items:
            _write_row(
                file,
                [
                    item["scraper"],
                    item["title"],
                    item["url"],
                    item["par_index"],
                    *_keyword_cells(doc, keyword_labels),
                    *[doc._.labels.get(cls, '') for cls in classifiers],
                ],
            )
            for sent in doc.sents:
                _write_row(
                    file,
                    [
                        "",
                        "",
                        "",
                        "",
                        *_keyword_cells(sent, keyword_labels),
                        *[sent._.labels.get(cls, '') for cls in classifiers],
                        sent.text,
                    ],
                )
            # Blank separator line between paragraphs.
            file.write("\n")
            yield (doc, item)


# docs_items = nlp.pipe([(item["text"], item) for i, item in enumerate(items)], as_tuples=True)
# docs_items = documents_to_tsv("14012022_labeled_sentences.tsv")

In [None]:
# Set up the TSV export for the single (doc, item) pair left over from the corpus loop.
# documents_to_tsv is a generator function, so nothing is written until
# `docs_items` is iterated.
docs_items = documents_to_tsv("14012022_labeled_sentences.tsv", [(doc, item)])

In [None]:
# Display the `labels` extension of the third sentence (index 2) of `doc`;
# raises IndexError if `doc` has fewer than three sentences.
list(doc.sents)[2]._.labels

In [None]:
# Exhaust the generator purely for its side effect: iterating it is what makes
# documents_to_tsv write the rows to disk. The yielded pairs are discarded.
for _ in docs_items:
    pass

In [None]:
# Dump three sentence collections to CSV, one file each, reusing `df` as a scratch name.
# NOTE(review): env_sentences, social_sentences and banter_sentences are not
# defined in any cell visible here; on a fresh kernel this cell would raise
# NameError unless they are created elsewhere — confirm.
df = pd.DataFrame(env_sentences)
df.to_csv('env_sentences.csv')
df = pd.DataFrame(social_sentences)
df.to_csv('social_sentences.csv')
df = pd.DataFrame(banter_sentences)
df.to_csv('banter_sentences.csv')

In [None]:
import pandas as pd
# Load the sentence table, using the first CSV column as the index. The path is
# relative to the current working directory. This replaces any earlier `df`.
df = pd.read_csv('notebooks/sentences.csv', index_col=0)

In [None]:
# Keep at most the first 100000 rows.
df = df.head(100000)

In [None]:
# Collapse rows sharing the same (text, sentiment) pair; every other column
# becomes a list of the grouped values.
df = (
    df.groupby(['text', 'sentiment'])
    .agg(lambda values: list(values))
    .reset_index()
)

In [None]:
# Display only the rows whose sentiment is not 'neutral'.
df[df['sentiment'] != 'neutral']

In [None]:
# Build a corpus from the non-neutral rows, with 'sentiment' as the category and
# 'text' as the document text, then reduce it to stoplisted unigrams and compact it.
# NOTE(review): `st` is not imported in any cell visible here (presumably
# `import scattertext as st`) — confirm and add the import near the top.
corpus = (st.CorpusFromPandas(df[df['sentiment'] != 'neutral'],
                              category_col='sentiment',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build()
          .get_stoplisted_unigram_corpus()
          .compact(st.ClassPercentageCompactor(term_count=2,
                                               term_ranker=st.OncePerDocFrequencyRanker)))

In [None]:
# Render the characteristic explorer for the 'positive' category against
# everything else (displayed as 'negative'). `st` is presumably scattertext — confirm.
html = st.produce_characteristic_explorer(
    corpus,
    category='positive',
    category_name='positive',
    not_category_name='negative',
)
# Write through a context manager so the file is flushed and closed right away
# instead of leaving an open handle behind in the kernel.
with open('demo_characteristic_chart.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

In [None]:
# Replace `df` with the bundled 2012 convention sample data and add a `parse`
# column holding each text run through the whitespace sentence parser.
df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda frame: frame.text.apply(st.whitespace_nlp_with_sentences)
)