In [136]:
#!pip install spacy
#!python -m spacy download en_core_web_sm
import spacy
from spacy.matcher import Matcher
from zipfile import ZipFile
from pathlib import Path
from tqdm import autonotebook as tqdm

In [137]:
nlp = spacy.load("en_core_web_sm")

In [138]:
with Path("sentence_classification_text_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("input.txt") as f:
        sentences = f.read().decode("utf-8").splitlines()
with Path("sentence_classification_references_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("references.txt") as f:
        labels = f.read().decode("utf-8").splitlines()

In [139]:
for label, sentence in zip(labels[:10], sentences[:10]):
    print(label, sentence)
    

1 Herbal medications, spider bites, iopamidol (used for radiocontrast), lacquers, mercury, psoralen (combined with ultraviolet A to treat psoriasis), and xenobiotics have been associated with the development of AGEP in case reports.
1 The jar from the fall had caused a subdural hematoma on the right front side of my brain that shows up in the CT.
1 It's very rewarding and I get a great deal of enjoyment and satisfaction from mentoring.
1 The video then shows the damage caused by the aircraft as it hit the north tower, follows the disintegrating plane through the interior, and then shows the airplane metal, ignited fuel, dust and smoke exiting the building on the opposite side.
0 It has been assumed that the quench starts at the middle cross-section of the magnet.
0 Her diamonds are locked in a safe deposit box.
0 A large marble was dropped into the bowl.
1 Relatives of the prisoners gathered outside the facility to discover the fate of the incarcerated, eventually leading to clashes wi

In [140]:
#count most common words in sentences that have causal relationship
from collections import Counter
from collections.abc import Iterable

# from: https://stackoverflow.com/questions/17485747/how-to-convert-a-nested-list-into-a-one-dimensional-list-in-python
def flatten(lis): #pretty ugly solution but we have to flatten the list since every new sentence adds "[]" which Counter can't deal with
     for item in lis:
         if isinstance(item, Iterable) and not isinstance(item, str):
             for x in flatten(item):
                 yield x
         else:        
             yield item

                
words = []
nonCausalWords = []

for label, sentence in zip(labels, sentences):
    if label == "1": #if sentence is causal
        doc = nlp(sentence)
        wordsHelp = [token.text for token in doc if not token.is_stop and not token.is_punct and token.pos_ != "NOUN" and token.pos_ != "ADJ"] 
        words.append(wordsHelp) #append all words to a list if they are NOT nouns & NOT punctuation & NOT adjectives

    if label == "0": #if sentence is NOT causal
        doc = nlp(sentence)
        wordsHelp = [token.text for token in doc if not token.is_stop and not token.is_punct and token.pos_ != "NOUN" and token.pos_ != "ADJ"] 
        nonCausalWords.append(wordsHelp)
        


In [141]:
        
# n most common tokens
nMostCommon = 20
causal_freq = Counter(flatten(words))
nonCausal_freq = Counter(flatten(nonCausalWords))
relativeOccurence = []
for word in causal_freq:
    causal_freq[word] = causal_freq[word]/(nonCausal_freq[word]+1)
common_words = causal_freq.most_common(nMostCommon)
print(common_words)

[('caused', 243.0), ('resulted', 40.0), ('causes', 40.0), ('leading', 29.0), ('cause', 27.5), ('occur', 19.0), ('triggered', 18.5), ('causing', 16.5), ('lead', 15.5), ('AGEP', 12.0), ('Haiti', 11.0), ('induced', 11.0), ('associated', 10.5), ('include', 10.333333333333334), ('occurs', 10.0), ('enamel', 10.0), ('approximately', 9.0), ('leads', 9.0), ('coughing', 9.0), ('commonly', 8.0)]


In [142]:
doc = nlp(sentences[0])
for token in doc:
    print(token.text, token.pos_, token.dep_)

Herbal ADJ amod
medications NOUN nsubjpass
, PUNCT punct
spider NOUN compound
bites NOUN appos
, PUNCT punct
iopamidol PROPN conj
( PUNCT punct
used VERB parataxis
for ADP prep
radiocontrast NOUN pobj
) PUNCT punct
, PUNCT punct
lacquers NOUN nsubjpass
, PUNCT punct
mercury NOUN nmod
, PUNCT punct
psoralen PROPN conj
( PUNCT punct
combined VERB prep
with ADP prep
ultraviolet ADJ compound
A NOUN pobj
to PART aux
treat VERB advcl
psoriasis NOUN dobj
) PUNCT punct
, PUNCT punct
and CCONJ cc
xenobiotics NOUN conj
have AUX aux
been AUX auxpass
associated VERB ROOT
with ADP prep
the DET det
development NOUN pobj
of ADP prep
AGEP PROPN pobj
in ADP prep
case NOUN compound
reports VERB pobj
. PUNCT punct


In [143]:
causal_cues = ["cause", "because", "since", "due to", "causes"]

for words, counts in common_words:
    causal_cues.append(str(words))

# Naive Improvements
- co-occurence frequencies
- patterns in syntactic / dependency trees
- causal verbs / causal phrases
- lemmatization

In [144]:
# def predict(sentence, causal_cues):
#     doc = nlp(sentence)
#     for token in doc:
#         if token.text.lower() in causal_cues:
#             return 1
#     return 0

def predict(sentence, causal_cues):
    # matcher can be found at https://spacy.io/api/matcher
    matcher = Matcher(nlp.vocab)
    pattern = [[{"LEMMA": cue}] for cue in causal_cues]
    matcher.add("CAUSAL", pattern)

    doc = nlp(sentence)
    matches = matcher(doc)
    return bool(matches)

In [145]:
predictions = []
for sentence in tqdm.tqdm(sentences):
    predictions.append(predict(sentence, causal_cues))

  0%|          | 0/3260 [00:00<?, ?it/s]

In [146]:
def evaluate(predictions, labels):

    tp = sum([int(p) == 1 and int(l) == 1 for p, l in zip(predictions, labels)])
    fp = sum([int(p) == 1 and int(l) == 0 for p, l in zip(predictions, labels)])
    tn = sum([int(p) == 0 and int(l) == 0 for p, l in zip(predictions, labels)])
    fn = sum([int(p) == 0 and int(l) == 1 for p, l in zip(predictions, labels)])
    print(f"True positives: {tp}")
    print(f"False positives: {fp}")
    print(f"True negatives: {tn}")
    print(f"False negatives: {fn}")
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1: {f1:.2f}")
    print(f"Accuracy: {accuracy:.2f}")

evaluate(predictions, labels)


# previous
# True positives: 884
# False positives: 76
# True negatives: 1554
# False negatives: 746
# Precision: 0.92
# Recall: 0.54
# F1: 0.68
# Accuracy: 0.75

True positives: 897
False positives: 48
True negatives: 1582
False negatives: 733
Precision: 0.95
Recall: 0.55
F1: 0.70
Accuracy: 0.76


In [147]:
with Path("sentence_classification_text_test.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("input.txt") as f:
        test_sentences = f.read().decode("utf-8").splitlines()

In [148]:
test_predictions = []
for sentence in tqdm.tqdm(test_sentences):
    test_predictions.append(predict(sentence, causal_cues))
Path("predictions.txt").write_text("\n".join(map(str, test_predictions)));

  0%|          | 0/840 [00:00<?, ?it/s]