Set the environment and import the data:

In [1]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import DocBin, Doc
import pandas as pd
import ftfy
import key_dict as k
from pathlib import Path

In [2]:
# Shared spaCy pipeline; the "ner" and "parser" components are switched off
# at load time.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["ner", "parser"],
)

# Both matchers are bound to the pipeline's vocabulary.
vocab = nlp.vocab
# Token-pattern matcher.
matcher = Matcher(vocab)
# Phrase matcher that compares tokens on their LEMMA attribute.
phraseMatcher = PhraseMatcher(vocab, attr="LEMMA")

In [None]:
# Read the raw export; the path is intentionally blank and must be filled in.
df = pd.read_csv("")  # specify file path here

# Keep only the rows whose "retweeted" value equals False (drops retweets).
not_retweeted = df["retweeted"].eq(False)
data = df[not_retweeted]

Choose units to collect (sentence only or whole documents):

In [None]:
 # to collect sentence only
# Accumulates the text of every sentence that contains a match.
matched_sents = []


def collect_sents(matcher, doc, i, matches):
    """Matcher ``on_match`` callback: store the sentence around match ``i``.

    Args:
        matcher: The matcher that fired the callback (unused).
        doc: The document being matched.
        i: Index of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.

    The text of the sentence enclosing ``doc[start:end]`` is appended to the
    module-level ``matched_sents`` list; nothing is returned.

    NOTE(review): ``Span.sent`` needs sentence boundaries on ``doc``. The
    pipeline in this notebook is loaded with the parser disabled, so this
    callback presumably needs a sentence segmenter enabled -- confirm.
    """
    _, start, end = matches[i]
    enclosing_sentence = doc[start:end].sent
    matched_sents.append(enclosing_sentence.text)

In [11]:
 # to collect entire doc/cell
# Accumulates the full text of every document that produced a match.
matched_docs = []


def collect_docs(matcher, doc, i, matches):
    """Matcher ``on_match`` callback: store the whole text of ``doc``.

    Args:
        matcher: The matcher that fired the callback (unused).
        doc: The document being matched.
        i: Index of the current match inside ``matches`` (unused).
        matches: All matches found in ``doc`` (unused).

    ``doc.text`` is appended to the module-level ``matched_docs`` list on
    every call, so the same text is stored once per invocation.
    """
    full_text = doc.text
    matched_docs.append(full_text)

Add the selected patterns from the `key_dict` module (imported above as `k`) to the matchers:

In [12]:
# Specify the patterns to add to the matcher.
# Each pair is (matcher label, key in k.belief_patterns); remove a pair to
# leave that pattern group out.
token_pattern_groups = (
    ("VERBS", "verbs"),
    ("NOUNS", "nouns"),
    ("ADVERBS", "adverbs"),
    ("ADJECTIVES", "adjectives"),
    ("SURE", "sure"),
)
for label, group_key in token_pattern_groups:
    matcher.add(label, k.belief_patterns[group_key], on_match=collect_docs)

# Phrase patterns go to the separate phrase matcher.
phraseMatcher.add("PHRASES", k.belief_patterns["phrases"], on_match=collect_docs)

Compute the `doc` object on the dataset. This should only be done when the dataset changes as it can take time. 

In [None]:
def _fix_and_parse(raw_text):
    """Coerce a cell to ``str``, repair it with ftfy and run it through ``nlp``."""
    return nlp(ftfy.fix_text(str(raw_text)))


# One parsed document per row of the "text" column.
doc = data["text"].apply(_fix_and_parse)
len(doc)

Once the `doc` object has been created, save it to disk:

In [None]:
# Pack every parsed document into a single DocBin container.
docbin = DocBin(docs=doc)
len(docbin)
# Write the container to "docbin.spacy" in the working directory.
docbin.to_disk(path="docbin.spacy")

Load saved `doc`. Make sure the vocabulary used to create the `doc` matches the loaded vocabulary (see `nlp = spacy.load()`).  Then, convert it to a list to make it iterable.

If you have several saved docs, then:

In [3]:
# Folder that holds the saved ``*.spacy`` DocBin files.
corpus_dir = Path("")  # specify folder path here
# Path.glob yields entries in arbitrary order; sorting keeps the order of the
# loaded docs (and therefore of the final output) reproducible across runs.
files = sorted(corpus_dir.glob("*.spacy"))

In [4]:
# One list of docs per ``.spacy`` file, rebuilt against the current vocab.
docbin_list = [
    list(DocBin().from_disk(path=spacy_file).get_docs(nlp.vocab))
    for spacy_file in files
]
len(docbin_list)  # number of files loaded (one entry per .spacy file)

# Flatten the per-file lists into a single list of docs.
merged_docs = []
for docs_in_file in docbin_list:
    merged_docs.extend(docs_in_file)

If you have a single saved doc, then:

In [None]:
# Read one saved DocBin back from disk.
docbin_loaded = DocBin().from_disk(path="")  # specify file path
# Materialise the docs as a list, rebuilt against the current vocab.
docs_loaded = list(docbin_loaded.get_docs(nlp.vocab))
len(docs_loaded)

Run both matchers on every loaded doc (if you loaded a single saved file, iterate over `docs_loaded` instead of `merged_docs`).

In [None]:
# Calling a matcher on a doc triggers the on_match callbacks registered with
# it; the matchers' return values are not used here.
for loaded_doc in merged_docs:
    matcher(loaded_doc)
    phraseMatcher(loaded_doc)

len(matched_docs)

Delete duplicates (a doc is collected once per match, so docs with several matches appear several times):

In [None]:
# Drop repeated texts while keeping the first occurrence of each one.
# dict keys are unique and keep insertion order, so this gives the same
# result as scanning the preceding slice for every element, but in linear
# rather than quadratic time.
output = list(dict.fromkeys(matched_docs))
len(output)

Choose what to output:

In [None]:
# to save texts only
# One matched text per row, written without index or header row.
texts_frame = pd.DataFrame(output)
texts_frame.to_excel("output.xlsx", index=False, header=False)

In [None]:
# to save entire rows (this includes Twitter metadata)
# The docs in this notebook are built from ftfy.fix_text(str(text)), and
# `output` holds the resulting doc texts. Apply the same normalisation to the
# raw column before comparing; otherwise every row whose text ftfy altered
# would be silently left out.
normalized_text = df["text"].map(lambda raw_text: ftfy.fix_text(str(raw_text)))
boolean_series = normalized_text.isin(output)
filtered_df = df[boolean_series]
# NOTE(review): header=False also drops the column names from the sheet --
# confirm this is wanted when exporting full rows.
filtered_df.drop_duplicates().to_excel("output.xlsx", index=False, header=False)