# <font color="#49699E" size=40>Processing Natural Language Data</font>

# LEARNING OBJECTIVES
# LEARNING MATERIALS
# INTRODUCTION
## Package Imports

In [ ]:
import pandas as pd
pd.set_option("display.notebook_repr_html", False)
import seaborn as sns
import matplotlib.pyplot as plt

from dcss.plotting import format_axes_commas, custom_seaborn
from dcss.text import bigram_process, preprocess

import spacy
from spacy import displacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

custom_seaborn()

# TEXT PROCESSING


## Getting to Know SpaCy


In [ ]:
# Load the 'en_core_web_sm' spaCy pipeline, passing the 'ner' and 'parser'
# components as disabled.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["ner", "parser"],
)

### The SpaCy NLP Pipeline


### The SpaCy Containers


In [ ]:
# Read the whole sample text file into `abstract`.
# The encoding is passed explicitly so the text decodes the same way on every
# platform instead of depending on the locale's default encoding.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('../data/txt_files/bonikowski_2017.txt', 'r', encoding='utf-8') as f:
    abstract = f.read()

#### `Doc`s


In [ ]:
# Run the pipeline over the abstract and report how many tokens the
# resulting document contains.
doc = nlp(abstract)
token_count = len(doc)
print(f'There are {token_count} tokens in this document.')

In [ ]:
from spacy.tokens import DocBin

# Pack the processed document into a DocBin container and write it to disk.
doc_export = DocBin(docs=[doc])
doc_export.to_disk('../data/misc/bart_bonikowski_doc.spacy')

In [ ]:
# Read the serialized DocBin back from disk, rebuild its documents using the
# current pipeline's vocabulary, and keep the first one as `doc`.
doc_import = DocBin().from_disk('../data/misc/bart_bonikowski_doc.spacy')
docs = [*doc_import.get_docs(nlp.vocab)]
doc = docs[0]
token_count = len(doc)
print(f'There are {token_count} tokens in this document.')

#### `Token`
#### `Span`


# NORMALIZING TEXT VIA LEMMATIZATION


In [ ]:
# Reload the pipeline with 'ner' disabled and the 'lemmatizer' component
# excluded, then add a lemmatizer configured with mode 'rule' and
# initialize it.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["ner"],
    exclude=["lemmatizer"],
)
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
lemmatizer.initialize()

In [ ]:
# Re-process the abstract with the current pipeline, then pair each token's
# text with its lemma.
doc = nlp(abstract)
lemmatized = [
    (tok.text, tok.lemma_)
    for tok in doc
]

In [ ]:
# Among the first 100 (text, lemma) pairs, print only those whose lemma
# differs from the original text when case is ignored.
for text, lemma in lemmatized[:100]:
    if text.lower() == lemma.lower():
        continue
    print(f'{text} ({lemma})')

# PART-OF-SPEECH TAGGING


In [ ]:
# Print each of the first 20 tokens followed by its `pos_` value.
for token in doc[:20]:
    print(f'{token.text} ({token.pos_})')

In [ ]:
# Collect the text of every token tagged 'NOUN' and print the first 20.
nouns = [
    tok.text
    for tok in doc
    if tok.pos_ == 'NOUN'
]
print(nouns[:20])

In [ ]:
# Collect the text of every token tagged 'ADJ'; the trailing expression
# shows the first 20 as the cell's output.
adjectives = [
    tok.text
    for tok in doc
    if tok.pos_ == 'ADJ'
]
adjectives[:20]

In [ ]:
# Keep the text of tokens whose `pos_` is one of the tags listed in `parts`;
# the trailing expression shows the first 20 as the cell's output.
parts = ['NOUN', 'ADJ']
words = [
    tok.text
    for tok in doc
    if tok.pos_ in parts
]
words[:20]

# SYNTACTIC DEPENDENCY PARSING


In [ ]:
# Run the pipeline over a single example sentence.
sentence = nlp(
    'This book is a practical guide to computational social science'
)

## Noun Chunks 


In [ ]:
# Print the text of the first 10 noun chunks found in the document.
first_chunks = list(doc.noun_chunks)[:10]
for chunk in first_chunks:
    print(chunk.text)

## Extracting Words by Dependency Labels: Subject, Verb, Object Triplets


In [ ]:
# For every sentence, gather (head text, token text) pairs for the tokens
# whose dependency label is 'dobj', and print the pairs for that sentence.
# NOTE(review): `tvdo` presumably stands for "transitive verb, direct
# object" — confirm.
for sent in doc.sents:
    tvdo = [
        (tok.head.text, tok.text)
        for tok in sent
        if tok.dep_ == 'dobj'
    ]
    print(tvdo)

In [ ]:
from dcss.svo import subject_verb_object_triples

# Collect everything the helper produces for the document into a list; the
# trailing expression shows that list as the cell's output.
triples = subject_verb_object_triples(doc)
list(triples)

# CONCLUSION
## Key Points 
