# Doing things with text 6

## Part-of-speech tagging and named entity recognition _for preprocessed texts_

In [None]:
import pandas as pd
import os
from collections import defaultdict
from collections import Counter
import datetime
import spacy

Download the spaCy model if needed (see https://spacy.io/models).

In [None]:
# from spacy.cli.download import download
# download(model="nl_core_news_sm") # en_core_web_sm is the standard model for English

In [None]:
# Load the spaCy pipeline and raise its limit on the length of a single text
# so that long documents can be processed in one call.
nlp = spacy.load("nl_core_news_sm")
nlp.max_length = 2_000_000

# Extra stop words on top of the model defaults; add words as 'word', 'word', 'word'.
extra_stop_words = {'the'}
nlp.Defaults.stop_words.update(extra_stop_words)

#### Define in- and out-directories

In [None]:
# Directory holding the input text files and directory for any output files.
indir = r'/path/to/indir/'
outdir = r'/path/to/outdir/'

# Create outdir itself if it doesn't exist already. Passing outdir directly
# (rather than os.path.dirname(outdir)) works with or without a trailing
# separator; dirname() would return only the parent directory, or an empty
# string for a bare directory name, when the trailing separator is missing.
os.makedirs(outdir, exist_ok=True)

dataset = 'dataset' # give a name to your dataset for outfiles

### Create a dataframe 
Build a DataFrame with the texts in the "text" column and the file name without its extension (= the year) in the "year" column.

In [None]:
# Collect one row per input file: the file name without its extension goes
# into "year" and the file's full contents into "text".
results = defaultdict(list)

# List all files in the input directory, skipping hidden entries such as .DS_Store.
files = [f for f in os.listdir(indir) if not f.startswith('.')]

for infile in files:
    # os.path.join works whether or not indir ends with a path separator.
    path = os.path.join(indir, infile)
    try:
        # The context manager closes the file when done.
        with open(path, "r") as f:
            text = f.read()
    except (OSError, UnicodeDecodeError) as err:
        # Report the actual error and skip this file. Without the skip, a
        # failed read would either raise NameError (first file) or silently
        # store the previous file's text under this file's name.
        print(f"Skipping {infile!r}: {err}")
        continue
    # Strip the extension whatever its length (e.g. "1990.txt" -> "1990").
    results["year"].append(os.path.splitext(infile)[0])
    results["text"].append(text)

# Naming the columns keeps both present even when no file could be read.
df = pd.DataFrame(results, columns=["year", "text"])

In [None]:
# Preview the first rows of the freshly built frame ("year" and "text" columns).
print(df.head())

Parse the "year" column into a datetime "date" column and set it as the index

In [None]:
# Parse the year strings taken from the file names into timestamps and store
# them in a new "date" column.
year_stamps = pd.to_datetime(df["year"], format="%Y")
df["date"] = year_stamps

In [None]:
# Use the parsed dates as the index and order the rows chronologically.
df = df.set_index("date").sort_index()

In [None]:
# Preview the first rows again, now indexed and sorted by "date".
print(df.head())

In [None]:
# Rebuild the "year" column as year strings derived from the datetime index.
year_labels = df.index.strftime('%Y')
df['year'] = year_labels

## POS tags 

In [None]:
# Number of most common words to print per POS tag in the cells below.
n = 40

#### Print the n most common words for a particular POS tag (adjectives, verbs, proper nouns and nouns, respectively)

### Adjectives

In [None]:
# zip lets us walk the "year" and "text" columns in lockstep.
for year, text in zip(df["year"], df["text"]):
    doc = nlp(text)

    # Surface forms of all adjectives, leaving out stop words and punctuation.
    adjs = [
        token.text
        for token in doc
        if token.pos_ == "ADJ" and not (token.is_stop or token.is_punct)
    ]

    print(f"These are the top {n} adjectives from {year} :")

    # The n most frequent adjectives as (word, count) pairs.
    adj_freq = Counter(adjs)
    common_adjs = adj_freq.most_common(n)
    print(common_adjs)

    print('\n')

### Verbs

In [None]:
# Walk the "year" and "text" columns in lockstep.
for year, text in zip(df["year"], df["text"]):
    doc = nlp(text)

    # Surface forms of all verbs, leaving out stop words and punctuation.
    verbs = [
        token.text
        for token in doc
        if token.pos_ == "VERB" and not (token.is_stop or token.is_punct)
    ]

    print(f"These are the top {n} verbs from {year} :")

    # The n most frequent verbs as (word, count) pairs.
    verbs_freq = Counter(verbs)
    common_verbs = verbs_freq.most_common(n)
    print(common_verbs)

    print('\n')

### Proper nouns

In [None]:
# Walk the "year" and "text" columns in lockstep.
for year, text in zip(df["year"], df["text"]):
    doc = nlp(text)

    # Surface forms of all proper nouns, leaving out stop words and punctuation.
    pnouns = [
        token.text
        for token in doc
        if token.pos_ == "PROPN" and not (token.is_stop or token.is_punct)
    ]

    print(f"These are the top {n} proper nouns from {year} :")

    # The n most frequent proper nouns as (word, count) pairs.
    pnoun_freq = Counter(pnouns)
    common_pnouns = pnoun_freq.most_common(n)
    print(common_pnouns)

    print('\n')

### Nouns

In [None]:
# Walk the "year" and "text" columns in lockstep.
for year, text in zip(df["year"], df["text"]):
    doc = nlp(text)

    # Surface forms of all nouns, leaving out stop words and punctuation.
    nouns = [
        token.text
        for token in doc
        if token.pos_ == "NOUN" and not (token.is_stop or token.is_punct)
    ]

    print(f"These are the top {n} nouns from {year} :")

    # The n most frequent nouns as (word, count) pairs.
    noun_freq = Counter(nouns)
    common_nouns = noun_freq.most_common(n)
    print(common_nouns)

    print('\n')

#### To print ALL words with a particular POS-tag (NOUN, ADJ, VERB, PRON, PROPN, SYM, etc.)

In [None]:
# Look up spaCy's description of a tag; as the last expression in the cell,
# the result is shown as the cell output.
spacy.explain("SYM")

In [None]:
pos_tag = 'SYM'  # change POS-tag here

# For each text, print every token carrying the chosen coarse POS tag as
# (text, fine-grained tag, coarse POS tag).
for year, text in zip(df["year"], df["text"]):
    doc = nlp(text)
    tagged = [
        (token.text, token.tag_, token.pos_)
        for token in doc
        if token.pos_ == pos_tag
    ]
    print(tagged)

## Create Named Entity list 
(doesn't work well for Dutch)

In [None]:
# Entity labels reported by the pipeline's "ner" component: print how many
# there are, then the labels themselves on the next line.
ner_lst = nlp.pipe_labels['ner']
print(len(ner_lst), ner_lst, sep="\n")

In [None]:
# Entity labels to report; entities with any other label are ignored.
NER_list = ['EVENT', 'FAC', 'LAW', 'LOC', 'MONEY', 'ORG', 'PERSON', 'PRODUCT']

for year, text in zip(df["year"], df["text"]):
    doc = nlp(text)

    print(f"Named entities in {year}:")
    for ent in doc.ents:
        if ent.label_ not in NER_list:
            continue
        # Entity text and its label, followed by a blank separator.
        print(ent.text, ent.label_)
        print('\n')
 