# Doing things with text 6

Part-of-speech tagging and named entity recognition _for preprocessed texts_

### Step 0: Download spacy model (only the first time)

See https://spacy.io/models for the available models

In [None]:
# from spacy.cli.download import download
# download(model="nl_core_news_sm") # en_core_web_sm is the standard model for English

### Step 1: Importing required packages 

- `pathlib.Path`: Provides an object-oriented interface for filesystem paths
- `pandas`: Provides tools for handling and analyzing structured data in tables, making it easier to work with datasets.
- `collections.defaultdict`: A dictionary-like object that provides default values for missing keys.
- `collections.Counter`: A dictionary subclass for counting hashable objects.
- `spacy`: A natural language processing library for tasks like tokenization, tagging, and entity recognition.
- `nltk.bigrams`: Creates bigrams (2-word combinations) from a sequence.
- `nltk.collocations`: Provides tools for identifying collocations (frequent word pairings).
- `nltk.FreqDist`: Calculates frequency distributions of items in a dataset.
- `nltk.collocations.*`: Includes utilities for finding collocations like bigram or trigram associations.
- `nltk.WordPunctTokenizer`: Tokenizes text into words and punctuation marks.

In [None]:
from pathlib import Path
import pandas as pd
from collections import defaultdict
from collections import Counter
import spacy
from nltk import WordPunctTokenizer
from nltk import collocations
from nltk import FreqDist
from nltk import bigrams
from nltk.collocations import *

### Step 2: Load the spaCy model and stop words

Load the correct (language-specific) spaCy model, load the default spaCy stop word list, and add words as needed

In [None]:
# Load the language-specific pipeline; change the model name to match the language of the texts.
nlp = spacy.load("nl_core_news_sm")
# Raise spaCy's per-text length limit so that long documents are accepted.
nlp.max_length = 2000000
# NOTE(review): this extends the language-wide default stop word set after the
# model is loaded; presumably tokens created afterwards pick it up via
# token.is_stop -- confirm on a sample text.
nlp.Defaults.stop_words |= {'the'} # add words as 'word', 'word', 'word'

### Step 3: Define input and output paths

In [None]:
# Placeholder paths: point indir at the folder holding the input .txt files
# and outdir at the folder where results should be written.
indir = Path('/Path/to/indir/')
outdir = Path('/Path/to/outdir')
outdir.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist

# Every .txt file directly inside indir, in sorted order
allfiles = sorted(indir.glob("*.txt"))

dataset = 'dataset' # give a name to your dataset for outfiles

In [None]:
def save_corpus(corpus):
    """Return *corpus* as a file-name-friendly slug.

    The name is lower-cased and every space is replaced by an underscore.
    """
    return "_".join(corpus.lower().split(" "))

### Step 4: Load the data and create a dataframe 
The dataframe holds the texts in the "text" column and the file name without extension (= year) in the "year" column

In [None]:
# Read every input file into a table with one row per file:
#   "year" = file name without its extension, "text" = the file's contents.
results = defaultdict(list)

for infile in allfiles:
    try:
        # Explicit encoding so the result does not depend on the platform
        # default; change "utf-8" if the corpus is stored in another encoding.
        with open(infile, "r", encoding="utf-8") as f:
            text = f.read()
    except UnicodeDecodeError as err:
        # Say which file failed and why, then skip it so that no row is added
        # with missing text or with text left over from the previous file.
        print(f"Skipping {infile.name}: {err}")
        continue
    results["year"].append(infile.stem)
    results["text"].append(text)

# Naming the columns keeps "year" and "text" present even when no file was read.
df = pd.DataFrame(results, columns=["year", "text"])

In [None]:
# Quick check: show the first rows of the freshly loaded dataframe
print(df.head())

Optionally turn the "year" column into a datetime column (the code below is commented out by default)

#### Step 4a: Set 'year' column to index and sort

In [None]:
#df["date"] = pd.to_datetime(df["year"], format ="%Y")

In [None]:
# Use "year" as the row index and order the rows by it
df = df.set_index("year").sort_index()

In [None]:
# Quick check: the dataframe should now be indexed and sorted by year
print(df.head())

### Step 5: Generate and print Part Of Speech (POS) tags 

#### Step 5a: Create POS tags for the dataset

In [None]:
# Run the spaCy pipeline once per text and keep each parsed Doc next to its year
docs = [(year, nlp(text)) for year, text in df['text'].items()]

#### Step 5b: Define the helper functions and set n

In [None]:
# Readable singular names for the coarse-grained POS tags found in token.pos_
_POS_TAG_WORDS = {
    'ADJ': 'adjective',
    'ADV': 'adverb',
    'DET': 'determiner',
    'INTJ': 'interjection',
    'NOUN': 'noun',
    'NUM': 'numeral',
    'PRON': 'pronoun',
    'PROPN': 'proper noun',
    'SYM': 'symbol',
    'VERB': 'verb',
}


def pos_tag_to_word(pos_tag):
    """Return a readable, singular name for a POS tag.

    Args:
        pos_tag: Coarse-grained POS tag such as 'ADJ' or 'NOUN'.

    Returns:
        The matching word (for example 'adjective'). Tags without an entry in
        the lookup table fall back to the lower-cased tag itself, so callers
        always get a printable string instead of None.
    """
    return _POS_TAG_WORDS.get(pos_tag, str(pos_tag).lower())

In [None]:
# Print the n most frequent tokens with a given POS tag, year by year
def extract_top_tokens(docs, pos_tag, n):
    """Print, for every (year, doc) pair, the n most common tokens tagged pos_tag.

    Stop words and punctuation are left out of the counts. Nothing is returned;
    the ranking is written to standard output as a list of (token, count) pairs.
    """
    for year, doc in docs:
        counts = Counter(
            tok.text
            for tok in doc
            if not (tok.is_stop or tok.is_punct) and tok.pos_ == pos_tag
        )

        print(f'These are the top {n} {pos_tag_to_word(pos_tag)}s from {year}:')
        print(counts.most_common(n))
        print('\n')

In [None]:
# Number of top tokens to print per year in the cells below
n = 40

Explain what the spaCy abbreviations stand for

In [None]:
# Ask spaCy for the description of a label; replace "SYM" with any tag or entity label
spacy.explain("SYM")

#### Step 5c: Print top adjectives

In [None]:
# Top n adjectives (ADJ) per year
extract_top_tokens(docs, 'ADJ', n)

#### Step 5d: Print top nouns

In [None]:
# Top n nouns (NOUN) per year
extract_top_tokens(docs, 'NOUN', n)

#### Step 5e: Print top proper nouns

In [None]:
# Top n proper nouns (PROPN) per year
extract_top_tokens(docs, 'PROPN', n)

#### Step 5f: Print top verbs

In [None]:
# Top n verbs (VERB) per year
extract_top_tokens(docs, 'VERB', n)

### Step 6: Print ALL words with a particular POS-tag (NOUN, ADJ, VERB, PRON, PROPN, SYM, etc.)

#### WARNING: Only use this code if your dataset is manageable

In [None]:
# Print every token with a given POS tag, year by year (no frequency cut-off)
def print_all_words_by_pos(docs, pos_tag):
    """Print, for every (year, doc) pair, the text of all tokens tagged pos_tag.

    Unlike the top-n report, nothing is filtered out or counted: tokens appear
    in document order, repeats included. Nothing is returned.
    """
    for year, doc in docs:
        matching = []
        for tok in doc:
            if tok.pos_ == pos_tag:
                matching.append(tok.text)

        # Header, the token list, and a blank separator in a single call
        print(f'All {pos_tag.lower()}s in {year}:', matching, '\n', sep='\n')

In [None]:
# List every noun per year; swap 'NOUN' for another POS tag as needed
print_all_words_by_pos(docs, 'NOUN')

### Step 7: Create Named Entity list 
(doesn't work well for Dutch)

In [None]:
# Labels registered for the pipeline's "ner" component: how many there are, then the labels themselves
ner_lst = nlp.pipe_labels['ner']
print(len(ner_lst))
print(ner_lst)

In [None]:
# Entity labels to report; edit to taste (the cell above prints every label the model knows)
NER_list = ['EVENT', 'FAC', 'LAW', 'LOC', 'MONEY', 'ORG', 'PERSON', 'PRODUCT']
wanted_labels = set(NER_list)

# Reuse the Docs parsed in Step 5a instead of running the whole pipeline on
# every text a second time; doc.ents is already available on them.
for year, doc in docs:
    print(f'Named entities in {year}:')
    for ent in doc.ents:
        if ent.label_ in wanted_labels:
            print(ent.text, ent.label_)
    # One blank separator per year, matching the other per-year reports
    print('\n')
 

### Step 8: POS Collocations

In [None]:
# Words whose collocates should be reported; replace '' with the term(s) of interest.
# Each term is also used as the first part of its output file name.
search_terms = ['']
windows = [10] # add or change to smaller/larger window
algorithms = ['likelihood', 'pmi'] # 'likelihood', 'pmi', 'raw_freq'
# Maximum number of collocations printed per year
coll_to_print = 10

In [None]:
# Keep only the tokens whose POS tag is wanted, dropping stop words and punctuation
def filter_tokens_by_pos(doc, pos_tags):
    """Return the text of each token in doc that has one of the given POS tags.

    Tokens flagged as stop words or punctuation are left out. The result is a
    list of strings in document order.
    """
    kept = []
    for token in doc:
        if token.pos_ not in pos_tags:
            continue
        if token.is_stop or token.is_punct:
            continue
        kept.append(token.text)
    return kept

In [None]:
# Report, per year, the strongest collocates of each search term among the
# POS-filtered tokens. Every line goes both to the notebook output and to a
# text file in outdir_coll (one file per term / algorithm / window).
outdir_coll = outdir / f'{save_corpus(dataset)}_pos-collocations'
outdir_coll.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist

# POS tags that take part in the collocation search,
# e.g. {'NOUN', 'PROPN'} for nouns and proper nouns.
coll_pos_tags = {'ADJ'}

bigram_measures = collocations.BigramAssocMeasures()
# Association measure per algorithm name; any other name falls back to raw frequency.
score_functions = {
    'likelihood': bigram_measures.likelihood_ratio,
    'pmi': bigram_measures.pmi,
}

# A year's token stream does not depend on term, window or algorithm, so build it once.
word_tokenizer = WordPunctTokenizer()
tokens_by_year = [
    (year, word_tokenizer.tokenize(' '.join(filter_tokens_by_pos(doc, coll_pos_tags))))
    for year, doc in docs
]


def _emit(*values, file, **print_kwargs):
    """Print the same values to the notebook output and to the open report file."""
    print(*values, **print_kwargs)
    print(*values, file=file, **print_kwargs)


for term in search_terms:
    for window in windows:
        for algorithm in algorithms:
            score_fn = score_functions.get(algorithm, bigram_measures.raw_freq)
            outfile_coll = f'{term}_{algorithm}_pos-collocations_{window}.txt'
            outpath_coll = outdir_coll / outfile_coll

            # NOTE: append mode -- re-running this cell adds another copy of
            # the report to an already existing file.
            with open(outpath_coll, 'a', encoding='utf-8') as f:
                _emit(f"Top {coll_to_print} {algorithm} collocations of '{term}' "
                      f"with a window of {window} in {dataset}:\n", file=f)
                for year, tokens in tokens_by_year:
                    # from_words counts every pair that co-occurs inside the
                    # window. Passing window_size together with adjacent-only
                    # bigram counts would never look beyond the next word and
                    # would only skew the score normalisation.
                    finder = BigramCollocationFinder.from_words(tokens, window_size=window)

                    #preprocessing: remove short words and stop words (see above) if only relevant for collocations
                    #finder.apply_word_filter(lambda w: len(w) < 4 or w.lower() in stopwords)

                    # score_ngrams returns the pairs ordered from highest to
                    # lowest score, so keeping the pairs that start with the
                    # term leaves them sorted by strongest association.
                    scored = finder.score_ngrams(score_fn)
                    top_collocates = [
                        (second, score)
                        for (first, second), score in scored
                        if first == term
                    ][:coll_to_print]

                    # Print top collocations of term.
                    _emit(f'{year}:', file=f)
                    _emit(*top_collocates, sep='\n', file=f)
                    _emit('\n', file=f)