# Doing things with text 6

Part-of-speech tagging and named entity recognition _for preprocessed texts_

### Step 0: Download spacy model (only the first time)

See https://spacy.io/models for the available models

In [None]:
# from spacy.cli.download import download
# download(model="nl_core_news_sm") # en_core_web_sm is the standard model for English

### Step 1: Importing required packages 

- `pathlib.Path`: Provides an object-oriented interface for filesystem paths
- `pandas`: Provides tools for handling and analyzing structured data in tables, making it easier to work with datasets.
- `collections.defaultdict`: A dictionary-like object that provides default values for missing keys.
- `collections.Counter`: A dictionary subclass for counting hashable objects.
- `spacy`: A natural language processing library for tasks like tokenization, tagging, and entity recognition.
- `nltk.bigrams`: Creates bigrams (2-word combinations) from a sequence.
- `nltk.collocations`: Provides tools for identifying collocations (frequent word pairings).
- `nltk.FreqDist`: Calculates frequency distributions of items in a dataset.
- `nltk.collocations.*`: Includes utilities for finding collocations like bigram or trigram associations.
- `nltk.WordPunctTokenizer`: Tokenizes text into words and punctuation marks.

In [1]:
from pathlib import Path
import pandas as pd
from collections import defaultdict
from collections import Counter
import spacy
from nltk import WordPunctTokenizer
from nltk import collocations
from nltk import FreqDist
from nltk import bigrams
from nltk.collocations import *

### Step 2: Load the spaCy model

Load the correct (language-specific) spaCy model, which comes with a default stop word list, and add extra stop words as needed

In [2]:
# Load the Dutch pipeline; use another model name for another language
nlp = spacy.load("nl_core_news_sm")

# Raise the maximum number of characters spaCy accepts per text:
# the texts are concatenated per year further down and can get long
nlp.max_length = 2_000_000

# Extend the model's default stop word set in place
nlp.Defaults.stop_words.update({'the'}) # add words as 'word', 'word', 'word'

### Step 3: Define input and output paths

In [11]:
# Build the folders from the current user's home directory instead of
# hard-coding one user's absolute path, so the notebook also runs on other accounts
indir = Path.home() / 'desktop' / 'test2'
outdir = Path.home() / 'desktop' / 'output'
outdir.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist

# Sorted so that the files are always processed in the same order
allfiles = sorted(indir.glob("*.csv"))
if not allfiles:
    # Without input files every later step fails with an unrelated-looking error
    print(f"Warning: no .csv files found in {indir}")

dataset = 'dataset' # give a name to your dataset for outfiles

In [12]:
for file in allfiles:
    print(file)

/Users/huijn001/desktop/test2/dataset_clean.csv


In [13]:
def save_corpus(corpus):
    """Turn a dataset name into a file-name friendly form.

    Every space is replaced by an underscore and the result is lower-cased,
    e.g. 'My Data Set' becomes 'my_data_set'.
    """
    return "_".join(corpus.split(" ")).lower()

### Step 4: Load the data and create a dataframe 
Creates a dataframe `df` with the texts (read from the "text_clean_str" column of the input files) in the "text" column and the dates in the "date" column

#### Step 4a: Create dataframe df

In [14]:
# Collect the dates and texts of all input files in two parallel lists
results = defaultdict(list)

for file in allfiles:
    try:
        # The input files are tab-separated, despite their .csv extension
        df_infile = pd.read_csv(file, sep='\t')

        # The text column is required: report it by name instead of a bare KeyError
        if 'text_clean_str' not in df_infile.columns:
            raise ValueError("required column 'text_clean_str' not found")

        # Empty cells are read as NaN (a float); drop those rows, because a
        # non-string value would make the ' '.join per year in Step 4c fail
        df_infile = df_infile.dropna(subset=['text_clean_str'])

        # Use the 'date' column when present, otherwise fall back to the row index
        if 'date' in df_infile.columns:
            date_column = df_infile['date']
        else:
            date_column = df_infile.index

        # Both sequences come from the same frame, so they stay aligned
        results["date"].extend(date_column)
        results["text"].extend(df_infile['text_clean_str'].astype(str))
    except Exception as e:
        # Best effort: report the problem and continue with the next file
        print(f"Error processing file {file}: {e}")

# Convert the results dictionary into a DataFrame; naming the columns keeps
# 'date' and 'text' available even when no file could be read
df = pd.DataFrame(dict(results), columns=["date", "text"])

In [15]:
print(df.head())

         date                                               text
0  12-11-1980  mijnheer voorzitter begin verontrustende pakke...
1    4-6-1980  mijnheer voorzitter toch eens zeggen bijna tri...
2    2-6-1980  mevrou voorzitter sluit graag staatssecretaris...
3   13-2-1980  emancipatiebeleid heeft ruime aandacht kamer g...
4   13-2-1980  door leden bischoff heemskerck spek vlist jans...


#### Step 4b: Set 'date' column to datetime index

In [16]:
# Parse the day-month-year strings and use them as a chronologically sorted index.
# The whole transformation is one expression, so `df` is only replaced when the
# date parsing succeeded; a failed parse no longer leaves `df` half-converted
# (index already set, 'date' column gone), which made a re-run fail on 'date'.
df = (
    df.assign(date=pd.to_datetime(df["date"], format="%d-%m-%Y"))
    .set_index("date")
    .sort_index()
)

In [17]:
print(df.head())

                                                         text
date                                                         
1970-11-18  uitgangspunt mijnheer voorzitter daarin bestaa...
1970-11-18  mijnheer voorzitter toelichting amendement vra...
1970-11-18  mijnheer voorzitter toelichting amendement vra...
1970-11-18  uitgangspunt mijnheer voorzitter daarin bestaa...
1973-10-09  mijnheer voorzitter begin enkele woorden wijde...


#### Step 4c: Group by year

In [18]:
# Collapse the frame to one row per calendar year: all texts of a year are
# joined into a single string, separated by single spaces.
# Note: this overwrites `df`; its index now holds the years instead of the dates.
df = df.groupby(df.index.year).agg({'text': ' '.join})

In [19]:
print(df.head())

                                                   text
date                                                   
1970  uitgangspunt mijnheer voorzitter daarin bestaa...
1973  mijnheer voorzitter begin enkele woorden wijde...
1974  mijnheer voorzitter geachte afgevaardigde heer...
1975  mijnheer voorzitter over volgende onderwerpen ...
1976  mijnheer voorzitter denkt over onderwijs emanc...


### Step 5: Generate and print Part Of Speech (POS) tags 

#### Step 5a: Create POS tags for the dataset

In [20]:
# Run the spaCy pipeline once per year and keep (year, Doc) pairs,
# so the later steps can reuse the parsed texts
docs = []
for year, text in df['text'].items():
    docs.append((year, nlp(text)))

#### Step 5b: Initiate function and set n

In [31]:
def pos_tag_to_word(pos_tag):
    """Return a readable English name for a coarse-grained POS tag.

    Parameters
    ----------
    pos_tag : str
        Tag as found in `token.pos_`, e.g. 'ADJ' or 'PROPN'.

    Returns
    -------
    str
        The written-out name ('adjective', 'proper noun', ...). A tag without
        an entry in the table is returned lower-cased rather than as None, so
        a heading built from the result never reads "Nones".
    """
    names = {
        'ADJ': 'adjective',
        'ADV': 'adverb',
        'NOUN': 'noun',
        'NUM': 'numeral',
        'PRON': 'pronoun',
        'PROPN': 'proper noun',
        'VERB': 'verb',
        'SYM': 'symbol',
    }
    return names.get(pos_tag, str(pos_tag).lower())
    

In [37]:
# Print the most frequent tokens of one POS tag, year by year
def extract_top_tokens(docs, pos_tag, n):
    """Print the `n` most frequent tokens tagged `pos_tag` for every year.

    `docs` is an iterable of (year, doc) pairs. Stop words and punctuation
    are left out of the count. Nothing is returned; per year a heading, the
    list of (token, count) pairs and a blank separator are printed.
    """
    for year, doc in docs:
        counts = Counter(
            tok.text
            for tok in doc
            if not (tok.is_stop or tok.is_punct) and tok.pos_ == pos_tag
        )

        heading = f'These are the top {n} {pos_tag_to_word(pos_tag)}s from {year}:'
        print(heading)
        print(counts.most_common(n))
        print('\n')

In [34]:
n = 10

Explain what the spaCy abbreviations stand for

In [25]:
spacy.explain("SYM")

'symbol'

#### Step 5c: Print top adjectives

In [38]:
extract_top_tokens(docs, 'ADJ', n)

These are the top 10 adjectives from 1970:
[('geachte', 20), ('bijzonder', 12), ('maatschappelijk', 8), ('grote', 6), ('duidelijk', 6), ('nationale', 6), ('goede', 6), ('verschillende', 6), ('circulaire', 4), ('nederlandse', 4)]


These are the top 10 adjectives from 1973:
[('tweede', 6), ('politieke', 4), ('hoger', 4), ('gelijke', 4), ('diep', 2), ('tragisch', 2), ('grote', 2), ('militair', 2), ('eenzijdige', 2), ('bestand', 2)]


These are the top 10 adjectives from 1974:
[('geachte', 16), ('tweede', 14), ('bijzonder', 8), ('verschillende', 8), ('maatschappelijke', 8), ('langedijk', 6), ('grote', 6), ('schriftelijk', 6), ('gedeeltelijk', 6), ('nederlandse', 6)]


These are the top 10 adjectives from 1975:
[('sociale', 22), ('verschillende', 20), ('bijzonder', 16), ('nader', 14), ('jongeren', 12), ('vrouwe', 12), ('korte', 10), ('natuurlijk', 10), ('bezig', 10), ('regionaal', 8)]


These are the top 10 adjectives from 1976:
[('maatschappelijk', 24), ('nodig', 14), ('open', 12), ('nieu

#### Step 5d: Print top nouns

In [39]:
extract_top_tokens(docs, 'NOUN', n)

These are the top 10 nouns from 1970:
[('voorzitter', 22), ('zaak', 20), ('mijnheer', 16), ('premie', 16), ('pensioenregeling', 16), ('verschillen', 12), ('financiering', 12), ('mening', 12), ('stand', 12), ('kinderdagverblijven', 12)]


These are the top 10 nouns from 1973:
[('mensen', 14), ('onderwijs', 14), ('voorzitter', 6), ('basis', 6), ('benadering', 6), ('kabinet', 6), ('mevrouw', 6), ('verhaal', 6), ('mijnheer', 4), ('woorden', 4)]


These are the top 10 nouns from 1974:
[('mijnheer', 20), ('voorzitter', 20), ('onderwijs', 20), ('heer', 16), ('minister', 14), ('onderzoek', 14), ('punt', 12), ('aantal', 12), ('opmerkingen', 12), ('beleid', 10)]


These are the top 10 nouns from 1975:
[('minister', 36), ('zaken', 30), ('werk', 30), ('nota', 26), ('beleid', 22), ('positie', 18), ('mensen', 18), ('situatie', 16), ('jaar', 16), ('mogelijkheden', 16)]


These are the top 10 nouns from 1976:
[('beleid', 32), ('werk', 30), ('onderwijs', 26), ('mensen', 22), ('voorzitter', 20), ('samen

#### Step 5e: Print top proper nouns

In [40]:
extract_top_tokens(docs, 'PROPN', n)

These are the top 10 proper nouns from 1970:
[('tilanus', 6), ('augustus', 4), ('kamer', 4), ('reces', 2), ('uitging', 2), ('particulier', 2), ('dettmeijer', 2), ('verheugt', 2), ('ermede', 2), ('instem', 2)]


These are the top 10 proper nouns from 1973:
[('september', 4), ('syrië', 2), ('israël', 2), ('prinsjesdag', 2), ('nederland', 2), ('smit', 2), ('high', 2), ('wiegel', 2), ('hedenmiddag', 2)]


These are the top 10 proper nouns from 1974:
[('vonhoff', 12), ('nederland', 4), ('levenssfeer', 2), ('leeuwen', 2), ('dienstverleningen', 2), ('januari', 2), ('november', 2), ('kleisieriee', 2), ('wolff', 2)]


These are the top 10 proper nouns from 1975:
[('mevrou', 8), ('waaro', 6), ('kinderdagverblijven', 4), ('overblijfcentra', 4), ('vrou', 4), ('kamer', 4), ('adres', 4), ('verbrugh', 4), ('februari', 2), ('hetvrijwilligerswerk', 2)]


These are the top 10 proper nouns from 1976:
[('kamer', 8), ('tweede', 6), ('november', 6), ('worrell', 6), ('oktober', 4), ('zweden', 2), ('frankrijk

#### Step 5f: Print top verbs

In [41]:
extract_top_tokens(docs, 'VERB', n)

These are the top 10 verbs from 1970:
[('afgevaardigde', 16), ('komen', 10), ('zeggen', 10), ('stellen', 10), ('lijkt', 8), ('gevraagd', 8), ('betreft', 8), ('weet', 8), ('gaat', 8), ('komt', 8)]


These are the top 10 verbs from 1973:
[('maakt', 4), ('staat', 4), ('gezegd', 4), ('gericht', 4), ('gerealiseerd', 4), ('wijden', 2), ('uitgerekend', 2), ('veroordeelt', 2), ('egypte', 2), ('sluiten', 2)]


These are the top 10 verbs from 1974:
[('afgevaardigde', 14), ('gesteld', 10), ('willen', 10), ('maken', 10), ('gemaakt', 8), ('zeggen', 8), ('gezegd', 8), ('gericht', 8), ('spreekt', 6), ('geraakt', 6)]


These are the top 10 verbs from 1975:
[('gaan', 28), ('betreft', 22), ('maken', 20), ('komen', 20), ('gaat', 20), ('zeggen', 18), ('barendregt', 18), ('willen', 16), ('gezegd', 16), ('gemaakt', 14)]


These are the top 10 verbs from 1976:
[('willen', 26), ('komen', 22), ('gezien', 20), ('stellen', 16), ('gebracht', 16), ('bewindslieden', 16), ('arbeidsmarkt', 12), ('zeggen', 12), ('vrag

### Step 6: Print ALL words with a particular POS-tag (NOUN, ADJ, VERB, PRON, PROPN, SYM, etc.)

#### WARNING: Only use this code if your dataset is manageable

In [None]:
# Print every word that carries a given POS tag, year by year
def print_all_words_by_pos(docs, pos_tag):
    """Print all tokens tagged `pos_tag` for every (year, doc) pair in `docs`.

    Nothing is filtered out besides the POS tag itself, so the printed lists
    contain repeated words. Nothing is returned.
    """
    for year, doc in docs:
        matching = []
        for tok in doc:
            if tok.pos_ == pos_tag:
                matching.append(tok.text)

        # Heading, the token list and a blank separator, each on its own line
        print(f'All {pos_tag.lower()}s in {year}:', matching, '\n', sep='\n')

In [None]:
print_all_words_by_pos(docs, 'NOUN')

### Step 7: Create Named Entity list 
(doesn't work well for Dutch)

In [None]:
ner_lst = nlp.pipe_labels['ner']
print(len(ner_lst))
print(ner_lst)

In [None]:
# Entity labels to report; entities with any other label are skipped
NER_list = ['EVENT', 'FAC', 'LAW', 'LOC', 'MONEY', 'ORG', 'PERSON', 'PRODUCT']

# Reuse the Docs parsed in Step 5a instead of running the pipeline over every text again
for year, doc in docs:
    # `year` is an integer taken from the index, so it is formatted into the
    # heading; concatenating it to a string with + raises a TypeError
    print(f'Named entities in {year}:')
    for ent in doc.ents:
        if ent.label_ in NER_list:
            print(ent.text, ent.label_)
    # One blank separator per year (not after every single entity)
    print('\n')
 

### Step 8: POS Collocations

In [None]:
# Words whose collocates are looked up (each term is matched as the first word of a bigram).
# NOTE(review): the empty string looks like a placeholder - fill in the term(s) to search for.
search_terms = ['']
# Window size(s) handed to the collocation finder; one output file is written per window
windows = [10] # add or change to smaller/larger window
# Association measure(s) to score with; any other name falls back to raw frequency
algorithms = ['likelihood', 'pmi'] # 'likelihood', 'pmi', 'raw_freq'
# Maximum number of collocates shown per year
coll_to_print = 10

In [None]:
# Keep only the content tokens that carry one of the wanted POS tags
def filter_tokens_by_pos(doc, pos_tags):
    """Return the texts of the tokens in `doc` whose POS tag is in `pos_tags`.

    Stop words and punctuation are dropped; the original token order is kept.
    """
    kept = []
    for tok in doc:
        if tok.pos_ not in pos_tags:
            continue
        if tok.is_stop or tok.is_punct:
            continue
        kept.append(tok.text)
    return kept

In [None]:
outdir_coll = outdir / f'{save_corpus(dataset)}_pos-collocations'
outdir_coll.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist

# POS tags whose tokens take part in the collocation search (adjectives only here;
# use e.g. {'NOUN', 'PROPN'} for nouns and proper nouns)
collocation_pos_tags = {'ADJ'}

bigram_measures = collocations.BigramAssocMeasures()
# Any algorithm name that is not listed here falls back to raw frequency
score_functions = {
    'likelihood': bigram_measures.likelihood_ratio,
    'pmi': bigram_measures.pmi,
}


def _print_and_write(outfile, *values, sep=' '):
    """Print `values` in the notebook and write the same line to `outfile`."""
    print(*values, sep=sep)
    print(*values, sep=sep, file=outfile)


# The tokens of a year do not depend on term, window or algorithm: prepare them once
tokens_by_year = [
    (year, WordPunctTokenizer().tokenize(' '.join(filter_tokens_by_pos(doc, collocation_pos_tags))))
    for year, doc in docs
]

# One finder per window and year. from_words counts the word pairs inside the
# window itself; building the finder from adjacent-only bigram counts while
# passing window_size only rescales the scores and finds no windowed pairs.
# Note: from_words requires a window of at least 2.
#preprocessing: remove short words and stop words (see above) if only relevant for collocations
#finder.apply_word_filter(lambda w: len(w) < 4 or w.lower() in stopwords)
finders_by_window = {
    window: [
        (year, collocations.BigramCollocationFinder.from_words(tokens, window_size=window))
        for year, tokens in tokens_by_year
    ]
    for window in windows
}

for term in search_terms:
    for window in windows:
        for algorithm in algorithms:
            score_fn = score_functions.get(algorithm, bigram_measures.raw_freq)
            outfile_coll = f'{term}_{algorithm}_pos-collocations_{window}.txt'
            outpath_coll = outdir_coll / outfile_coll

            # 'w' so that re-running the cell replaces the file instead of appending
            # a second copy of the results; utf-8 so that accented words are written
            # the same way on every platform
            with open(outpath_coll, 'w', encoding='utf-8') as f:
                _print_and_write(f, 'Top %s %s collocations of \'%s\' with a window of %s in %s:\n' %(str(coll_to_print), algorithm, term, str(window), dataset))

                for year, finder in finders_by_window[window]:
                    scored = finder.score_ngrams(score_fn)

                    # Keep the bigrams that start with the search term, strongest association first
                    collocates = sorted(
                        ((second, score) for (first, second), score in scored if first == term),
                        key=lambda pair: -pair[1],
                    )

                    # Print top collocations of term.
                    _print_and_write(f, str(year) + ':')
                    _print_and_write(f, *collocates[:coll_to_print], sep='\n')
                    _print_and_write(f, '\n')