# Doing things with text 4: Keywords (for csvs)

This notebook provides various functionalities for keyword searches in a corpus of (cleaned) texts:
* keyword frequency
* n-grams
* collocations

The code assumes that the input is a series of tab-separated csv files that contain the cleaned texts in a "text_clean_str" column and the date of each text (formatted as day-month-year, for example 31-12-1980) in a "date" column.

### Step 1: Importing required packages

- `pathlib.Path`: Provides an object-oriented interface for filesystem paths
- `pandas`: Provides data structures and tools for data manipulation and analysis.
- `collections.defaultdict`: Creates dictionaries with default values for missing keys.
- `collections.Counter`: Counts occurrences of elements in an iterable.
- `datetime`: Supplies classes for manipulating dates and times.
- `nltk.util.ngrams`: Generates n-grams from a sequence.
- `nltk.bigrams`: Creates bigrams (2-word combinations) from a sequence.
- `nltk.collocations`: Provides tools for identifying collocations (frequent word pairings).
- `nltk.FreqDist`: Calculates frequency distributions of items in a dataset.
- `nltk.collocations.*`: Includes utilities for finding collocations like bigram or trigram associations.
- `nltk.WordPunctTokenizer`: Tokenizes text into words and punctuation marks.
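- `matplotlib.pyplot`: Creates static, animated, and interactive visualizations in Python.
- `seaborn`: Provides a high-level interface for statistical graphics on top of matplotlib.
- `numpy`: Provides array operations; used here to fit trend lines.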

In [None]:
import datetime
from collections import Counter
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import FreqDist
from nltk import WordPunctTokenizer
from nltk import bigrams
from nltk import collocations
from nltk.collocations import *
from nltk.util import ngrams

### Step 2: Define input and output paths

Define where your csv files are located (indir) and where you want to save the output files (outdir).

In [None]:
# Define input and output paths
indir = Path('/Path/to/indir/')    # folder that holds the input csv files
outdir = Path('/Path/to/outdir/')  # folder that receives the plots and text output
outdir.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist

# Sorted so the files are always processed in the same order
allfiles = sorted(indir.glob("*.csv"))

dataset = 'dataset'  # name of your actual dataset; used in plot titles and output file names

In [None]:
# Show every input file that was found, one path per line
for file in allfiles:
    print(f"{file}")

In [None]:
# Preview one tab-separated input file to check its columns
df_test = pd.read_csv(indir.joinpath('dataset_clean.csv'), sep='\t')
print(df_test.head(5))

In [None]:
def save_corpus(corpus):
    """Return *corpus* lower-cased with every space replaced by an underscore."""
    lowered = corpus.lower()
    return lowered.replace(" ", "_")

def to_string(list):
    """Join the given strings into one string, separated by underscores."""
    # The parameter name shadows the builtin `list`; it is kept so existing
    # callers are not affected.
    separator = '_'
    return separator.join(list)

def to_title(words):
    """Format *words* as a quoted, human-readable enumeration.

    Returns '' for an empty input, "'a'" for one word, and
    "'a', 'b' and 'c'" for several words.
    """
    if not words:
        return ''
    quoted = [f"'{word}'" for word in words]
    if len(quoted) == 1:
        return quoted[0]
    # Comma-separate everything but the last word, which is attached with "and"
    return f"{', '.join(quoted[:-1])} and {quoted[-1]}"

### Step 3: Load the data into a dataframe 
The input files must contain the texts in a "text_clean_str" column and the date in a "date" column. All files are combined into one dataframe with a "date" and a "text" column.

#### Step 3a: Create dataframe df

In [None]:
# Collect the date and the cleaned text of every row of every input file
results = defaultdict(list)

for file in allfiles:
    try:
        # Load the tab-separated CSV into a DataFrame
        df_infile = pd.read_csv(file, sep='\t')

        # Use the 'date' column when present, otherwise fall back to the row index
        # NOTE(review): the fallback yields row labels, not dates - confirm this is wanted
        if 'date' in df_infile.columns:
            date_column = df_infile['date']
        else:
            date_column = df_infile.index

        # Raises KeyError (reported below) when the text column is missing
        text_column = df_infile['text_clean_str']
    except (OSError, KeyError, ValueError) as e:
        # Unreadable, unparsable or incomplete file: report it and continue with
        # the next one. Unexpected errors are deliberately not swallowed.
        print(f"Error processing file {file}: {e}")
        continue

    # Append the data to results
    for date, text in zip(date_column, text_column):
        if pd.isna(text):
            # Empty cells are read as NaN and would break the later ' '.join of texts
            continue
        results["date"].append(date)
        results["text"].append(str(text))

# Convert the results dictionary into a DataFrame; naming the columns keeps
# "date" and "text" available even when no rows were loaded
df = pd.DataFrame(results, columns=["date", "text"])

Check the dataframe

In [None]:
# Show the first rows of the combined dataframe
print(df.head())

#### Step 3b: Turn "date" column into datetime and set as index

In [None]:
# Parse the day-month-year strings in "date", make them the index and put the
# rows in chronological order
df = (
    df.assign(date=pd.to_datetime(df["date"], format="%d-%m-%Y"))
      .set_index("date")
      .sort_index()
)

In [None]:
# Show the first rows after the date has become the (sorted) index
print(df.head())

#### Step 3c: Group by year

In [None]:
# Group the rows by year and concatenate the texts of each year into one string
df = df.groupby(df.index.year).agg({'text': ' '.join})

# groupby leaves a plain integer index (1980, 1981, ...). Turn it back into a
# DatetimeIndex (1 January of each year) because the following steps read the
# year through `df.index.year`, which only exists on a DatetimeIndex.
df.index = pd.to_datetime(df.index.astype(str), format='%Y')

In [None]:
# Show the first rows of the per-year dataframe
print(df.head())

### (Optional) Step 4: Removing custom stopwords

In [None]:
# Custom stopwords that are removed from the texts below; replace the placeholders
stopwords = ['word', 'word']

In [None]:
# Words with fewer characters than this are removed from the texts below
minimum_word_length = 4

In [None]:
# A set gives constant-time membership tests; with a list every single word
# of the corpus would be compared against every stopword
stopword_set = set(stopwords)

# Keep only words that are long enough and are not stopwords, then rebuild each
# text as one space-separated string
df['text'] = df['text'].apply(
    lambda text: ' '.join(
        word for word in text.split(' ')
        if len(word) >= minimum_word_length and word not in stopword_set
    )
)

### Step 5: Count and plot total words per year (or document)

In [None]:
def word_count(string):
    """Return the number of whitespace-separated words in *string*."""
    return len(string.split())

In [None]:
# Store the number of words of each text in a new "num_words" column
df['num_words'] = df['text'].map(word_count)

In [None]:
# Bar chart of the total number of words per year
fig = plt.figure(figsize = (15,8))
# Plot from `df` (the name `df_year` is not defined anywhere in this notebook);
# the years are used as x positions so the bars line up with the ticks set below
plt.bar(df.index.year, df['num_words'])
plt.ylabel('words')
plt.xlabel('date')
plt.xticks(df.index.year, rotation=45)
plt.title("Total number of words per year in %s" %(dataset))
# Change the file name as wished
plt.savefig(outdir / f'total_words_{save_corpus(dataset)}.png', dpi=200, bbox_inches='tight')
plt.show()

### Step 6: Finding and visualizing (ngram) strings in the texts

In [None]:
# Keywords whose relative frequency is computed and plotted below; replace the
# placeholder. Each entry is used as part of a regular expression pattern.
search_words = ['word']

In [None]:
# Output locations: one plot for all keywords, one frequency table per keyword
outdir_freq = outdir / f'{save_corpus(dataset)}_keyword_trends/'
outdir_freq.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist
outfile_freq = f'{to_string(search_words)}_{save_corpus(dataset)}_freq.png'
outpath_freq = outdir_freq / outfile_freq

fig = plt.figure(figsize = (15,8))

for search_word in search_words:
    # Matches per text divided by the number of words in that text.
    # NOTE(review): the keyword is used as a regular expression and the trailing
    # '??' makes its last character optional, so 'word' also counts the 'wor'
    # in 'work' - confirm that this is intended.
    rel_freq = df.text.str.count(pat=search_word + '??') / df.num_words
    rel_column = search_word + '_rel'
    df[rel_column] = rel_freq
    plt.scatter(df.index.year, df[rel_column], label=search_word)

    # Append the frequency table of this keyword to its own text file
    table_path = outdir_freq / f'{search_word}_{save_corpus(dataset)}_freq.txt'
    with open(table_path, 'a') as f:
        print('Relative frequency of \'%s\' in %s\n' %(search_word, dataset), file=f)
        print(rel_freq, file=f)

plt.xlabel('year')
plt.ylabel('frequency')
plt.title("Keyword trends in %s" %(dataset))
plt.legend()
plt.xticks(df.index.year, rotation=45)
plt.savefig(outpath_freq)
plt.show()

In [None]:
# Same keyword plot as above, with a linear trend line added per keyword
outfile_freq = f'{to_string(search_words)}_{save_corpus(dataset)}_freq.png'
outdir_freq = outdir / f'{save_corpus(dataset)}_keyword_trends/'
outdir_freq.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist
outpath_freq = outdir_freq / outfile_freq

fig = plt.figure(figsize=(15, 8))

# Plain numpy array of years, so boolean masks can be applied positionally
years = df.index.year.to_numpy()

for search_word in search_words:
    # Calculate relative frequency once and reuse it for the column, plot and file.
    # NOTE(review): the keyword is used as a regular expression and the trailing
    # '??' makes its last character optional - confirm that this is intended.
    rel_freq = df.text.str.count(pat=search_word + '??') / df.num_words
    df[search_word + '_rel'] = rel_freq

    # Scatter plot
    plt.scatter(years, rel_freq, label=search_word)

    # Fit a trend line (linear regression) through the usable points only:
    # a text without words gives 0/0 = NaN, which polyfit cannot handle
    x = years
    y = rel_freq.to_numpy(dtype=float)
    mask = np.isfinite(y)
    if mask.sum() >= 2:
        z = np.polyfit(x[mask], y[mask], 1)  # Fit a line (1st degree polynomial)
        p = np.poly1d(z)  # Create a polynomial function

        # Plot the trend line
        plt.plot(x, p(x), linestyle='--', label=f"{search_word} trend")
    else:
        # A straight line needs at least two points
        print(f"Not enough data points to fit a trend line for '{search_word}'")

    # Save relative frequencies to file
    with open(outdir_freq / f'{search_word}_{save_corpus(dataset)}_freq.txt', 'a') as f:
        print('Relative frequency of \'%s\' in %s\n' % (search_word, dataset), file=f)
        print(rel_freq, file=f)

# Final plot settings
plt.ylabel('Frequency')
plt.xlabel('Year')
plt.title(f"Keyword trends in {dataset}")
plt.legend()
plt.xticks(df.index.year, rotation=45)
plt.savefig(outpath_freq)
plt.show()

### Step 7a: Finding and printing word endings in the text

In [None]:
ending = 'heid'  # word ending to look for
min_freq_end = 10  # a word must occur at least this often within a year to be listed

In [None]:
# Output file for the list of frequent words with the chosen ending
outdir_end = outdir / f'{save_corpus(dataset)}_endings_beginnings'
outdir_end.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist
outfile_end = f'top_endings_{ending}_{save_corpus(dataset)}.txt'
outpath_end = outdir_end / outfile_end

# Everything is printed to the screen and appended to the output file
with open(outpath_end, 'a') as f:
    for index, row in df.iterrows():
        # The first four characters of the index label are used as the year
        year_label = str(index)[:4]
        print(year_label)
        print(year_label, file=f)

        # most_common() lists the words from most to least frequent
        word_counts = Counter(row['text'].split()).most_common()
        for word, count in word_counts:
            if count >= min_freq_end and word.endswith(ending):
                print(word + ": ", count)
                print(word + ": ", count, file=f)

        print("\n")
        print("\n", file=f)

### Step 7b: Finding and printing word beginnings in the text

In [None]:
begin = 'pre'  # word beginning to look for
min_freq_begin = 10  # a word must occur at least this often within a year to be listed

In [None]:
# Output file for the list of frequent words with the chosen beginning
outfile_begin = f'top_beginnings_{begin}_{save_corpus(dataset)}.txt'
outdir_begin = outdir / f'{save_corpus(dataset)}_endings_beginnings'
outdir_begin.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist
outpath_begin = outdir_begin / outfile_begin

# Everything is printed to the screen and appended to the output file
with open(outpath_begin, 'a') as f:
    for index, row in df.iterrows():
        # The first four characters of the index label are used as the year
        print(str(index)[:4])
        print(str(index)[:4], file=f)

        # most_common() lists the words from most to least frequent
        word_counts = Counter(row['text'].split()).most_common()
        for word, count in word_counts:
            if count < min_freq_begin:
                # Counts only get smaller from here on, so nothing else can qualify
                break
            if word.startswith(begin):
                # Same "word:  count" layout as the word-ending listing in Step 7a
                print(word + ": ", count)
                print(word + ": ", count, file=f)

        print("\n")
        print("\n", file=f)

### Step 8: Collocations

From: https://www.nltk.org/howto/collocations.html

In [None]:
# Settings for the collocation search below
search_terms = ['word']  # terms whose collocations are listed; replace the placeholder
windows = [10] # add or change to smaller/larger window
algorithms = ['likelihood', 'pmi'] # 'likelihood', 'pmi', 'raw_freq'
coll_to_print = 10  # number of top collocations printed per year

In [None]:
outdir_coll = outdir / f'{save_corpus(dataset)}_collocations'
outdir_coll.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist

# Helpers that do not depend on the loop variables are created only once
tokenizer = WordPunctTokenizer()
bigram_measures = collocations.BigramAssocMeasures()
score_functions = {
    'likelihood': bigram_measures.likelihood_ratio,
    'pmi': bigram_measures.pmi,
}

for term in search_terms:
    for window in windows:
        for algorithm in algorithms:
            # Any algorithm name other than 'likelihood' or 'pmi' uses the raw frequency
            score_fn = score_functions.get(algorithm, bigram_measures.raw_freq)

            outfile_coll = f'{term}_{algorithm}_collocations_{window}.txt'
            outpath_coll = outdir_coll / outfile_coll
            header = 'Top %s %s collocations of \'%s\' with a window of %s in %s:\n' %(str(coll_to_print), algorithm, term, str(window), dataset)

            # Everything is printed to the screen and appended to the output file
            with open(outpath_coll, 'a') as f:
                print(header)
                print(header, file=f)
                for year, doc in zip(df.index.year, df.text):
                    tokens = tokenizer.tokenize(doc)

                    # from_words counts the word pairs that occur within `window`
                    # tokens of each other (window must be at least 2). Building the
                    # finder from a FreqDist of adjacent bigrams would ignore the
                    # window and only rescale the adjacent-pair counts.
                    finder = collocations.BigramCollocationFinder.from_words(tokens, window_size=window)

                    #preprocessing: remove short words and stop words (see above) if only relevant for collocations
                    #finder.apply_word_filter(lambda w: len(w) < 4 or w.lower() in stopwords)

                    # Only pairs that start with the search term are reported, so all
                    # other pairs are dropped before scoring. The word counts stay
                    # untouched, so the scores of the remaining pairs do not change.
                    finder.apply_ngram_filter(lambda w1, w2: w1 != term)

                    # score_ngrams returns ((w1, w2), score) pairs, best score first
                    scored = finder.score_ngrams(score_fn)
                    top_collocations = [(pair[1], score) for pair, score in scored[:coll_to_print]]

                    # Print top collocations of term.
                    print(str(year) + ':')
                    print(str(year) + ':', file=f)
                    print(*top_collocations, sep='\n')
                    print(*top_collocations, sep='\n', file=f)
                    print('\n')
                    print('\n', file=f)

### Step 9: N-grams - most common; with given start word; with given end word

Define length of n-grams

In [None]:
ngram = 2  # number of words per n-gram
words_to_print = 100  # number of most common n-grams kept per year

#### Step 9a: Print and save to disk the most common ngrams

In [None]:
# Output file for the most common n-grams per year
outdir_ngram = outdir / f'{save_corpus(dataset)}_ngrams'
outdir_ngram.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist
outfile_ngram = f'{save_corpus(dataset)}_mostcommon_{ngram}_grams.txt'
outpath_ngram = outdir_ngram / outfile_ngram

with open(outpath_ngram, 'a') as f:
    # file=None makes print write to the screen, so each item goes to both
    # the screen and the output file
    destinations = (None, f)

    heading = f'Top {words_to_print} {ngram}-grams in {dataset}:\n'
    for destination in destinations:
        print(heading, file=destination)

    for year, text in zip(df.index.year, df.text):
        # Count every n-gram of the year and keep the most frequent ones
        top_grams = Counter(ngrams(text.split(), ngram)).most_common(words_to_print)

        for destination in destinations:
            print(year, file=destination)
            print(*top_grams, sep='\n', file=destination)
            print('\n', file=destination)

#### Step 9b: Print and save to disk the most common ngrams beginning with a particular word

In [None]:
# First word of the n-grams to list; type as 'word'
beginword = 'albert' # type as 'word'

In [None]:
# Output file for the n-grams that begin with `beginword`.
# NOTE: the variable names end in "end" although this is the "begin" listing;
# they are kept unchanged so nothing that refers to them breaks.
outfile_ngramend = f'{save_corpus(dataset)}_{beginword}_{ngram}_grams_begin.txt'
outdir_ngram = outdir / f'{save_corpus(dataset)}_ngrams'
outdir_ngram.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist
outpath_ngramend = outdir_ngram / outfile_ngramend

# Everything is printed to the screen and appended to the output file
with open(outpath_ngramend, 'a') as f:
    header = f"Top {ngram}-grams beginning with '{beginword}' in {dataset}:\n"
    print(header)
    print(header, file=f)

    for year, text in zip(df.index.year, df.text):
        print(year)
        print(year, file=f)

        # Count only the n-grams that start with the word, then take the most
        # common of those. Filtering the overall top list instead would drop
        # every matching n-gram that is not among the most frequent of the year.
        grams_freq = Counter(
            gram for gram in ngrams(text.split(), ngram) if gram[0] == beginword
        )
        top_grams = grams_freq.most_common(words_to_print)
        for item in top_grams:
            print(item)
            print(item, file=f)

#### Step 9c: Print and save to disk the most common ngrams ending with a particular word

In [None]:
# Last word of the n-grams to list; type as 'word'
endword = 'heijn' # type as 'word'
end = ngram - 1  # position of the last word in an n-gram of length `ngram`

In [None]:
# Output file for the n-grams that end with `endword`.
# NOTE: the variable names end in "begin" although this is the "end" listing;
# they are kept unchanged so nothing that refers to them breaks.
outfile_ngrambegin = f'{save_corpus(dataset)}_{endword}_{ngram}_grams_end.txt'
outdir_ngram = outdir / f'{save_corpus(dataset)}_ngrams'
outdir_ngram.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist
outpath_ngrambegin = outdir_ngram / outfile_ngrambegin

# Everything is printed to the screen and appended to the output file
with open(outpath_ngrambegin, 'a') as f:
    # Trailing newline added so the header matches the "beginning with" listing
    header = f"Top {ngram}-grams ending with '{endword}' in {dataset}:\n"
    print(header)
    print(header, file=f)

    for year, text in zip(df.index.year, df.text):
        print(year)
        print(year, file=f)

        # Count only the n-grams that end with the word, then take the most
        # common of those. Filtering the overall top list instead would drop
        # every matching n-gram that is not among the most frequent of the year.
        # gram[-1] is always the last word, even if `ngram` was changed after
        # the index `end` was computed.
        grams_freq = Counter(
            gram for gram in ngrams(text.split(), ngram) if gram[-1] == endword
        )
        top_grams = grams_freq.most_common(words_to_print)
        for item in top_grams:
            print(item)
            print(item, file=f)