# Doing things with text 4

## Word frequency, n-grams, collocations _for preprocessed texts_

In [None]:
from pathlib import Path
import pandas as pd
import os
import numpy as np
from collections import defaultdict
from collections import Counter
import datetime
import nltk
from nltk.util import ngrams
from nltk import bigrams
from nltk.util import everygrams
from nltk import collocations
from nltk import FreqDist
from nltk.collocations import *
from nltk import WordPunctTokenizer
import matplotlib.pyplot as plt

In [None]:
# Input and output folders, both located under the user's home directory.
p = Path.home()
indir = p.joinpath('folder', 'folder')
outpath = p.joinpath('folder', 'folder')

# outdir is kept as a string with a trailing slash because it is combined
# with file names through plain string concatenation.
outdir = str(outpath)+'/'
os.makedirs(str(outpath), exist_ok=True)  # no error if the folder already exists

# Replace with the name of your actual dataset; it is used for output files.
dataset = 'dataset'

In [None]:
# Display the output directory path.
print(outdir)

In [None]:
def save_corpus(corpus):
    """Return *corpus* in lower case with every space replaced by an underscore."""
    return "_".join(corpus.split(" ")).lower()

def to_string(list):
    """Return the given strings joined into one string, separated by underscores."""
    # The parameter name shadows the builtin; it is left unchanged so that
    # keyword calls (to_string(list=...)) keep working.
    return '_'.join(list)

def to_title(words):
    """Format *words* as a quoted enumeration suitable for a title.

    Every word is wrapped in single quotes. All words but the last are
    separated by commas and the last one is attached with "and"; an empty
    input gives an empty string.
    """
    if not words:
        return ''
    quoted = [f"'{word}'" for word in words]
    if len(quoted) == 1:
        return quoted[0]
    *leading, final = quoted
    return ', '.join(leading) + " and " + final

### Create a dataframe 
DataFrame with the texts in the "text" column and the file name without its extension (= the year) in the "year" column

In [None]:
# Collect one row per input file: the year taken from the file name and the file's text.
results = defaultdict(list)

# list all files in a given directory
files = os.listdir(indir)

for infile in files:
    # avoid opening hidden files such as .DS_Store
    if infile.startswith('.'):
        continue
    # open the file; the context manager closes it when done
    with open(os.path.join(indir, infile), "r") as f:
        try:
            text = f.read()
        except (UnicodeDecodeError, OSError) as err:
            # Report which file failed and why, then skip it. Without the
            # skip, the previous file's text would be stored under this
            # file's year (or `text` would be undefined for the first file).
            print(f"Skipping {infile}: {err}")
            continue
    # The file name minus its last four characters (presumably an extension
    # such as ".txt") is used as the year.
    results["year"].append(infile[:-4])
    results["text"].append(text)

df = pd.DataFrame(results)

Turn "year" column into datetime and set as index

In [None]:
# Parse the "year" strings as dates, make them the index (which keeps the
# name "year"), and order the rows chronologically.
year_index = pd.to_datetime(df["year"], format ="%Y")
df = df.drop(columns="year").set_index(year_index).sort_index()

In [None]:
# Show the first rows of the dataframe to check that the texts were loaded.
print(df.head())

## Additional preprocessing (only run if needed)

In [None]:
# Words that the optional filtering step removes from the texts;
# replace the placeholders with your own list.
stopwords = ['word1', 'word2', 'word3']

In [None]:
# Tokens with fewer characters than this are dropped by the optional filtering step.
minimum_word_length = 4

In [None]:
# Keep only tokens that are long enough and not in the stopword list, then
# rebuild each text as a single space-separated string.
df['text'] = df['text'].apply(
    lambda text: ' '.join(
        token for token in text.split(' ')
        if not (len(token) < minimum_word_length or token in stopwords)
    )
)

## Count and plot the total number of words per year

In [None]:
def word_count(string):
    """Return the number of whitespace-separated tokens in *string*."""
    return len(string.split())

In [None]:
# Store the number of whitespace-separated tokens of each text in a new column.
df['num_words'] = df['text'].apply(word_count)

In [None]:
# Bar chart of the total word count per year, saved as a PNG in outdir
# and then displayed.
years = df.index.year
outpath_counts = outdir + f'total_words_{save_corpus(dataset)}.png'

fig = plt.figure(figsize = (15,8))

plt.bar(years, df['num_words'])
plt.xticks(years, rotation=45)  # one tick per year present in the data
plt.xlabel('date')
plt.ylabel('words')
plt.title("Total number of words per year in %s" %(dataset))

plt.savefig(outpath_counts)
plt.show()

## Finding and visualizing (ngram) strings in the texts

In [None]:
# Strings whose relative frequency per year is plotted and written to disk;
# replace the placeholders. Each entry is used as part of a regular expression.
search_words = ['word', 'word', 'word']

In [None]:
# Output locations: one PNG for the combined plot and one text file per keyword,
# all inside a "<dataset>_keyword_trends" folder.
outfile_freq = f'{to_string(search_words)}_{save_corpus(dataset)}_freq.png'
outdir_freq = os.path.join(outdir, save_corpus(dataset) + '_keyword_trends/')
outpath_freq = outdir_freq + outfile_freq
os.makedirs(os.path.dirname(outpath_freq), exist_ok=True) # makes outdir if it doesn't exist already

fig = plt.figure(figsize = (15,8))

for search_word in search_words:
    # Matches per text divided by the text's word count. Computed once and
    # reused for the dataframe column, the plot and the text file.
    # NOTE(review): the pattern is a regular expression, and the trailing
    # '??' makes the keyword's last character optional and lazy, so this
    # appears to count every occurrence of the keyword minus its final
    # character (e.g. 'wor' for 'word') — confirm this is intended.
    rel_freq = df.text.str.count(pat=search_word + '??') / df.num_words
    df[search_word + '_rel'] = rel_freq
    plt.plot(df.index.year, rel_freq, label=search_word)
    # Opened in append mode: re-running the cell adds to the existing file.
    with open(outdir_freq + search_word + '_' + save_corpus(dataset) + '_freq.txt', 'a') as f:
        print('Relative frequency of \'%s\' in %s\n' %(search_word, dataset), file=f)
        print(rel_freq, file=f)

plt.ylabel('frequency')
plt.xlabel('year')
plt.title("Keyword trends in %s" %(dataset))
plt.legend()
plt.xticks(df.index.year, rotation=45)
plt.savefig(outpath_freq)
plt.show()

## Finding and printing word endings in the text

In [None]:
# Word ending to look for.
ending = 'ing'
# A word is only reported if it occurs at least this often within one text.
min_freq_end = 10

In [None]:
outfile_end = 'top_endings_%s_%s.txt'%(ending, save_corpus(dataset))
outpath_end = os.path.join(outdir, save_corpus(dataset) + '_endings_beginnings', outfile_end)
os.makedirs(os.path.dirname(outpath_end), exist_ok=True) # makes outdir if it doesn't exist already

# For every text, list the words with the chosen ending that are frequent
# enough. Results are printed and also appended to the output file.
with open(outpath_end, 'a') as f:
    for index, row in df.iterrows():
        year_label = str(index)[:4]  # first four characters of the index label
        # most_common() yields (word, count) pairs from most to least
        # frequent; words with equal counts keep first-seen order.
        ranked = Counter(row['text'].split()).most_common()
        hits = [(word, count) for word, count in ranked
                if count >= min_freq_end and word.endswith(ending)]
        for out in (None, f):  # None makes print() write to standard output
            print(year_label, file=out)
            for word, count in hits:
                print(word + ": ", count, file=out)
            print("\n", file=out)

## Finding and printing word beginnings in the text

In [None]:
# Word beginning to look for.
begin = 'pre'
# A word is only reported if it occurs at least this often within one text.
min_freq_begin = 10

In [None]:
outfile_begin = 'top_beginnings_%s_%s.txt'%(begin, save_corpus(dataset))
outpath_begin = os.path.join(outdir, save_corpus(dataset) + '_endings_beginnings', outfile_begin)
os.makedirs(os.path.dirname(outpath_begin), exist_ok=True) # makes outdir if it doesn't exist already

# For every text, list the words with the chosen beginning that are frequent
# enough. Results are printed and also appended to the output file.
with open(outpath_begin, 'a') as f:
    for index, row in df.iterrows():
        year_label = str(index)[:4]  # first four characters of the index label
        # most_common() yields (word, count) pairs from most to least
        # frequent; words with equal counts keep first-seen order.
        ranked = Counter(row['text'].split()).most_common()
        hits = [(word, count) for word, count in ranked
                if word.startswith(begin) and count >= min_freq_begin]
        for out in (None, f):  # None makes print() write to standard output
            print(year_label, file=out)
            for word, count in hits:
                # Same "word:  count" layout as the word-endings report
                # (no stray space before the colon).
                print(word + ": ", count, file=out)
            print("\n", file=out)

## Find collocations:

From: https://www.nltk.org/howto/collocations.html

In [None]:
# Terms whose collocations are reported; replace the placeholders.
search_terms = ['word', 'word']
# Window sizes passed to the collocation finder; one report is written per size.
windows = [10] # add or change to smaller/larger window
# Association measures to use; any value other than 'likelihood' or 'pmi'
# falls back to raw frequency.
algorithms = ['likelihood', 'pmi'] # 'likelihood', 'pmi', 'raw_freq'
# Number of collocations listed per year.
coll_to_print = 10

In [None]:
# For every term / window size / association measure, write the strongest
# collocates of the term per year to a text file and echo them to the notebook.

# Built once: neither object depends on the document being processed.
tokenizer = WordPunctTokenizer()
bigram_measures = collocations.BigramAssocMeasures()

for term in search_terms:
    for window in windows:
        for algorithm in algorithms:
            outfile_coll = f'{term}_{algorithm}_collocations_{window}.txt'
            outpath_coll = os.path.join(outdir, save_corpus(dataset) + '_collocations', outfile_coll)
            os.makedirs(os.path.dirname(outpath_coll), exist_ok=True) # makes outdir if it doesn't exist already

            # Pick the scoring function; unknown names fall back to raw frequency.
            if algorithm == 'likelihood':
                score_fn = bigram_measures.likelihood_ratio
            elif algorithm == 'pmi':
                score_fn = bigram_measures.pmi
            else:
                score_fn = bigram_measures.raw_freq

            header = 'Top %s %s collocations of \'%s\' with a window of %s in %s:\n' %(str(coll_to_print), algorithm, term, str(window), dataset)

            with open(outpath_coll, 'a') as f:
                print(header)
                print(header, file=f)
                for year, doc in zip(df.index.year, df.text):
                    tokens = tokenizer.tokenize(doc)

                    # from_words() counts every pair of words that co-occur
                    # within `window` tokens. Passing adjacent-bigram counts
                    # to the constructor together with window_size only
                    # rescales those counts; it never looks beyond the next word.
                    finder = BigramCollocationFinder.from_words(tokens, window_size=window)

                    #preprocessing: remove short words and stop words (see above) if only relevant for collocations
                    #finder.apply_word_filter(lambda w: len(w) < 4 or w.lower() in stopwords)

                    scored = finder.score_ngrams(score_fn)

                    # Keep the pairs whose first word is the term (i.e. words
                    # that follow the term), strongest association first.
                    term_collocates = [(pair[1], score) for pair, score in scored if pair[0] == term]
                    term_collocates.sort(key = lambda x: -x[1])
                    top_collocates = term_collocates[:coll_to_print]

                    # Print top collocations of term.
                    print(str(year) + ':')
                    print(str(year) + ':', file=f)
                    print(*top_collocates, sep='\n')
                    print(*top_collocates, sep='\n', file=f)
                    print('\n')
                    print('\n', file=f)

### Create n-grams

Print and save to outdir the top n ngrams per dataframe row

Define length of n-grams

In [None]:
# Number of words per n-gram (2 = bigrams).
ngram = 2
# How many of the most common n-grams are kept per text.
words_to_print = 100

#### Print and write to disk the n most common ngrams of this length

In [None]:
outfile_ngram = f'{save_corpus(dataset)}_mostcommon_{ngram}_grams.txt'
outpath_ngram = os.path.join(outdir, save_corpus(dataset) + '_ngrams', outfile_ngram)
os.makedirs(os.path.dirname(outpath_ngram), exist_ok=True) # makes outdir if it doesn't exist already

# List the most common n-grams of every text. Results are printed and also
# appended to the output file.
with open(outpath_ngram, 'a') as f:
    header = 'Top ' + str(words_to_print) + ' ' + str(ngram) + '-grams in ' + dataset + ':\n'
    for out in (None, f):  # None makes print() write to standard output
        print(header, file=out)

    for year, text in zip(df.index.year, df.text):
        # Count every run of `ngram` consecutive tokens and keep the most frequent ones.
        top_grams = Counter(ngrams(text.split(), ngram)).most_common(words_to_print)

        for out in (None, f):
            print(year, file=out)
            print(*top_grams, sep='\n', file=out)
            print('\n', file=out)

#### Print and write to disk the most common ngrams of this length beginning or ending with a particular word

In [None]:
# Only n-grams whose first word equals this string are reported.
beginword = 'word' # type as 'word'

In [None]:
outfile_ngramend = f'{save_corpus(dataset)}_{beginword}_{ngram}_grams_begin.txt'
outpath_ngramend = os.path.join(outdir, save_corpus(dataset) + '_ngrams', outfile_ngramend)
os.makedirs(os.path.dirname(outpath_ngramend), exist_ok=True) # makes outdir if it doesn't exist already

# For every text, report the n-grams that start with `beginword`. Results
# are printed and also appended to the output file.
with open(outpath_ngramend, 'a') as f:
    header = str(ngram) + '-grams beginning with \'' + beginword + '\' in ' + dataset + ':\n'
    for out in (None, f):  # None makes print() write to standard output
        print('Top', header, file=out)

    for year, text in zip(df.index.year, df.text):
        top_grams = Counter(ngrams(text.split(), ngram)).most_common(words_to_print)
        # NOTE(review): the filter runs after the list is cut to the top
        # `words_to_print` n-grams, so matches ranked lower are not
        # reported — confirm this is intended.
        matches = [item for item in top_grams if item[0][0] == beginword]
        for out in (None, f):
            print(year, file=out)
            for item in matches:
                print(item, file=out)

In [None]:
# Only n-grams whose last word equals this string are reported.
endword = 'word' # type as 'word'

In [None]:
# Position of the last word inside an n-gram tuple.
end = ngram - 1

outfile_ngrambegin = f'{save_corpus(dataset)}_{endword}_{ngram}_grams_end.txt'
outpath_ngrambegin = os.path.join(outdir, save_corpus(dataset) + '_ngrams', outfile_ngrambegin)
os.makedirs(os.path.dirname(outpath_ngrambegin), exist_ok=True) # makes outdir if it doesn't exist already

# For every text, report the n-grams that end with `endword`. Results are
# printed and also appended to the output file.
with open(outpath_ngrambegin, 'a') as f:
    header = str(ngram) + '-grams ending with \'' + endword + '\' in ' + dataset + ':'
    for out in (None, f):  # None makes print() write to standard output
        print('Top', header, file=out)

    for year, text in zip(df.index.year, df.text):
        top_grams = Counter(ngrams(text.split(), ngram)).most_common(words_to_print)
        # NOTE(review): the filter runs after the list is cut to the top
        # `words_to_print` n-grams, so matches ranked lower are not
        # reported — confirm this is intended.
        matches = [item for item in top_grams if item[0][end] == endword]
        for out in (None, f):
            print(year, file=out)
            for item in matches:
                print(item, file=out)