# Doing things with text 4

## Word frequency, n-grams, collocations _for preprocessed texts_

In [None]:
import pandas as pd
import os
from collections import defaultdict
from collections import Counter
import datetime
import nltk
from nltk.util import ngrams
from nltk import bigrams
from nltk.util import everygrams
from nltk import collocations
from nltk import FreqDist
from nltk.collocations import *
from nltk import WordPunctTokenizer
import matplotlib.pyplot as plt

In [None]:
# Folder holding the input text files and folder receiving all output files.
indir = '/path/to/indir/'
outdir = '/path/to/outdir/'

# Other cells build file paths by plain string concatenation
# (indir + file name, outdir + file name), so make sure both folders end
# with a path separator even when they were typed without one.
indir = os.path.join(indir, '')
outdir = os.path.join(outdir, '')
os.makedirs(outdir, exist_ok=True) # makes outdir if it doesn't exist already

dataset = 'dataset' # here the name of your actual dataset for output files

### Create a dataframe 
A dataframe with the texts in the "text" column and the file name (= year) in the "year" column

In [None]:
# Collect one row per readable text file: the file name without its
# extension goes into "year", the file content into "text".
results = defaultdict(list)

# list all files in a given directory
for infile in os.listdir(indir):
    # avoid opening hidden files such as .DS_Store
    if infile.startswith('.'):
        continue
    path = os.path.join(indir, infile)
    # sub-directories cannot be read as text files
    if not os.path.isfile(path):
        continue
    # try / except clause to catch encoding errors
    try:
        with open(path, "r") as f:
            text = f.read()
    except (UnicodeDecodeError, OSError) as err:
        # Report the actual error and skip the file; appending here would
        # store the previous file's text (or fail on the very first file).
        print(f'Skipping {infile}: {err}')
        continue
    # strip the extension whatever its length ("1990.txt" -> "1990")
    results["year"].append(os.path.splitext(infile)[0])
    results["text"].append(text)

df = pd.DataFrame(results)

In [None]:
# Show the first rows of the dataframe to check that the files were loaded
print(df.head())

Turn "year" column into datetime and set as index

In [None]:
# Parse the four-digit year strings in "year" into datetimes, stored in a new "date" column
df["date"] = pd.to_datetime(df["year"], format ="%Y")

In [None]:
# Use the parsed dates as the index and put the rows in chronological order
df = df.set_index("date").sort_index()

In [None]:
# Show the first rows again, now indexed and sorted by date
print(df.head())

In [None]:
# Rebuild "year" as a four-digit string taken from the datetime index
df['year'] = df.index.strftime('%Y')

## Additional preprocessing (only run if needed)

In [None]:
# Words to remove from the texts in the filtering step of this section
stopwords = [] # add words as 'word', 'word', 'word', etc.

In [None]:
# Words with fewer characters than this are removed in the filtering step
minimum_word_length = 4

In [None]:
# Split each text on single spaces, drop tokens that are too short or listed
# as stop words, and glue the remaining tokens back together with spaces.
df['text'] = df['text'].apply(
    lambda text: ' '.join(
        token for token in text.split(' ')
        if not (len(token) < minimum_word_length or token in stopwords)
    )
)

## Count and plot word frequencies per year

In [None]:
def word_count(string):
    """Return the number of whitespace-separated words in ``string``."""
    return len(string.split())

In [None]:
# Number of whitespace-separated words in each text (see word_count)
df['num_words'] = df['text'].apply(word_count)

In [None]:
# Bar chart of the word count of every text, saved to outdir and displayed.
fig = plt.figure(figsize=(15, 8))

plt.bar(df.year, df['num_words'])
plt.title(f"Total number of words per text in {dataset}")
plt.xlabel('date')
plt.ylabel('words')
plt.savefig(f'{outdir}total_words_{dataset}.png')
plt.show()

## Finding and visualizing (ngram) strings in the texts

In [None]:
def to_string(list):
    """Join the given strings into a single string separated by underscores.

    The parameter keeps its original name (which shadows the built-in
    ``list``) so that existing keyword calls continue to work.
    """
    return '_'.join(list)

In [None]:
# Search strings whose relative frequency per text is computed and plotted
search_words_freq = [] # add words as 'word', 'word', 'word', etc.

In [None]:
# Scatter plot of the relative frequency of every search word per text.
# For each search word the frequencies are also stored in a new
# "<word>_rel" column and appended to "<word>_freq.txt" in outdir.
fig = plt.figure(figsize = (15,8))

for search_word in search_words_freq:
    # NOTE(review): the search word is used as a regular expression, and the
    # trailing '??' makes its last character lazily optional - confirm that
    # this is the intended matching rule.
    # Computed once and reused for the column, the plot and the text file.
    rel_freq = df.text.str.count(pat=search_word + '??') / df.num_words
    df[search_word + '_rel'] = rel_freq
    plt.scatter(df.year, rel_freq, label=search_word)
    # written to outdir, like every other output of this notebook
    with open(os.path.join(outdir, str(search_word) + '_freq.txt'), 'a') as outfile:
        print(rel_freq, file=outfile)

plt.ylabel('frequency')
plt.xlabel('year')
plt.title("Keyword trends in %s" %(dataset))
# a legend without any labelled series only triggers a matplotlib warning
if search_words_freq:
    plt.legend()
plt.minorticks_on()
plt.xticks(rotation=45)
plt.savefig(f'{outdir}{to_string(search_words_freq)}_{dataset}.png')
plt.show()

## Find collocations:

From: https://www.nltk.org/howto/collocations.html

In [None]:
# How many top results to report (top collocations per search term, top n-grams per text)
n = 15

In [None]:
# Words whose top collocations are printed and written to disk
search_words_coll = [] # add words as 'word', 'word', 'word', etc.

In [None]:
# For every window size, text and association measure: score all word pairs
# that co-occur within the window, then print and append to a text file the
# n strongest collocates that follow each search term.
windows = [10, 20]
algorithms = ['likelihood', 'pmi', 'raw_freq']

bigram_measures = collocations.BigramAssocMeasures()
# Scoring function per algorithm name; any other name falls back to raw frequency.
score_functions = {
    'likelihood': bigram_measures.likelihood_ratio,
    'pmi': bigram_measures.pmi,
    'raw_freq': bigram_measures.raw_freq,
}
tokenizer = WordPunctTokenizer()

for window in windows:
    for year, doc in zip(df.year, df.text):
        tokens = tokenizer.tokenize(doc)

        # from_words counts the pairs that occur within `window` tokens of
        # each other. Building the finder by hand from adjacent bigrams and
        # only passing window_size would leave the counts adjacent-only and
        # merely rescale them.
        # The finder does not depend on the algorithm, so it is built once
        # per text and window and reused for all association measures.
        finder = BigramCollocationFinder.from_words(tokens, window_size=window)

        #preprocessing: remove short words and stop words (see above) if only relevant for collocations
        #finder.apply_word_filter(lambda w: len(w) < 4 or w.lower() in stopwords)

        for algorithm in algorithms:
            score_fn = score_functions.get(algorithm, bigram_measures.raw_freq)
            scored = finder.score_ngrams(score_fn)

            # Group bigrams by first word in bigram.
            prefix_keys = defaultdict(list)
            for (first, second), score in scored:
                prefix_keys[first].append((second, score))

            # Sort keyed bigrams by strongest association.
            for collocates in prefix_keys.values():
                collocates.sort(key=lambda pair: -pair[1])

            # Print top collocations of search_terms
            for term in search_words_coll:
                outfp = f'{term}_collocations_{algorithm}_{window}.txt'
                output_fp = os.path.join(outdir, dataset + '_' + term + '_coll', outfp)
                os.makedirs(os.path.dirname(output_fp), exist_ok=True) # makes outdir if it doesn't exist already
                heading = 'Top collocations of ' + term + ' in ' + str(year) + ':'
                top_collocates = prefix_keys[term][:n]
                with open(output_fp, 'a') as outfile:
                    # file=None sends the output to the console
                    for stream in (None, outfile):
                        print(heading, file=stream)
                        print(*top_collocates, sep='\n', file=stream)
                        print('\n', file=stream)

### Create n-grams

Print and save to outdir the top n ngrams per dataframe row

Define length of n-grams

In [None]:
# Number of words per n-gram (3 = trigrams)
ngram = 3

#### Print and write to disk the n most common ngrams of this length

In [None]:
# For every text: count all n-grams of length `ngram`, then print the n most
# frequent ones and append them to a text file in the n-gram output folder.
for year, text in zip(df.year, df.text):

    grams_freq = Counter(ngrams(text.split(), ngram))
    top_grams = grams_freq.most_common(n) # for n see above

    output_fp = os.path.join(
        outdir,
        dataset + '_%s_grams' %(ngram),
        f'{dataset}_{year}_{ngram}_grams.txt',
    )
    os.makedirs(os.path.dirname(output_fp), exist_ok=True) # makes outdir if it doesn't exist already
    heading = 'Top ' + str(n) + ' ' + str(ngram) + '-grams in ' + year + ':'
    with open(output_fp, 'a') as outfile:
        # file=None sends the output to the console
        for stream in (None, outfile):
            print(heading, file=stream)
            print(*top_grams, sep='\n', file=stream)
            print('\n', file=stream)

#### Print and write to disk the most common ngrams of this length beginning or ending with a particular word

In [None]:
def top_ngrams_begin(year, word, top_grams=None):
    """Print, and append to a text file, the n-grams that start with ``word``.

    The file is placed in the ``<dataset>_<ngram>_grams`` folder inside
    ``outdir`` (module-level ``dataset``, ``ngram`` and ``outdir`` are read)
    and is opened in append mode, so repeated runs add to the same file.

    Parameters:
        year: label of the text; used in the file name and in the heading.
        word: only n-grams whose first word equals this value are reported.
        top_grams: iterable of ``(ngram_tuple, count)`` pairs, such as the
            result of ``Counter.most_common``. Optional for backward
            compatibility: when omitted, the module-level variable
            ``top_grams`` is used, as this function originally did.

    Raises:
        NameError: if ``top_grams`` is omitted and no module-level
            ``top_grams`` exists.
    """
    if top_grams is None:
        # The parameter shadows the module-level name, so look it up explicitly.
        try:
            top_grams = globals()['top_grams']
        except KeyError:
            raise NameError("name 'top_grams' is not defined") from None

    outfp = f'{dataset}_{word}_{year}_{ngram}_grams_begin.txt'
    output_fp = os.path.join(outdir, dataset + '_%s_grams' %(ngram), outfp) # same folder as in previous step
    os.makedirs(os.path.dirname(output_fp), exist_ok=True) # makes outdir if it doesn't exist already
    with open(output_fp, 'a') as outfile:
        print('Top', str(ngram) + '-grams starting with \'' + word + '\' in', year, ':')
        print('Top', str(ngram) + '-grams starting with \'' + word + '\' in', year, ':', file=outfile)
        for item in top_grams:
            # item is (ngram_tuple, count); compare the first word of the n-gram
            if item[0][0] == word:
                print(item)
                print(item, file=outfile)
            
def top_ngrams_end(top_grams, year, word):
    """Print, and append to a text file, the n-grams that end with ``word``.

    The file is placed in the ``<dataset>_<ngram>_grams`` folder inside
    ``outdir`` (module-level ``dataset``, ``ngram`` and ``outdir`` are read)
    and is opened in append mode.

    Parameters:
        top_grams: iterable of ``(ngram_tuple, count)`` pairs.
        year: label of the text; used in the file name and in the heading.
        word: only n-grams whose word at position ``ngram - 1`` equals this
            value are reported.
    """
    last_position = ngram - 1
    file_name = f'{dataset}_{word}_{year}_{ngram}_grams_end.txt'
    target = os.path.join(outdir, dataset + '_%s_grams' %(ngram), file_name) # same folder as in previous step
    os.makedirs(os.path.dirname(target), exist_ok=True) # makes outdir if it doesn't exist already
    heading = ('Top', str(ngram) + '-grams ending with \'' + word + '\' in', year, ':')
    with open(target, 'a') as outfile:
        print(*heading)
        print(*heading, file=outfile)
        for entry in top_grams:
            if entry[0][last_position] != word:
                continue
            print(entry)
            print(entry, file=outfile)

In [None]:
# Word that the reported n-grams must start with. The empty string is a
# placeholder that matches no n-gram and keeps the cell valid Python.
beginword = '' # type as 'word'

In [None]:
# For every text: take its 1000 most frequent n-grams and report those that
# start with beginword.
for year, text in zip(df.year, df.text):

    grams_freq = Counter(ngrams(text.split(), ngram))
    # This two-argument call relies on the module-level name top_grams,
    # so it has to be rebound for every text before calling.
    top_grams = grams_freq.most_common(1000)

    top_ngrams_begin(year, beginword)

In [None]:
# Word that the reported n-grams must end with. The empty string is a
# placeholder that matches no n-gram and keeps the cell valid Python.
endword = '' # type as 'word'

In [None]:
# For every text: take its 1000 most frequent n-grams and report those that
# end with endword.
for year, text in zip(df.year, df.text):

    grams_freq = Counter(ngrams(text.split(), ngram))
    top_grams = grams_freq.most_common(1000)

    top_ngrams_end(top_grams, year, endword)