In [None]:
import pandas as pd
import os
from collections import defaultdict
from pathlib import Path
from os.path import splitext
from os.path import basename
import numpy as np
import datetime
from nltk import word_tokenize
from nltk.util import ngrams
import matplotlib.pyplot as plt

In [None]:
# Root directory of the corpus; it is walked recursively when the texts are loaded.
indir = "/path/to/indir/"

Helper functions to extract the file name (without directory and extension) from a file path.

In [None]:
def remove_ext(filepath):
    """Return ``filepath`` with its final extension (if any) stripped."""
    return splitext(filepath)[0]

def get_filename(filepath):
    """Return the last path component of ``filepath`` without its extension."""
    stem_path = remove_ext(filepath)
    return basename(stem_path)

### Create a dataframe 
A DataFrame with the texts in the "text" column and the file name (= date) in the "file_name" column.

In [None]:
# Collect every non-hidden file below `indir` into two parallel lists:
# the extension-less file name and the file's full text.
results = defaultdict(list)

for root, dirs, files in os.walk(indir):
    for filename in files:
        # Skip hidden (dot-prefixed) files.
        if filename.startswith("."):
            continue
        filepath = os.path.join(root, filename)
        # NOTE: no encoding is passed, so the platform default is used.
        with open(filepath, 'r') as f:
            text = f.read()
        # get_filename() already drops both the directory and the extension;
        # stripping the extension a second time would truncate dotted names.
        results["file_name"].append(get_filename(filepath))
        results["text"].append(text)

corpus = pd.DataFrame(results)

In [None]:
# Preview the first rows to check that the files were loaded.
print(corpus.head())

Turn "file_name" column into datetime and set as index

In [None]:
# File names are parsed as four-digit years ("%Y") into timestamps.
corpus["date"] = pd.to_datetime(corpus["file_name"], format ="%Y")

In [None]:
# Index the corpus chronologically; the raw file name is dropped once "date"
# has taken over as the index.
corpus = (
    corpus
    .set_index("date")
    .drop(columns="file_name")
    .sort_index()
)

In [None]:
# Preview the date-indexed, chronologically sorted corpus.
print(corpus.head())

### Apply preprocessing

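Lowercase, tokenize, and keep only alphanumeric tokens; store the result as a string. (Lemmatization and stopword removal are not applied in the code below.)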

In [None]:
def clean_text(text):
    """Lowercase ``text``, tokenize it, and keep only alphanumeric tokens.

    Returns the surviving tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token.isalnum())

In [None]:
# Add the cleaned version of every text as a new column and preview the result.
corpus['clean_words'] = corpus['text'].apply(clean_text)
print(corpus.head())

### Finding and visualizing (ngram) strings in the texts

In [None]:
def word_count(string):
    """Return the number of whitespace-separated tokens in ``string``."""
    return len(string.split())

In [None]:
# Per-document length, used as the denominator when phrase counts are normalised.
# Note: this counts whitespace-separated tokens of the raw text, not of 'clean_words'.
corpus['num_words'] = corpus['text'].apply(word_count)

In [None]:
def query_column(i):
    """Turn a query phrase into a column name by replacing spaces with underscores."""
    return i.replace(' ', '_')

In [None]:
# Relative frequency of "<n> year(s) ago" versus "in <n> year(s)" /
# "<n> year(s) from now" per document, plotted over time.
years = ['one', 'two', 'ten']
fig, ax = plt.subplots(figsize=(15, 8))

for year in years:
    # 'one' takes the singular noun; every other number takes the plural.
    unit = 'year' if year == 'one' else 'years'
    years_ago = year + ' ' + unit + ' ago'
    years_from_now1 = 'in ' + year + ' ' + unit
    # The noun was previously missing here ("two from now"), so this phrase
    # could never match ordinary text.
    years_from_now2 = year + ' ' + unit + ' from now'

    # Counts are plain substring matches in the cleaned text, normalised by
    # document length.
    corpus[query_column(years_ago)] = corpus.clean_words.str.count(years_ago) / corpus.num_words
    # Both forward-looking phrasings are summed into a single column.
    corpus[query_column(years_from_now2)] = (
        corpus.clean_words.str.count(years_from_now1)
        + corpus.clean_words.str.count(years_from_now2)
    ) / corpus.num_words

    # Let matplotlib pick the colour for the "ago" line and reuse it for the
    # matching "from now" line, rather than reaching into the private
    # `ax._get_lines.prop_cycler`, which newer matplotlib releases no longer have.
    (ago_line,) = ax.plot(
        corpus.index, corpus[query_column(years_ago)],
        marker='.', label=years_ago,
    )
    ax.plot(
        corpus.index, corpus[query_column(years_from_now2)],
        color=ago_line.get_color(), marker='.', linestyle='--', label=years_from_now2,
    )

ax.set_ylabel('frequency')
ax.set_xlabel('year')
ax.set_title("Looking back and forward")
ax.legend()
ax.minorticks_on()
plt.show()

### Create n-grams

In [None]:
# Build a DataFrame with one row per trigram of the cleaned text(s) for one year.
get_year = '1970'
# On a DatetimeIndex a year string is a partial-string slice, so this selects
# every row within that year and normally yields a Series, not a single string.
year_selection = corpus.loc[get_year, 'clean_words']
if isinstance(year_selection, str):
    year_text = year_selection
else:
    # Merge all documents of the year into one text.
    year_text = ' '.join(year_selection)
year_text_list = year_text.split()
columns = ['first', 'second', 'third']
# The previous code passed the undefined name `one_text_list` here (NameError).
ngrams_df = pd.DataFrame(list(ngrams(year_text_list, 3)), columns=columns).astype(str)

In [None]:
# Preview the first trigrams (one token per column).
print(ngrams_df.head())

In [None]:
# Collapse the three token columns into one space-separated 'trigrams' column,
# then discard the per-token columns.
ngrams_df['trigrams'] = ngrams_df['first'] + ' ' + ngrams_df['second'] + ' ' + ngrams_df['third']
ngrams_df.drop(columns=['first', 'second', 'third'], inplace=True)

Make new dataframe of trigrams ending with specific words (here 'years ago')

In [None]:
# Keep only the trigrams that end with the string "years ago" and renumber
# the remaining rows from 0.
years_ago_df = (
    ngrams_df['trigrams']
    .loc[ngrams_df['trigrams'].str.endswith("years ago")]
    .to_frame()
    .reset_index(drop=True)
)

Save sorted value counts of trigrams dataframe to csv

In [None]:
# Output path for the selected year (the file is tab-separated despite the .csv suffix).
csv = '/path/to/outdir/years_ago_' + get_year + '.csv'
# Count how often each distinct trigram occurs; value_counts() sorts by
# descending count by default.
years_ago_sorted = years_ago_df.value_counts()
years_ago_sorted.to_csv(csv, sep='\t', encoding='utf-8')