# Exploratory Data Analysis - Words

* Analyze word frequencies before and after removing stop words
* Analyze word frequencies before and after stemming/lemmatization
* Does Zipf's law hold?
* Look at words that are not English
* British spellings

In [None]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Import the project helper module used throughout this notebook.
%aimport tools.utils

# Render matplotlib figures inline, as SVG.
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

import operator
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Apply seaborn's default styling to subsequent matplotlib figures.
sns.set()

## Zipf's Law

Simply put, Zipf's law states that the frequencies of words from a natural language corpus are inversely proportional to their rank in a frequency table. That is, a plot of their rank vs frequency on a log-log scale will be roughly linear.

For example, the first word in the table below is roughly twice as frequent as the second word, and three times as frequent as the third.

| rank | value  | occurrences |
|------|--------|-------------|
| 1    | word 1 | 21          |
| 2    | word 2 | 10          |
| 3    | word 3 | 7           |

In [None]:
# Toy frequency table from the markdown above: rank -> occurrences.
ranks = np.array([1, 2, 3])
frequencies = np.array([21, 10, 7])

# Compute the log-log coordinates once and reuse them for both layers.
log_ranks = np.log(ranks)
log_freqs = np.log(frequencies)

plt.plot(log_ranks, log_freqs)       # connecting line
plt.plot(log_ranks, log_freqs, ".")  # individual data points

plt.title("Example of Zipf's Law")
# Raw strings: "\l" is not a valid escape sequence in a normal string literal
# and triggers a SyntaxWarning on recent Python versions.
plt.xlabel(r"$\log(rank)$")
plt.ylabel(r"$\log(freq)$")
plt.show()

So we get a bag-of-words representation of the dataset, and construct the frequency table.

In [None]:
# Load the bag-of-words for the dataset (presumably a word -> count mapping;
# verify against tools.utils.get_bag_of_words).
bag = tools.utils.get_bag_of_words()

# Order the (word, count) pairs from highest to lowest count.
by_count = operator.itemgetter(1)
frequency_table = sorted(bag.items(), key=by_count, reverse=True)

# Split the ranked pairs into two parallel arrays.
ranked_words, ranked_counts = zip(*frequency_table)
words = np.array(ranked_words)
frequencies = np.array(ranked_counts)

Plotting each word's rank against its frequency on a log-log scale reveals that Zipf's law does seem to hold for most of the dataset.

In [None]:
# Ranks are 1-based: the most frequent word has rank 1.
log_ranks = np.log(np.arange(1, len(words) + 1))
plt.plot(log_ranks, np.log(frequencies), ".", markersize=3)

plt.title("Haiku Word Frequency")
# Raw strings: "\l" is not a valid escape sequence in a normal string literal
# and triggers a SyntaxWarning on recent Python versions.
plt.xlabel(r"$\log(rank)$")
plt.ylabel(r"$\log(freq)$")
plt.show()

In [None]:
# Show the first 15 entries of `words` (sorted by descending frequency above).
words[:15]

So then we find the words and their corresponding frequencies at the interesting breaks in the plot.

In [None]:
# Words to label on the rank/frequency plot.
interesting_targets = ["the", "a", "of", "to", "i", "her", "his"]

# np.isin builds the same boolean mask as OR-ing `words == w` for every
# target; np.nonzero then yields the matching positions in ascending order,
# i.e. from most to least frequent.
interesting_indices = np.nonzero(np.isin(words, interesting_targets))[0]
print(interesting_indices)

# Fancy indexing selects the matching entries without a Python-level loop.
interesting_words = words[interesting_indices]
print(interesting_words)
interesting_freqs = frequencies[interesting_indices]
print(interesting_freqs)

In [None]:
# Ranks are 1-based: the most frequent word has rank 1.
log_ranks = np.log(np.arange(1, len(words) + 1))
plt.plot(log_ranks, np.log(frequencies), ".", markersize=3)
# plt.plot(np.log(interesting_indices + 1), np.log(interesting_freqs), ".")

# Hand-tuned label offsets in log-log plot units, one entry per annotated
# word, in the same order as `interesting_words`. Alternating signs place
# neighbouring labels on opposite sides of the curve.
x_adjust = np.array([0.1, -0.6, 0.15, -0.6, 0.2, -0.6, 0.0])
y_adjust = np.array([1.0, -1.2, 1.0, -1.3, 1.0, -1.3, 1.0])

for word, freq, index, xa, ya in zip(
    interesting_words,
    interesting_freqs,
    interesting_indices,
    x_adjust,
    y_adjust,
):
    # `index` is a 0-based position in `words`; the plotted rank is index + 1.
    log_rank = np.log(index + 1)
    log_freq = np.log(freq)
    plt.annotate(
        word,
        # Anchor the connector slightly off the marker, towards the label.
        xy=(log_rank, log_freq + ya / 20),
        xytext=(log_rank + xa, log_freq + ya),
        size=9,
        arrowprops={"arrowstyle": "-", "color": "k"},
    )

plt.title("Haiku Word Frequency")
# Raw strings: "\l" is not a valid escape sequence in a normal string literal
# and triggers a SyntaxWarning on recent Python versions.
plt.xlabel(r"$\log(rank)$")
plt.ylabel(r"$\log(freq)$")
plt.ylim((-0.5, 11.9))
# plt.savefig('zipfs-uncleaned.svg')
plt.show()

## Zipf's Law After Removing Stop Words

We remove the stop words from the bag of words.

In [None]:
# Collect the stop words that actually occur in the bag, then delete them.
# NOTE: this mutates `bag` in place, so any earlier cell that reads `bag`
# will see the filtered contents if it is re-run after this one.
stop_words_present = {
    candidate for candidate in tools.utils.stop_words() if candidate in bag
}
for stopword in stop_words_present:
    del bag[stopword]

In [None]:
# Re-rank the (word, count) pairs of the current `bag`, highest count first.
by_count = operator.itemgetter(1)
frequency_table = sorted(bag.items(), key=by_count, reverse=True)

# Split the ranked pairs into two parallel arrays.
ranked_words, ranked_counts = zip(*frequency_table)
words = np.array(ranked_words)
frequencies = np.array(ranked_counts)

In [None]:
# Ranks are 1-based: the most frequent remaining word has rank 1.
log_ranks = np.log(np.arange(1, len(words) + 1))
plt.plot(log_ranks, np.log(frequencies), ".", markersize=3)

plt.title("Haiku Word Frequency")
# Raw strings: "\l" is not a valid escape sequence in a normal string literal
# and triggers a SyntaxWarning on recent Python versions.
plt.xlabel(r"$\log(rank)$")
plt.ylabel(r"$\log(freq)$")
plt.show()

In [None]:
# Show the first 13 entries of `words` (sorted by descending frequency above).
words[:13]

In [None]:
# Words to label on the rank/frequency plot.
interesting_targets = [
    "moon",
    "rain",
    "day",
    "winter",
    "summer",
    "spring",
    "autumn",
    "snow",
    "night",
]

# np.isin builds the same boolean mask as OR-ing `words == w` for every
# target; np.nonzero then yields the matching positions in ascending order,
# i.e. from most to least frequent.
interesting_indices = np.nonzero(np.isin(words, interesting_targets))[0]
print(interesting_indices)

# Fancy indexing selects the matching entries without a Python-level loop.
interesting_words = words[interesting_indices]
print(interesting_words)
interesting_freqs = frequencies[interesting_indices]
print(interesting_freqs)

In [None]:
# Ranks are 1-based: the most frequent remaining word has rank 1.
log_ranks = np.log(np.arange(1, len(words) + 1))
plt.plot(log_ranks, np.log(frequencies), ".", markersize=3)
# plt.plot(np.log(interesting_indices + 1), np.log(interesting_freqs), ".")

# Hand-tuned label offsets in log-log plot units, one entry per annotated
# word, in the same order as `interesting_words`. Alternating signs place
# neighbouring labels on opposite sides of the curve.
x_adjust = np.array([-0.35, -0.9, -0.23, -0.9, -0.1, -0.7, 0.3, -0.7, 0.4])
y_adjust = np.array([1.0, -1.0, 1.1, -1.1, 1.1, -1.4, 1.0, -1.45, 1.0])

for word, freq, index, xa, ya in zip(
    interesting_words,
    interesting_freqs,
    interesting_indices,
    x_adjust,
    y_adjust,
):
    # `index` is a 0-based position in `words`; the plotted rank is index + 1.
    log_rank = np.log(index + 1)
    log_freq = np.log(freq)
    plt.annotate(
        word,
        # Anchor the connector slightly off the marker, towards the label.
        xy=(log_rank, log_freq + ya / 20),
        xytext=(log_rank + xa, log_freq + ya),
        size=8,
        arrowprops={"arrowstyle": "-", "color": "k"},
    )

plt.title("Haiku Word Frequency")
# Raw strings: "\l" is not a valid escape sequence in a normal string literal
# and triggers a SyntaxWarning on recent Python versions.
plt.xlabel(r"$\log(rank)$")
plt.ylabel(r"$\log(freq)$")
plt.xlim((-0.5, 10.5))
plt.ylim((-0.5, 9))
# plt.savefig("zipfs-cleaned.svg")
plt.show()

# TODO: stem/lemmatize and repeat