In [None]:
import os
import pprint

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# define a function to read all text files in a directory and combine them into a single string:
def read_corpus(directory, encoding="utf-8", separator=""):
    """Read every ``.txt`` file in ``directory`` and combine them into one string.

    Each file's content is stripped of leading and trailing whitespace before
    it is added to the corpus. Entries whose name does not end in ``.txt`` are
    skipped, so binary or auxiliary files in the folder are ignored.

    Parameters
    ----------
    directory : str
        Path of the folder that holds the text files.
    encoding : str, optional
        Encoding used to decode the files. It is set explicitly because the
        built-in default depends on the platform locale.
    separator : str, optional
        String placed between consecutive texts. The default (empty string)
        glues the texts directly together, so character n-grams can span the
        boundary between two files; pass e.g. ``"\\n"`` to keep them apart.

    Returns
    -------
    str
        The concatenated corpus; an empty string if no ``.txt`` file is found.
    """
    texts = []
    # os.listdir yields names in arbitrary order; sorting makes the corpus
    # (and therefore the n-grams at file boundaries) reproducible
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), 'r', encoding=encoding) as f:
            texts.append(f.read().strip())  # drop surrounding whitespace
    return separator.join(texts)

In [None]:
# read every text file of the corpus folder into one single string
corpus = read_corpus(directory='../verse_corpus_imposters/')

In [None]:
# count character 4-grams with CountVectorizer; the corpus is handed over as a
# one-element list, i.e. as a single document
vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(4, 4),
)
X = vectorizer.fit_transform([corpus])

In [None]:
# pair every n-gram with its count in the (single) corpus row;
# .tolist() turns the NumPy integers into plain Python ints so the tuples
# print as ('abcd', 123) instead of showing NumPy scalar reprs
ngram_counts = list(zip(vectorizer.get_feature_names_out(), X.toarray()[0].tolist()))

In [None]:
# rank the (n-gram, count) pairs from the highest count to the lowest
ngram_counts_sorted = list(ngram_counts)
ngram_counts_sorted.sort(key=lambda pair: pair[1], reverse=True)
# (the following cells extract the counts and plot them as a line graph)

In [None]:
# keep only the count of each (n-gram, count) pair, in ranked order
counts = [pair[1] for pair in ngram_counts_sorted]

In [None]:
# apply the seaborn palette and style, then create the figure explicitly
sns.set_palette("colorblind")
sns.set_style("darkgrid")
fig, ax = plt.subplots()

# plot counts into a lineplot
sns.lineplot(data=counts, ax=ax)

# restrict the visible range and label the figure
ax.set_xlim(0, 600)
ax.set_ylim(0, 28000)
ax.set_ylabel("Frequency")
ax.set_xlabel("Index")
ax.set_title("Frequency distribution among character 4grams\n9 authors - 90 texts")

# save the plot in high resolution - saving must happen before show
fig.savefig("freq_dist_char_ngrams.png", dpi=300)
plt.show()

In [None]:
# print the first 100 entries of the ranked list, i.e. the 4-grams with the
# highest counts (pprint is imported in the import cell at the top)
print("Top 100 MFC 4grams along with their overall frequency:\n")
pprint.pprint(ngram_counts_sorted[:100], width=110, compact=True)