## Import required modules & specify paths

In [1]:
import os
from pathlib import Path

# Paths are relative to the notebook's working directory.
novel_dir = os.path.join('..', 'data', '100_english_novels', 'corpus') # Directory holding the novel text files
# os.listdir returns entries in arbitrary order; sorting keeps the row order of the
# output CSV identical across machines and re-runs.
novel_filenames = sorted(os.listdir(novel_dir)) # Names (not full paths) of every entry in the corpus directory
outfile_path = os.path.join('..', 'data', '100_english_novels', 'word_counts.csv') # Where the word-count CSV is written

## Write headers to the output file (not strictly necessary)

In [2]:
# Mode 'w' creates the output file, or truncates it if it already exists, so
# running this cell always leaves a file containing only the header row.
with Path(outfile_path).open(mode='w', encoding='utf-8') as header_fh:
    header_fh.write('filename,total_words,unique_words\n') # One comma-separated line naming the three columns

## Count words & write to CSV

In [3]:
# Count the total and unique whitespace-separated tokens of every novel and
# append one CSV row per novel to the output file.
# NOTE: the output is opened in append mode, so running this cell again without
# first re-creating the output file adds a second copy of every row.
with open(outfile_path, 'a', encoding='utf-8') as fh_out: # Opened once for the whole loop; append mode keeps whatever is already in the file
    for novel_filename in novel_filenames:
        novel_file_path = os.path.join(novel_dir, novel_filename) # '../data/100_english_novels/corpus/Anon_Clara_1864.txt' et cetera

        if not os.path.isfile(novel_file_path): # Skip sub-directories (e.g. '.ipynb_checkpoints') - open() cannot read them as text
            continue

        with open(novel_file_path, 'r', encoding='utf-8') as fh_in:
            content = fh_in.read() # Reads the whole text into one string; the input file is closed right after

        words = content.split() # No separator given: splits on any run of whitespace (spaces, tabs, newlines)
        total_words = len(words) # Number of tokens
        unique_words = len(set(words)) # Distinct tokens; comparison is exact, so 'The', 'the' and 'the,' count as three

        # Commas separate values, the newline ends the row.
        # NOTE: the filename is written unquoted, so a comma inside a filename would split it across columns.
        fh_out.write(f'{novel_filename},{total_words},{unique_words}\n')