In [2]:
import json
import string
from collections import Counter

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
# Download the NLTK data packages (a no-op when they are already up to date).
# NOTE(review): 'punkt' is presumably what word_tokenize needs; 'words' is not
# referenced by any cell visible in this notebook - confirm it is still required.
nltk.download('words')
nltk.download('punkt')

[nltk_data] Downloading package words to /home/peter/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /home/peter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Importing the text and slicing

In [3]:
# Download data from data directory to one long raw string with no formatting

pwd = !pwd
path = pwd[0] + '/../data/alice_in_wonderland.txt'
f = open(path, "r", newline = None)
text = ' '.join(f.read().splitlines())

In [4]:
# Slicing the text down to the novel itself: everything from the first chapter
# heading through 'THE END' (inclusive). Chapter headings are kept, because the
# next cell splits on them.

start = 'CHAPTER I. Down the Rabbit-Hole'
end = 'THE END'

# str.find returns -1 when the marker is absent, which would silently produce a
# garbage (or empty) slice - fail loudly instead.
start_ind = text.find(start)
if start_ind == -1:
    raise ValueError(f"Start marker {start!r} not found in the text")

# Look for the end marker only after the start marker, so an earlier occurrence
# (e.g. in front matter) cannot truncate the novel to an empty string.
end_ind = text.find(end, start_ind)
if end_ind == -1:
    raise ValueError(f"End marker {end!r} not found after the start marker")

novel = text[start_ind:end_ind+len(end)]

# Splitting the text by chapter

In [5]:
# Splitting the novel on the "CHAPTER" marker and saving the chapters for later use

split_text = novel.split("CHAPTER")

# split() swallows the marker, so put it back on every non-blank piece;
# blank pieces (such as the empty string before the first marker) are discarded.
chapter_list = ["CHAPTER" + piece for piece in split_text if piece.strip()]

with open('../data/chapter_list.json', 'w') as json_file:
    json.dump(chapter_list, json_file)

# Pre-processing the text for Question 1

In [6]:
# Extra cleaning to remove non-valid tokens after tokenizing

def extra_token_cleaning(list_of_tokens):
    '''Strip every character that is not a letter or a hyphen from each token.

    Aimed at emphasis markup such as '_very_' (-> 'very') that the tokenizer
    leaves inside tokens. Digits and all punctuation other than '-' are removed
    from a token, and tokens that contain no alphabetic character at all
    (e.g. '!', '--', '1865') are dropped entirely.

    Parameters
    ----------
    list_of_tokens : iterable of str
        Tokens to clean.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    '''
    result = []
    for token in list_of_tokens:
        # Tokens without a single letter are pure punctuation/numbers: skip them.
        if any(char.isalpha() for char in token):
            result.append(''.join(char for char in token if char.isalpha() or char == '-'))
    return result

In [10]:
# Lower-casing, tokenizing and cleaning each chapter, then saving the token lists

chapters_tokenized = [
    extra_token_cleaning(word_tokenize(chapter_text.lower()))
    for chapter_text in chapter_list
]

with open('../data/chapters_tokenized.json', 'w') as json_file:
    json.dump(chapters_tokenized, json_file)

# Finding the complete and chapter-specific vocabulary

In [11]:
# Using a set to remove duplicates and saving each chapter's vocabulary in an indexed list.
# The words are sorted so the order is reproducible: the iteration order of a set of
# strings can change between interpreter runs because str hashing is randomized by default.

chapter_vocab = [sorted(set(chapter)) for chapter in chapters_tokenized]

In [12]:
# Calculating total vocab: the union of every chapter's vocabulary, as a list.
# Sorted so the order is the same on every run (set iteration order is not).

total_vocab = sorted(set().union(*chapter_vocab))

# Counting word occurrences and forming Dataframe

In [13]:
# Creating a Dataframe with one (chapter_number, word) row per word in each chapter's vocab

# enumerate(..., start=1) gives 1-based chapter numbers for however many chapters
# were found, instead of assuming there are exactly 12.
df_info = [
    (chapter_number, word)
    for chapter_number, words in enumerate(chapter_vocab, start=1)
    for word in words
]

columns = ['chapter_number', 'word']
count_df = pd.DataFrame(df_info, columns=columns)

In [14]:
# Creating custom count function 

def calculate_count(row):
    """Return how often row['word'] occurs in the tokens of row['chapter_number'].

    `row` must support lookup of 'chapter_number' (1-based) and 'word'. The
    tokens are read from the module-level `chapters_tokenized` list, whose
    index is the chapter number minus one.
    """
    tokens = chapters_tokenized[row['chapter_number'] - 1]
    return int(tokens.count(row['word']))

# Row-wise apply (axis=1): every row is passed to calculate_count and the
# results are stored in a new 'count' column on count_df.
count_df['count'] = count_df.apply(calculate_count, axis=1)

In [15]:
# Checking we haven't missed any words between the chapter list and the Dataframe

# Flatten every chapter's tokens into one list (left intact by the check below).
total_words = [word for chapter in chapters_tokenized for word in chapter]
total_word_length = len(total_words)

# Add up the counts the DataFrame records for each word across all chapters.
recorded_counts = Counter()
for word, count in zip(count_df['word'], count_df['count']):
    recorded_counts[word] += int(count)

# Counter subtraction keeps only positive remainders, so this lists every token
# occurrence in the text that the DataFrame counts do not account for.
remaining_word_list = list((Counter(total_words) - recorded_counts).elements())

# No unaccounted tokens plus equal totals means the two multisets of words match.
assert remaining_word_list == []
assert count_df['count'].sum() == total_word_length

# Saving the dataframe as a CSV file and the chapter list as a JSON file

In [83]:
# Write the word-count table to CSV. No index argument is passed, so pandas'
# default applies and the DataFrame index is written as the first column.
count_df.to_csv('../data/wordcount_df.csv')

In [84]:
# Write chapter_list to ../data/chapter_list.json. The chapter-splitting cell
# earlier in this notebook already writes the same list to the same path, and no
# visible cell modifies chapter_list in between, so this re-write is redundant.
with open('../data/chapter_list.json', 'w') as json_file:
    json.dump(chapter_list, json_file)