# Cleaning and Basic Text Analysis

This notebook provides methods for cleaning the segmented and disaggregated text files and for performing word counts, chapter counts, stopword removal, and other basic frequency calculations and enrichment processes.

## Import Packages and Load Files

In [None]:
import os
import pandas as pd

import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords

import re

#Get dictionary of English words to keep 
from nltk.corpus import words
#nltk.download('words')
#nltk.download('wordnet')
from nltk import WordNetLemmatizer

from collections import Counter

In [None]:
#Get current working directory
path = os.getcwd()
print(path)

#Change working directory to the folder that holds the input CSV
#os.chdir() returns None, so its result is not assigned; path is read back
#from os.getcwd() so it keeps naming a real directory
os.chdir("/PATH")
path = os.getcwd()

#Upload dataframe
clean_df = pd.read_csv('chapter_chunks_bow_output.csv')

#Drop first column by position (assumed to be the unnamed index column of the input file -- confirm)
clean_df = clean_df.iloc[:, 1:]

#Make text column string values
#NOTE: astype(str) also turns missing values into the literal string 'nan'
clean_df['Text'] = clean_df['Text'].astype(str)

clean_df

## Basic Cleaning

In [None]:
#Build the Clean_Text column as a lowercased version of the raw Text column
lowercased_text = clean_df['Text'].str.lower()
clean_df['Clean_Text'] = lowercased_text
clean_df

In [None]:
#Strip punctuation: delete every run of characters that is neither a word character nor whitespace
#(underscores count as word characters, so they are left in place by this step)
punctuation_runs = re.compile(r'[^\w\s]+')
clean_df['Clean_Text'] = clean_df['Clean_Text'].map(lambda chunk: punctuation_runs.sub('', chunk))
clean_df

In [None]:
#Collapse each run of two or more spaces into a single space
#(the pattern matches the space character only, not tabs or newlines)
multi_space = '  +'
clean_df['Clean_Text'] = clean_df['Clean_Text'].str.replace(multi_space, ' ', regex=True)
clean_df

In [None]:
#Remove numbers and extraneous characters
#Raw string so that \d reaches the regex engine without an invalid-escape warning
clean_df['Clean_Text'] = clean_df['Clean_Text'].str.replace(r'\d+', '', regex=True)
#The underscore is a literal character; regex=False states that explicitly because
#the default value of `regex` is not the same in every pandas version
clean_df['Clean_Text'] = clean_df['Clean_Text'].str.replace('_', '', regex=False)
#Deleting digits and underscores can leave runs of spaces behind, so collapse them to one space
clean_df['Clean_Text'] = clean_df['Clean_Text'].str.replace('  +', ' ', regex=True)
clean_df

In [None]:
#Change path to where you want to save the files
#os.chdir() returns None, so path is read back from os.getcwd() instead of being assigned from the call
os.chdir("/PATH")
path = os.getcwd()

#Save cleaned dataframe to working directory (index=False leaves the row index out of the file)
clean_df.to_csv('clean_bow_ch_chunks.csv', index=False)

## Advanced Cleaning: Stopword Removal, Lemmatization and Keep Only English Words

In [None]:
#Work on an independent deep copy so the advanced cleaning steps do not modify clean_df
adv_clean_df = clean_df.copy(deep=True)

In [None]:
#Remove stopwords
#A set is used so each membership test is a constant-time lookup
stop_words = set(stopwords.words("english"))


def _without_stopwords(text):
    """Return text with every whitespace-separated token found in stop_words dropped."""
    return ' '.join(token for token in text.split() if token not in stop_words)


#NOTE(review): Clean_Text has had punctuation stripped, so stop words that contain an
#apostrophe (e.g. "don't") presumably never match their stripped form here -- confirm
adv_clean_df['Text_NoStops'] = adv_clean_df['Clean_Text'].apply(_without_stopwords)
adv_clean_df

In [None]:
#Define list of words to keep from nltk words
words_list = words.words()
#Build the lookup set from the list already loaded instead of reading the corpus a second time;
#membership tests against a set are much faster than scanning the list
my_words = set(words_list)

In [None]:
#The words corpus does not contain plural forms, so tokens are lemmatized before being compared against it
#nltk.download('omw-1.4')
wnl = WordNetLemmatizer()


def _lemmatize_tokens(text):
    """Lemmatize each whitespace-separated token of text and rejoin the results with single spaces."""
    return ' '.join(wnl.lemmatize(token) for token in text.split())


#The source column is Clean_Text (stopwords still present); use Text_NoStops instead to lemmatize without stopwords
adv_clean_df['Text_Lemmas'] = adv_clean_df['Clean_Text'].apply(_lemmatize_tokens)
adv_clean_df

In [None]:
#Force the lemma column to strings, then collect every chunk into a plain Python list
lemma_column = adv_clean_df['Text_Lemmas'].astype(str)
adv_clean_df['Text_Lemmas'] = lemma_column
data = lemma_column.tolist()
data

In [None]:
#Split every chunk into its words (one list per chunk), then flatten those lists into a single list of words
import itertools

all_words = [chunk.split() for chunk in data]
all_words_list = list(itertools.chain.from_iterable(all_words))
len(all_words_list)

In [None]:
def _keep_known_words(text):
    """Return text reduced to the whitespace-separated tokens that are present in my_words."""
    return ' '.join(token for token in text.split() if token in my_words)


#Keep only the lemmas that also occur in the words corpus
adv_clean_df['English_Text'] = adv_clean_df['Text_Lemmas'].apply(_keep_known_words)
adv_clean_df

In [None]:
#Force the English-only column to strings, then collect every chunk into a plain Python list
english_column = adv_clean_df['English_Text'].astype(str)
adv_clean_df['English_Text'] = english_column
kept_data = english_column.tolist()
kept_data

In [None]:
#Split every kept chunk into its words (one list per chunk), then flatten those lists into a single list of words
import itertools

kept_words = [chunk.split() for chunk in kept_data]
kept_words_list = list(itertools.chain.from_iterable(kept_words))
len(kept_words_list)

In [None]:
#Collect the distinct words that occur in the lemmatized text but not in the kept text, and count them
removed_list = set(all_words_list).difference(kept_words_list)
len(removed_list)

In [None]:
#Turn the removed words into a sorted list so they can be inspected in order
removed_list = sorted(removed_list)
removed_list

In [None]:
#Wrap the removed words in a single-column dataframe
col_name = ['Removed Words']
removed_words_df = pd.DataFrame(data=removed_list, columns=col_name)
removed_words_df

In [None]:
#Change path to where you want to save the files
#os.chdir() returns None, so path is read back from os.getcwd() instead of being assigned from the call
os.chdir("/PATH")
path = os.getcwd()

#Save dataframe with kept words and titles
adv_clean_df.to_csv('adv_clean_bow_ch_chunks.csv', index=False)

#Save removed words dataframe to working directory
removed_words_df.to_csv('Removed_Words_bow_ch_chunks.csv', index=False)

## Basic Text Analysis

In [None]:
#Take an independent deep copy so the count columns are added without modifying adv_clean_df
df_counts = adv_clean_df.copy(deep=True)

In [None]:
#Get number of words in each chapter chunk
#Make sure to use original texts (not cleaned)
#split() with no argument splits on any run of whitespace, so repeated, leading or
#trailing spaces do not produce empty "words" and newlines/tabs separate words as well
ch_words = df_counts["Text"].apply(lambda x: len(str(x).split()))

#Append word counts of each chapter chunk to dataframe
df_counts["Word Count"] = ch_words
df_counts

In [None]:
#Get the 20 most frequent words across the whole dataframe
#Counts are taken from the Text_NoStops column (stopwords already removed)
combined_text = " ".join(df_counts["Text_NoStops"])
word_frequencies = Counter(combined_text.split())
word_frequencies.most_common(20)