# VIK DATA CLEANING

- Remember to set Python kernel to 3 (not later).

In [None]:
# Necessary packages
import os, requests, pandas as pd, pickle
from bs4 import BeautifulSoup

## VIK EXPLORATION: File operations

### Essentially copying a file's contents to another file - this works for text files...

with open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/README.txt','r') as sourcefile:
    with open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/DESTFILE.txt','w') as destfile:
        for line in sourcefile:
            destfile.write(line)


### And for an image file...simply append `b` for *binary mode* to file operation command `r`, `w`, or `a`

with open('/Users/vix/OneDrive/Temp/Portrait_Vikram_Before-After.png','rb') as sourceimage:
    with open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/destimage.png','wb') as destimage:
        for line in sourceimage:
            destimage.write(line)



## VIK EXPLORATION: Get book text from Project Gutenberg, save to file, and populate list object

import os
import re
import urllib # Import `urllib` package - primarily using `request` module with `urlopen` method

### os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Texts')
gutenberg_texts = [] # Initialize list

for counter in range(10,25003): # Loop over each book, which is a reference number

    def get_gutenberg_text():
        url = "https://www.gutenberg.org/files/" + str(counter) + "/" + str(counter) + ".txt"

        try: # Check if URL valid
            webpage = urllib.request.urlopen(url) # Open the webpage containing book text

            # Extract book title and author (author TBD) for file name
            linecount = 1
            for line in webpage:
                m = re.search('Title: ',str(line))
                if m:
                    print("Matched!")
                    text = line.decode()
                    booktitle = text[7 : (len(text) - 2)] # Minus 2 at end critical to remove newline character
                linecount += 1 # Advance line counter
            filename = str(counter)+'_'+booktitle+'.txt'
            
            # Write book text to output file
            print("Currently retrieving: " + booktitle + " -- file name: " + filename)
            with open(str('/Users/vix/Repos/Python-Learning/src/NLP/Texts/' + filename),'w') as file:
                webpage = urllib.request.urlopen(url)
                for line in webpage:
                    text = line.decode() # IMP: Extract only text, discarding non-printing characters
                    file.write(text)
            
            # Write book text to list 
            with open(str('/Users/vix/Repos/Python-Learning/src/NLP/Texts/' + filename),'r') as file:
                text = [file.read().replace('\n','')]
                gutenberg_texts.append(text)
                print("Added list item: " + str(len(gutenberg_texts)) + "\n") # Enumerate list count, which is number of books
            return gutenberg_texts


        except: # If URL invalid, means no book at that webpage
            print("URL Not Valid\n")

    gutenberg_texts = get_gutenberg_text() # Call function


## VIK EXPLORATION: Separate function to populate list object using existing files in given folder

In [30]:
### Extract book title, author, & text

import os, codecs, re

### Custom function to create `listdir` command that does not show hidden files
def listdir_nohidden(path):
    import glob
    return glob.glob(os.path.join(path, '*'))

os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books') # Set working folder
print("Adding *all* files in " + os.getcwd() + "\n")
gutenberg_titles = [] # Initialize list of titles
gutenberg_authors = [] # Initialize list of authors
gutenberg_texts = [] # Initialize list of texts

def get_gutenberg_text():
    for file in sorted(listdir_nohidden(".")):
        with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as f:

            # First, take title from file name
            title = re.sub("\.txt","",file)
            title = re.sub("\./","",title)
            title = re.sub("^\d+_","",title)
            gutenberg_titles.append(title) # Put book title in list

            # Then, extract author from book text
            author = ""
            for line in f: 
                if re.search('Author: ',str(line)):
                    author = line[8 : (len(line) - 2)] # Minus 2 at end critical to remove newline character
                    gutenberg_authors.append(author) # Put author in list
                    print("Found author: " + author)
                    break
                elif re.search("^BY ",str(line)):
                    author = line[3 : (len(line) -2)]
                    gutenberg_authors.append(author) # Put author in list
                    print("Found author: " + author)
                    break
            if not author:
                print("No Author Match for " + title)
                gutenberg_authors.append("Unknown Author") # If no author found
                
            # Finally, read all text, beginning with official "START OF" line
            gutenberg_text_current = []
            f.seek(0)
            started = False
            for i, line in enumerate(f.readlines()):
                if not started:
                    if re.search('^\*\*\*.?START OF',str(line)):
                        started = True
                        print ("Started at line", i+1) 
                elif re.search('^\*\*\*.?END OF',str(line)):
                    print ("Ended at line", i+1)
                    break
                else:
                    gutenberg_text_current.append(line)
            
            print(len(gutenberg_text_current))
            if len(gutenberg_text_current) == 0:
                f.seek(0)
                text = f.read()
                gutenberg_text_current.append(text)
                gutenberg_texts.append(gutenberg_text_current)
            else:
                gutenberg_text_current[0 : len(gutenberg_text_current)] = [''.join(gutenberg_text_current[0 : len(gutenberg_text_current)])] 
                gutenberg_texts.append(gutenberg_text_current)

            print("Added list index # " + str(len(gutenberg_texts)-1) + ": " + title + "\n") # Enumerate running count of books

get_gutenberg_text() # Call function

    

Adding *all* files in /Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books

Found author: Dante Alighieri
Started at line 20
Ended at line 5310
5289
Added list index # 0: Hell

Found author: Thomas Troward
Started at line 25
Ended at line 3366
3340
Added list index # 1: The Creative Process in the Individual

Found author: Kossuth
Started at line 19
Ended at line 15211
15191
Added list index # 2: Select Speeches of Kossuth

Found author: Unknown
Started at line 20
Ended at line 804
783
Added list index # 3: The Apricot Tree

No Author Match for The King James Bible
Started at line 28
Ended at line 99874
99845
Added list index # 4: The King James Bible

No Author Match for King Richard III
0
Added list index # 5: King Richard III

Found author: Various
Started at line 21
Ended at line 1573
1551
Added list index # 6: The Mirror of Literature, Amusement, and Instruction

Found author: Elizabeth Gray Potter and Mabel Thayer Gray
Started at line 21
Ended at line 2456
2434


### Correct code snippet used to read only specified block of text beginning and ending with some condition

with open("/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books/11_Alice's Adventures in Wonderland.txt", 'r') as f:

    started = False
    collected_lines = []
    combined_lines = []
    for i, line in enumerate(f.readlines()):
        if not started:
            if re.search('^\*\*\*.?START OF',str(line)):
                started = True
                print ("started at line", i+1) 
        elif re.search('^\*\*\*.?END OF',str(line)):
            print ("end at line", i+1)
            break
        else:
            collected_lines.append(line)
    print("Book done")
    combined_lines[0 : len(collected_lines)] = [''.join(collected_lines[0 : len(collected_lines)])] 

### Prior wrong attempts at above

with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as f:

            f.seek(0)
            started = False
            collected_lines = []
            for i, line in enumerate(f.readlines()):
                    if re.search('^\*\*\*.?START OF',str(line)):
                        started = True
                        print ("started at line", i) 
                        continue
                    if started and re.search('^\*\*\*.?END OF',str(line)):
                        print ("end at line", i)
                        break
                    collected_lines.append(line)

            f.seek(0)
            textstart = 0
            for line in f:
                if re.search('^\*\*\*.?START OF',str(line)):
                    textstart = textstart + len(line) + 4 # Find start position
                    text = f.readline()
                    if re.search('^\*\*\*.?END OF',str(line)):
                        textend = textstart + len(line) + 4 # Find end position
                        break
                else:
                    textstart = textstart + len(line)
                    f.seek(textstart)
            
            text = f.read() # Begin reading at current value of `textstart`
            
            if not text: # In case no starting point found, take it all
                f.seek(0)
                text = f.read()

            text = text.replace('\n',' ')
            text = text.replace('\r','')
            
            gutenberg_texts.append(text)


# Extract book author

import os
import codecs
import re

### Custom function to create `listdir` command that does not show hidden files
def listdir_nohidden(path):
    import glob
    return glob.glob(os.path.join(path, '*'))

os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books') # Set working folder
print("Adding *all* files in " + os.getcwd() + "\n")
gutenberg_authors = [] # Initialize list of book titles

for file in sorted(listdir_nohidden(".")):
    with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as f:
        author = ""
        linecount = 1
        for line in f:
            if re.search('Author: ',str(line)):
                print("Matched!")
                author = line[8 : (len(line) - 2)] # Minus 2 at end critical to remove newline character
                gutenberg_authors.append(author) # Put authors in list
            linecount += 1 # Advance line counter
        if not author:
            print("No Match for " + file)
            gutenberg_authors.append("Unknown Author") # If no author found


### Useful code to display beginnings of each list item as preview
[book[0][:100] for book in gutenberg_texts]

### Similar code for items in dictionary form
{}

### Useful code to convert MS Word document to text file

import docx2txt
converted_text = docx2txt.process(filename.docx)
with open(Filename.txt, 'w') as file:
    file.write(converted_text)


### Custom function to create `listdir` command that does not show hidden files

def listdir_nohidden(path):
    import glob
    return glob.glob(os.path.join(path, '*'))


### Pickle files for later use - alternative to `csv_writer()`?

### Make a new directory to hold the text files
!mkdir transcripts

for i, c in enumerate(comedians):
    with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file) ### Indexing into the `transcripts` array/list

In [35]:
### Load pickled files into dictionary data container
#### - Modified code to simply open file; unsure about need for Pickle

### Create dictionary data container which can hold book text as well as title, from respective list objects

books = {} # `{}` signifies a dictionary
books = dict(zip(gutenberg_titles, gutenberg_texts))

# for i, title in enumerate(gutenberg_titles):
#     for j, text in enumerate(gutenberg_texts):
#         books[title] = gutenberg_texts[]

# books = {'title': gutenberg_titles, 'text': gutenberg_texts}
# print(len(books))
# for key, value in books.items():
#     print(key, value)

In [36]:
# Double check to make sure data has been loaded properly
books.keys()

dict_keys(['Hell', 'The Creative Process in the Individual', 'Select Speeches of Kossuth', 'The Apricot Tree', 'The King James Bible', 'King Richard III', 'The Mirror of Literature, Amusement, and Instruction', 'The Lure of San Francisco', 'Robbery Under Arms', "Alice's Adventures in Wonderland", 'The Tale of Mrs. Tiggy-Winkle', "At the Earth's Core", 'Punch, Or The London Charivari, Vol. 146., January 21, 1914', 'Through the Looking-Glass', 'Paul Kelver', 'The Hunting of the Snark', 'Household Gods', 'Peter Pan', 'Facino Cane', 'The Book Of Mormon', 'The Federalist Papers', 'The Song Of Hiawatha', 'Paradise Lost', 'A Damsel in Distress', 'Sketches in Lavender, Blue and Green', 'Minna von Barnhelm', 'Laddie', 'Table-Talk', 'Fabre, Poet of Science', 'The Memoirs of Louis XV. and XVI., Volume 6', "Se-Quo-Yah; from Harper's New Monthly, V. 41, 1870", 'The Fortune Hunter', 'Eclectic School Readings: Stories from Life', 'The Rise of the Dutch Republic, 1577-78', 'Cape Cod Stories', 'Arachne

In [38]:
# More checks
books['Q and the Magic of Grammar'][:100]

spell-checker owls perched on a branch of an oak tree. In the tree, there was a hole. And in the hole, there was a scroll.   \n\n\t“I want to get this over and done with. Don’t look so worried. Soon you will be with your precious flower. Q of Alphabet village… Are you ready for the first question?” asked K. \n\n\t“Yes.”     \n\n\tThe three spell-checker owls nodded at each other. Clutching the edge of the scroll in its beak, an owl pulled the scroll out of the tree. The owl flew to the other side of the river, and it let go of the scroll when it was above K’s eager hands.  \n\n\t“The thirteen questions... These were prepared by a committee of spell-checker owls. I had nothing to do with it. Nothing at all,” said K.\n\n\tHe broke the red seal and unfurled the scroll. \n\n\t“The first question is about the zero article. Is this correct? We believe in peace.” \n\n\tThe word-pecker hovered above K. In its beak was the special grammatica flower. The bird bit hard into the flower’s stem.\n\n

## Cleaning The Data

When dealing with numerical data, data cleaning often involves removing null values and duplicate data, dealing with outliers, etc. With text data, there are some common data cleaning techniques, which are also known as text pre-processing techniques.

With text data, this cleaning process can go on forever. There's always an exception to every cleaning step. So, we're going to follow the MVP (minimum viable product) approach - start simple and iterate. Here are a bunch of things you can do to clean your data. We're going to execute just the common cleaning steps here and the rest can be done at a later point to improve our results.

**Common data cleaning steps on all text:**
* Make text all lower case
* Remove punctuation
* Remove numerical values
* Remove common non-sensical text (/n)
* Tokenize text
* Remove stop words

**More data cleaning steps after tokenization:**
* Stemming / lemmatization
* Parts of speech tagging
* Create bi-grams or tri-grams
* Deal with typos
* And more...

In [39]:
### Let's take a look at our data again
### next(iter(books.keys()))
books.keys()

dict_keys(['Hell', 'The Creative Process in the Individual', 'Select Speeches of Kossuth', 'The Apricot Tree', 'The King James Bible', 'King Richard III', 'The Mirror of Literature, Amusement, and Instruction', 'The Lure of San Francisco', 'Robbery Under Arms', "Alice's Adventures in Wonderland", 'The Tale of Mrs. Tiggy-Winkle', "At the Earth's Core", 'Punch, Or The London Charivari, Vol. 146., January 21, 1914', 'Through the Looking-Glass', 'Paul Kelver', 'The Hunting of the Snark', 'Household Gods', 'Peter Pan', 'Facino Cane', 'The Book Of Mormon', 'The Federalist Papers', 'The Song Of Hiawatha', 'Paradise Lost', 'A Damsel in Distress', 'Sketches in Lavender, Blue and Green', 'Minna von Barnhelm', 'Laddie', 'Table-Talk', 'Fabre, Poet of Science', 'The Memoirs of Louis XV. and XVI., Volume 6', "Se-Quo-Yah; from Harper's New Monthly, V. 41, 1870", 'The Fortune Hunter', 'Eclectic School Readings: Stories from Life', 'The Rise of the Dutch Republic, 1577-78', 'Cape Cod Stories', 'Arachne

In [40]:
### Notice that our dictionary is currently in key: book title, value: list of text format
#### - for some reason our values are not in list of text form - they are already string
### next(iter(books.values()))
books.items()

ll-checker owls perched on a branch of an oak tree. In the tree, there was a hole. And in the hole, there was a scroll.   \n\n\t“I want to get this over and done with. Don’t look so worried. Soon you will be with your precious flower. Q of Alphabet village… Are you ready for the first question?” asked K. \n\n\t“Yes.”     \n\n\tThe three spell-checker owls nodded at each other. Clutching the edge of the scroll in its beak, an owl pulled the scroll out of the tree. The owl flew to the other side of the river, and it let go of the scroll when it was above K’s eager hands.  \n\n\t“The thirteen questions... These were prepared by a committee of spell-checker owls. I had nothing to do with it. Nothing at all,” said K.\n\n\tHe broke the red seal and unfurled the scroll. \n\n\t“The first question is about the zero article. Is this correct? We believe in peace.” \n\n\tThe word-pecker hovered above K. In its beak was the special grammatica flower. The bird bit hard into the flower’s stem.\n\n\tQ

### - We are going to change this to key: book title, value: string format - NOT USED
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ''.join(list_of_text)
    return combined_text

### Combine it!
books_combined = {key: [combine_text(value)] for (key, value) in books.items()}

In [41]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
pd.set_option('max_colwidth',150)

books_df = pd.DataFrame.from_dict(books,orient='index')
books_df.columns = ['book_text']
books_df = books_df.sort_index()
books_df

Unnamed: 0,book_text
A Damsel in Distress,\r\n\r\n\r\n\r\nProduced by Jim Tinsley\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n[Transcriber's Note for edition 11: in para. 4 of Chapter 1...
Agnes Grey,"\r\n\r\nTranscribed from the 1910 John Murray edition by David Price, email\r\nccx074@pglaf.org\r\n\r\n_Facsimile of the Title-Page of the First E..."
Alice's Adventures in Wonderland,\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nALICE'S ADVENTURES IN WONDERLAND\r\n\r\nLewis Carroll\r\n\r\nTHE MILLENNIUM FULCRUM EDITION 3.0\r\n\r\n\r\...
"Arachne, Volume 1.","\r\n\r\n\r\nThis eBook was produced by David Widger <widger@cecomet.net>\r\n\r\n\r\n\r\n[NOTE: There is a short list of bookmarks, or pointers, at..."
At the Earth's Core,\r\n\r\n\r\n\r\nProduced by Judith Boss. HTML version by Al Haines.\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nAt the Earth's Core\r\n\r\n\r\nBy\r\n\...
Cape Cod Stories,"\r\n\r\n\r\n\r\nProduced by Don Lainson\r\n\r\n\r\n\r\n\r\n\r\nCAPE COD STORIES\r\n\r\nAlso Published Under The Title Of ""The Old Home House""\r\n\..."
David Copperfield,\r\n\r\n\r\n\r\nProduced by Jo Churcher\r\n\r\n\r\n\r\n\r\n\r\nDAVID COPPERFIELD\r\n\r\n\r\nBy Charles Dickens\r\n\r\n\r\n\r\n AFFEC...
Eclectic School Readings: Stories from Life,"\r\n\r\n\r\n\r\nProduced by Robert Rowe, Charles Franks and the Online\r\nDistributed Proofreading Team. HTML version by Al Haines\r\n\r\n\r\n\r\n..."
"Ernest Maltravers, Book 9","\r\n\r\n\r\nThis eBook was produced by Dagny,\r\n and David Widger,\r\n\r\n\r\n\r\n\r\n\r\nBOOK IX.\r\n\r\n I go, the bride of Ac..."
"Fabre, Poet of Science","The Project Gutenberg Etext of Fabre, Poet of Science by Legros\r\nDr. G.V. (C.V.) Legros\r\n\r\nCopyright laws are changing all over the world, b..."


In [42]:
# Let's take a look at the text for title Q and the Magic of Grammar
books_df.book_text.loc['Q and the Magic of Grammar']

 spell-checker owls perched on a branch of an oak tree. In the tree, there was a hole. And in the hole, there was a scroll.   \n\n\t“I want to get this over and done with. Don’t look so worried. Soon you will be with your precious flower. Q of Alphabet village… Are you ready for the first question?” asked K. \n\n\t“Yes.”     \n\n\tThe three spell-checker owls nodded at each other. Clutching the edge of the scroll in its beak, an owl pulled the scroll out of the tree. The owl flew to the other side of the river, and it let go of the scroll when it was above K’s eager hands.  \n\n\t“The thirteen questions... These were prepared by a committee of spell-checker owls. I had nothing to do with it. Nothing at all,” said K.\n\n\tHe broke the red seal and unfurled the scroll. \n\n\t“The first question is about the zero article. Is this correct? We believe in peace.” \n\n\tThe word-pecker hovered above K. In its beak was the special grammatica flower. The bird bit hard into the flower’s stem.\n\

In [43]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text_round1):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text_round1 = text_round1.lower()
    text_round1 = re.sub('\[.*?\]', '', text_round1)
    text_round1 = re.sub('[%s]' % re.escape(string.punctuation), '', text_round1)
    text_round1 = re.sub('\w*\d\w*', '', text_round1)
    return text_round1

round1 = lambda x: clean_text_round1(x)# Apply a first round of text cleaning techniques

In [51]:
# Let's take a look at the updated text from round 1
books_df_clean = pd.DataFrame(books_df.book_text.apply(round1))
books_df_clean

Unnamed: 0,book_text
A Damsel in Distress,\r\n\r\n\r\n\r\nproduced by jim tinsley\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\ntranscribers note for edition in para of chapter the\r\n...
Agnes Grey,\r\n\r\ntranscribed from the john murray edition by david price email\r\n\r\n\r\nfacsimile of the titlepage of the first edition which was issued...
Alice's Adventures in Wonderland,\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nalices adventures in wonderland\r\n\r\nlewis carroll\r\n\r\nthe millennium fulcrum edition \r\n\r\n\r\n\r\...
"Arachne, Volume 1.",\r\n\r\n\r\nthis ebook was produced by david widger widgercecometnet\r\n\r\n\r\n\r\nnote there is a short list of bookmarks or pointers at the end...
At the Earth's Core,\r\n\r\n\r\n\r\nproduced by judith boss html version by al haines\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nat the earths core\r\n\r\n\r\nby\r\n\r\n...
Cape Cod Stories,\r\n\r\n\r\n\r\nproduced by don lainson\r\n\r\n\r\n\r\n\r\n\r\ncape cod stories\r\n\r\nalso published under the title of the old home house\r\n\r\...
David Copperfield,\r\n\r\n\r\n\r\nproduced by jo churcher\r\n\r\n\r\n\r\n\r\n\r\ndavid copperfield\r\n\r\n\r\nby charles dickens\r\n\r\n\r\n\r\n affec...
Eclectic School Readings: Stories from Life,\r\n\r\n\r\n\r\nproduced by robert rowe charles franks and the online\r\ndistributed proofreading team html version by al haines\r\n\r\n\r\n\r\n\r...
"Ernest Maltravers, Book 9",\r\n\r\n\r\nthis ebook was produced by dagny\r\n and david widger\r\n\r\n\r\n\r\n\r\n\r\nbook ix\r\n\r\n i go the bride of achero...
"Fabre, Poet of Science",the project gutenberg etext of fabre poet of science by legros\r\ndr gv cv legros\r\n\r\ncopyright laws are changing all over the world be sure to...


In [48]:
# Apply a second round of cleaning
def clean_text_round2(text_round2):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text_round2 = re.sub('[‘’“”…]', '', text_round2)
    text_round2 = re.sub('\n', ' ', text_round2)
    text_round2 = re.sub('\r', ' ', text_round2)
    return text_round2

round2 = lambda x: clean_text_round2(x)

In [52]:
# Let's take a look at the updated text
books_df_clean = pd.DataFrame(books_clean.book_text.apply(round2))
books_df_clean

Unnamed: 0,book_text
A Damsel in Distress,produced by jim tinsley transcribers note for edition in para of chapter the word leafy has been changed to lea...
Agnes Grey,transcribed from the john murray edition by david price email facsimile of the titlepage of the first edition which was issued together...
Alice's Adventures in Wonderland,alices adventures in wonderland lewis carroll the millennium fulcrum edition chapter i down the rabbithole ...
"Arachne, Volume 1.",this ebook was produced by david widger widgercecometnet note there is a short list of bookmarks or pointers at the end of the file ...
At the Earth's Core,produced by judith boss html version by al haines at the earths core by edgar rice burroughs contents ...
Cape Cod Stories,produced by don lainson cape cod stories also published under the title of the old home house by joseph c lincoln ...
David Copperfield,produced by jo churcher david copperfield by charles dickens affectionately inscribed to ...
Eclectic School Readings: Stories from Life,produced by robert rowe charles franks and the online distributed proofreading team html version by al haines eclect...
"Ernest Maltravers, Book 9",this ebook was produced by dagny and david widger book ix i go the bride of acheronsoph antig these ...
"Fabre, Poet of Science",the project gutenberg etext of fabre poet of science by legros dr gv cv legros copyright laws are changing all over the world be sure to check...


## Organizing The Data

### Corpus

In [53]:
# Let's take a look at our dataframe
books_df_clean

Unnamed: 0,book_text
A Damsel in Distress,produced by jim tinsley transcribers note for edition in para of chapter the word leafy has been changed to lea...
Agnes Grey,transcribed from the john murray edition by david price email facsimile of the titlepage of the first edition which was issued together...
Alice's Adventures in Wonderland,alices adventures in wonderland lewis carroll the millennium fulcrum edition chapter i down the rabbithole ...
"Arachne, Volume 1.",this ebook was produced by david widger widgercecometnet note there is a short list of bookmarks or pointers at the end of the file ...
At the Earth's Core,produced by judith boss html version by al haines at the earths core by edgar rice burroughs contents ...
Cape Cod Stories,produced by don lainson cape cod stories also published under the title of the old home house by joseph c lincoln ...
David Copperfield,produced by jo churcher david copperfield by charles dickens affectionately inscribed to ...
Eclectic School Readings: Stories from Life,produced by robert rowe charles franks and the online distributed proofreading team html version by al haines eclect...
"Ernest Maltravers, Book 9",this ebook was produced by dagny and david widger book ix i go the bride of acheronsoph antig these ...
"Fabre, Poet of Science",the project gutenberg etext of fabre poet of science by legros dr gv cv legros copyright laws are changing all over the world be sure to check...


In [54]:
### Let's add the author's name as well

books_df_clean['book_author'] = gutenberg_authors
books_df_clean

Unnamed: 0,book_text,book_author
A Damsel in Distress,produced by jim tinsley transcribers note for edition in para of chapter the word leafy has been changed to lea...,Dante Alighieri
Agnes Grey,transcribed from the john murray edition by david price email facsimile of the titlepage of the first edition which was issued together...,Thomas Troward
Alice's Adventures in Wonderland,alices adventures in wonderland lewis carroll the millennium fulcrum edition chapter i down the rabbithole ...,Kossuth
"Arachne, Volume 1.",this ebook was produced by david widger widgercecometnet note there is a short list of bookmarks or pointers at the end of the file ...,Unknown
At the Earth's Core,produced by judith boss html version by al haines at the earths core by edgar rice burroughs contents ...,Unknown Author
Cape Cod Stories,produced by don lainson cape cod stories also published under the title of the old home house by joseph c lincoln ...,Unknown Author
David Copperfield,produced by jo churcher david copperfield by charles dickens affectionately inscribed to ...,Various
Eclectic School Readings: Stories from Life,produced by robert rowe charles franks and the online distributed proofreading team html version by al haines eclect...,Elizabeth Gray Potter and Mabel Thayer Gray
"Ernest Maltravers, Book 9",this ebook was produced by dagny and david widger book ix i go the bride of acheronsoph antig these ...,"Thomas Alexander Browne, AKA Rolf Boldrewood"
"Fabre, Poet of Science",the project gutenberg etext of fabre poet of science by legros dr gv cv legros copyright laws are changing all over the world be sure to check...,Lewis Carroll


In [None]:
# Let's pickle it for later use
books_df.to_pickle("books_corpus.pkl")

### Document-Term Matrix

In [None]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
books_cv = cv.fit_transform(books_clean.book_text)
books_dtm = pd.DataFrame(books_cv.toarray(), columns=cv.get_feature_names())
books_dtm.index = books_clean.index
books_dtm

In [None]:
### Let's pickle it for later use
books_dtm.to_pickle("books_dtm.pkl")

In [None]:
### Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
books_clean.to_pickle('books_clean.pkl')
pickle.dump(cv, open("books_cv.pkl", "wb"))