# VIK DATA CLEANING

- Remember to set Python kernel to 3 (not later).

## VIK EXPLORATION: File operations

### Essentially copying a file's contents to another file - this works for text files...

with open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/README.txt','r') as sourcefile:
    with open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/DESTFILE.txt','w') as destfile:
        for line in sourcefile:
            destfile.write(line)


### And for an image file...simply append `b` for *binary mode* to file operation command `r`, `w`, or `a`

with open('/Users/vix/OneDrive/Temp/Portrait_Vikram_Before-After.png','rb') as sourceimage:
    with open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/destimage.png','wb') as destimage:
        for line in sourceimage:
            destimage.write(line)



## VIK EXPLORATION: Get book text from Project Gutenberg, save to file, and populate list object

import os, requests, re, urllib

### os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Texts')
gutenberg_texts = [] # Initialize list

for counter in range(31083,350001): # Loop over each book, which is a reference number

    def get_gutenberg_text():
        url = "https://www.gutenberg.org/files/" + str(counter) + "/" + str(counter) + ".txt"

        try: # Check if URL valid
            webpage = urllib.request.urlopen(url) # Open the webpage containing book text

            # Extract book title and author (author TBD) for file name
            linecount = 1
            for line in webpage:
                m = re.search('Title: ',str(line))
                if m:
                    print("Matched!")
                    text = line.decode()
                    booktitle = text[7 : (len(text) - 2)] # Minus 2 at end critical to remove newline character
                linecount += 1 # Advance line counter
            filename = str(counter)+'_'+booktitle+'.txt'
            
            # Write book text to output file
            print("Currently retrieving: " + booktitle + " -- file name: " + filename)
            with open(str('/Users/vix/Repos/Python-Learning/src/NLP/Corpora/Gutenberg Texts/' + filename),'w') as file:
                webpage = urllib.request.urlopen(url)
                for line in webpage:
                    text = line.decode() # IMP: Extract only text, discarding non-printing characters
                    file.write(text)
            
            # Write book text to list 
            with open(str('/Users/vix/Repos/Python-Learning/src/NLP/Corpora/Gutenberg Texts/' + filename),'r') as file:
                text = [file.read().replace('\n','')]
                gutenberg_texts.append(text)
                print("Added list item: " + str(len(gutenberg_texts)) + "\n") # Enumerate list count, which is number of books
            return gutenberg_texts


        except: # If URL invalid, means no book at that webpage
            print("URL Not Valid\n")

    gutenberg_texts = get_gutenberg_text() # Call function


## VIK EXPLORATION: Separate function to populate list object using existing files in given folder

In [1]:
### Extract book title, author, & text

import os, codecs, re

### Custom function to create `listdir` command that does not show hidden files
def listdir_nohidden(path):
    import glob
    return glob.glob(os.path.join(path, '*'))

os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books') # Set working folder
print("Adding *all* files in " + os.getcwd() + "\n")
gutenberg_titles = [] # Initialize list of titles
gutenberg_authors = [] # Initialize list of authors
gutenberg_texts = [] # Initialize list of texts

def get_gutenberg_text():
    for file in sorted(listdir_nohidden(".")):
        with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as f:

            # First, take title from file name
            title = re.sub("\.txt","",file)
            title = re.sub("\./","",title)
            title = re.sub("^\d+_","",title)
            gutenberg_titles.append(title) # Put book title in list

            # Then, extract author from book text
            author = ""
            for line in f: 
                if re.search('Author: ',str(line)):
                    author = line[8 : (len(line) - 2)] # Minus 2 at end critical to remove newline character
                    gutenberg_authors.append(author) # Put author in list
                    print("Found author: " + author)
                    break
                elif re.search("^BY ",str(line)):
                    author = line[3 : (len(line) -2)]
                    gutenberg_authors.append(author) # Put author in list
                    print("Found author: " + author)
                    break
            if not author:
                print("No Author Match for " + title)
                gutenberg_authors.append("Unknown Author") # If no author found
                
            # Finally, read all text, beginning with official "START OF" line
            gutenberg_text_current = []
            f.seek(0)
            started = False
            for i, line in enumerate(f.readlines()):
                if not started:
                    if re.search('^\*\*\*.?START OF',str(line)):
                        started = True
                        print ("Started at line", i+1) 
                elif re.search('^\*\*\*.?END OF',str(line)):
                    print ("Ended at line", i+1)
                    break
                else:
                    gutenberg_text_current.append(line)
            
            print(len(gutenberg_text_current))
            if len(gutenberg_text_current) == 0:
                f.seek(0)
                text = f.read()
                gutenberg_text_current.append(text)
                gutenberg_texts.append(gutenberg_text_current)
            else:
                gutenberg_text_current[0 : len(gutenberg_text_current)] = [''.join(gutenberg_text_current[0 : len(gutenberg_text_current)])] 
                gutenberg_texts.append(gutenberg_text_current)

            print("Added list index # " + str(len(gutenberg_texts)-1) + ": " + title + "\n") # Enumerate running count of books

get_gutenberg_text() # Call function

    

Adding *all* files in /Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books

Found author: Dante Alighieri
Started at line 20
Ended at line 5310
5289
Added list index # 0: Hell

Found author: Thomas Troward
Started at line 25
Ended at line 3366
3340
Added list index # 1: The Creative Process in the Individual

Found author: Kossuth
Started at line 19
Ended at line 15211
15191
Added list index # 2: Select Speeches of Kossuth

Found author: Unknown
Started at line 20
Ended at line 804
783
Added list index # 3: The Apricot Tree

No Author Match for The King James Bible
Started at line 28
Ended at line 99874
99845
Added list index # 4: The King James Bible

No Author Match for King Richard III
0
Added list index # 5: King Richard III

Found author: Various
Started at line 21
Ended at line 1573
1551
Added list index # 6: The Mirror of Literature, Amusement, and Instruction

Found author: Elizabeth Gray Potter and Mabel Thayer Gray
Started at line 21
Ended at line 2456
2434


### Correct code snippet used to read only specified block of text beginning and ending with some condition

with open("/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books/11_Alice's Adventures in Wonderland.txt", 'r') as f:

    started = False
    collected_lines = []
    combined_lines = []
    for i, line in enumerate(f.readlines()):
        if not started:
            if re.search('^\*\*\*.?START OF',str(line)):
                started = True
                print ("started at line", i+1) 
        elif re.search('^\*\*\*.?END OF',str(line)):
            print ("end at line", i+1)
            break
        else:
            collected_lines.append(line)
    print("Book done")
    combined_lines[0 : len(collected_lines)] = [''.join(collected_lines[0 : len(collected_lines)])] 

### Prior wrong attempts at above

with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as f:

            f.seek(0)
            started = False
            collected_lines = []
            for i, line in enumerate(f.readlines()):
                    if re.search('^\*\*\*.?START OF',str(line)):
                        started = True
                        print ("started at line", i) 
                        continue
                    if started and re.search('^\*\*\*.?END OF',str(line)):
                        print ("end at line", i)
                        break
                    collected_lines.append(line)

            f.seek(0)
            textstart = 0
            for line in f:
                if re.search('^\*\*\*.?START OF',str(line)):
                    textstart = textstart + len(line) + 4 # Find start position
                    text = f.readline()
                    if re.search('^\*\*\*.?END OF',str(line)):
                        textend = textstart + len(line) + 4 # Find end position
                        break
                else:
                    textstart = textstart + len(line)
                    f.seek(textstart)
            
            text = f.read() # Begin reading at current value of `textstart`
            
            if not text: # In case no starting point found, take it all
                f.seek(0)
                text = f.read()

            text = text.replace('\n',' ')
            text = text.replace('\r','')
            
            gutenberg_texts.append(text)


# Extract book author

import os
import codecs
import re

### Custom function to create `listdir` command that does not show hidden files
def listdir_nohidden(path):
    import glob
    return glob.glob(os.path.join(path, '*'))

os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books') # Set working folder
print("Adding *all* files in " + os.getcwd() + "\n")
gutenberg_authors = [] # Initialize list of book titles

for file in sorted(listdir_nohidden(".")):
    with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as f:
        author = ""
        linecount = 1
        for line in f:
            if re.search('Author: ',str(line)):
                print("Matched!")
                author = line[8 : (len(line) - 2)] # Minus 2 at end critical to remove newline character
                gutenberg_authors.append(author) # Put authors in list
            linecount += 1 # Advance line counter
        if not author:
            print("No Match for " + file)
            gutenberg_authors.append("Unknown Author") # If no author found


### Useful code to display beginnings of each list item as preview
[book[0][:100] for book in gutenberg_texts]

### Similar code for items in dictionary form
{}

### Useful code to convert MS Word document to text file

import docx2txt
converted_text = docx2txt.process(filename.docx)
with open(Filename.txt, 'w') as file:
    file.write(converted_text)


### Custom function to create `listdir` command that does not show hidden files

def listdir_nohidden(path):
    import glob
    return glob.glob(os.path.join(path, '*'))


### Pickle files for later use - alternative to `csv_writer()`?

### Make a new directory to hold the text files
!mkdir transcripts

for i, c in enumerate(comedians):
    with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file) ### Indexing into the `transcripts` array/list

In [3]:
### Load pickled files into dictionary data container
#### - ** IMP ** Modified to create dictionary directly from respective list items

books_dict = {} #### `{}` signifies a dictionary
books_dict = dict(zip(gutenberg_titles, gutenberg_texts)) #### `dict()` constructor w/ `zip` iterator


## Useful dictionary syntax

### Dict comprehension creates dictionaries from arbitrary key:value expressions
#### - `test_dict = {x: "This is the text for book " + x for x in gutenberg_titles}`
#### - `test_dict = {key: key*2}
#### - Comprehension also available for object types list and set
### There are three "dictionary views"
**** - `dict.keys()`, `dict.values()`, `dict_items()`

In [4]:
### Double check to make sure data has been loaded properly
#### `books.keys()` This is the original syntax from the lesson; however, output is a block of running keys.  

### I find the following syntax better, resulting in a more elegant list of keys
list(books_dict)
### or `sorted(books)`
### `'key' in dict` checks for presence of given key in dictionary (or `not in`)

['Hell',
 'The Creative Process in the Individual',
 'Select Speeches of Kossuth',
 'The Apricot Tree',
 'The King James Bible',
 'King Richard III',
 'The Mirror of Literature, Amusement, and Instruction',
 'The Lure of San Francisco',
 'Robbery Under Arms',
 "Alice's Adventures in Wonderland",
 'The Tale of Mrs. Tiggy-Winkle',
 "At the Earth's Core",
 'Punch, Or The London Charivari, Vol. 146., January 21, 1914',
 'Through the Looking-Glass',
 'Paul Kelver',
 'The Hunting of the Snark',
 'Household Gods',
 'Peter Pan',
 'Facino Cane',
 'The Book Of Mormon',
 'The Federalist Papers',
 'The Song Of Hiawatha',
 'Paradise Lost',
 'A Damsel in Distress',
 'Sketches in Lavender, Blue and Green',
 'Minna von Barnhelm',
 'Laddie',
 'Table-Talk',
 'Fabre, Poet of Science',
 'The Memoirs of Louis XV. and XVI., Volume 6',
 "Se-Quo-Yah; from Harper's New Monthly, V. 41, 1870",
 'The Fortune Hunter',
 'Eclectic School Readings: Stories from Life',
 'The Rise of the Dutch Republic, 1577-78',
 'Cap

### More checks - need to learn how to display only first few lines of dictionary item
books['Q and the Magic of Grammar'][:1]

## Cleaning The Data

When dealing with numerical data, data cleaning often involves removing null values and duplicate data, dealing with outliers, etc. With text data, there are some common data cleaning techniques, which are also known as text pre-processing techniques.

With text data, this cleaning process can go on forever. There's always an exception to every cleaning step. So, we're going to follow the MVP (minimum viable product) approach - start simple and iterate. Here are a bunch of things you can do to clean your data. We're going to execute just the common cleaning steps here and the rest can be done at a later point to improve our results.

**Common data cleaning steps on all text:**
* Make text all lower case
* Remove punctuation
* Remove numerical values
* Remove common non-sensical text (/n)
* Tokenize text
* Remove stop words

**More data cleaning steps after tokenization:**
* Stemming / lemmatization
* Parts of speech tagging
* Create bi-grams or tri-grams
* Deal with typos
* And more...

### Notice that our dictionary is currently in key: book title, value: list of text format
#### - for some reason our values are not in list of text form - they are already string
### next(iter(books.values()))
books.items()

### - We are going to change this to key: book title, value: string format - NOT USED
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ''.join(list_of_text)
    return combined_text

### Combine it!
books_combined = {key: [combine_text(value)] for (key, value) in books.items()}

In [6]:
# We can either keep it in dictionary format or put it into a **PANDAS DATAFRAME**
import pandas as pd
import numpy as np
pd.set_option('max_colwidth',150)

books_df = pd.DataFrame.from_dict(books_dict,orient='index')
books_df.columns = ['book_text']
# books_df = books_df.sort_index()
books_df

Unnamed: 0,book_text
Hell,\r\n\r\n\r\n\r\nProduced by Judith Smith and Natalie Salter\r\n\r\n\r\n\r\n\r\nHELL\r\n\r\nOR THE INFERNO FROM THE DIVINE COMEDY\r\n\r\nBY\r\n\r\n...
The Creative Process in the Individual,"INDIVIDUAL***\r\n\r\n\r\nE-text prepared by John Hagerson, Kevin Handy, and Project Gutenberg\r\nDistributed Proofreaders\r\n\r\n\r\n\r\nTHE CREAT..."
Select Speeches of Kossuth,"\r\n\r\n\r\n\r\nProduced by Keren Vergon, Rich Magahiz and PG Distributed Proofreaders\r\n\r\n\r\n\r\n\r\nSELECT SPEECHES\r\nOF\r\nKOSSUTH.\r\n\r\..."
The Apricot Tree,"\r\n\r\n\r\n\r\nProduced by Internet Archive; University of Florida, Children, Sjaani\r\nand the Online Distributed Proofreading Team\r\n\r\n\r\n\..."
The King James Bible,\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nThe Old Testament of the King James Version of the Bible\r\n\r\n\r\n\r\n\r\nThe First Book...
King Richard III,\r\n\r\n*******************************************************************\r\nTHIS EBOOK WAS ONE OF PROJECT GUTENBERG'S EARLY FILES PRODUCED AT A...
"The Mirror of Literature, Amusement, and Instruction","\r\n\r\n\r\n\r\nProduced by Jonathan Ingram, Elaine Walker and the Online Distributed\r\nProofreading Team.\r\n\r\n\r\n\r\n\r\n\r\nTHE MIRROR OF L..."
The Lure of San Francisco,\r\n\r\n\r\n\r\nProduced by David A. Schwan <davidsch@earthlink.net>\r\n\r\n\r\n\r\n\r\n\r\nThe Lure of San Francisco\r\n\r\nA Romance Amid Old La...
Robbery Under Arms,\r\n\r\n\r\n\r\nProduced by Alan R. Light\r\n\r\n\r\n\r\n\r\n\r\nROBBERY UNDER ARMS\r\n\r\nA Story of Life and Adventure in the Bush and in the Go...
Alice's Adventures in Wonderland,\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nALICE'S ADVENTURES IN WONDERLAND\r\n\r\nLewis Carroll\r\n\r\nTHE MILLENNIUM FULCRUM EDITION 3.0\r\n\r\n\r\...


In [7]:
### Add author to dataframe as column
books_df['author_name'] = gutenberg_authors
books_df

Unnamed: 0,book_text,author_name
Hell,\r\n\r\n\r\n\r\nProduced by Judith Smith and Natalie Salter\r\n\r\n\r\n\r\n\r\nHELL\r\n\r\nOR THE INFERNO FROM THE DIVINE COMEDY\r\n\r\nBY\r\n\r\n...,Dante Alighieri
The Creative Process in the Individual,"INDIVIDUAL***\r\n\r\n\r\nE-text prepared by John Hagerson, Kevin Handy, and Project Gutenberg\r\nDistributed Proofreaders\r\n\r\n\r\n\r\nTHE CREAT...",Thomas Troward
Select Speeches of Kossuth,"\r\n\r\n\r\n\r\nProduced by Keren Vergon, Rich Magahiz and PG Distributed Proofreaders\r\n\r\n\r\n\r\n\r\nSELECT SPEECHES\r\nOF\r\nKOSSUTH.\r\n\r\...",Kossuth
The Apricot Tree,"\r\n\r\n\r\n\r\nProduced by Internet Archive; University of Florida, Children, Sjaani\r\nand the Online Distributed Proofreading Team\r\n\r\n\r\n\...",Unknown
The King James Bible,\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nThe Old Testament of the King James Version of the Bible\r\n\r\n\r\n\r\n\r\nThe First Book...,Unknown Author
King Richard III,\r\n\r\n*******************************************************************\r\nTHIS EBOOK WAS ONE OF PROJECT GUTENBERG'S EARLY FILES PRODUCED AT A...,Unknown Author
"The Mirror of Literature, Amusement, and Instruction","\r\n\r\n\r\n\r\nProduced by Jonathan Ingram, Elaine Walker and the Online Distributed\r\nProofreading Team.\r\n\r\n\r\n\r\n\r\n\r\nTHE MIRROR OF L...",Various
The Lure of San Francisco,\r\n\r\n\r\n\r\nProduced by David A. Schwan <davidsch@earthlink.net>\r\n\r\n\r\n\r\n\r\n\r\nThe Lure of San Francisco\r\n\r\nA Romance Amid Old La...,Elizabeth Gray Potter and Mabel Thayer Gray
Robbery Under Arms,\r\n\r\n\r\n\r\nProduced by Alan R. Light\r\n\r\n\r\n\r\n\r\n\r\nROBBERY UNDER ARMS\r\n\r\nA Story of Life and Adventure in the Bush and in the Go...,"Thomas Alexander Browne, AKA Rolf Boldrewood"
Alice's Adventures in Wonderland,\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nALICE'S ADVENTURES IN WONDERLAND\r\n\r\nLewis Carroll\r\n\r\nTHE MILLENNIUM FULCRUM EDITION 3.0\r\n\r\n\r\...,Lewis Carroll


In [8]:
### Explore components of dataframe
books_df.columns
# books_df.loc['Q and the Magic of Grammar']


Index(['book_text', 'author_name'], dtype='object')

In [9]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text_round1):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text_round1 = text_round1.lower()
    text_round1 = re.sub('\[.*?\]', '', text_round1)
    text_round1 = re.sub('\*', '', text_round1)
    text_round1 = re.sub('\,', '', text_round1)
    text_round1 = re.sub('\;', '', text_round1)
    text_round1 = re.sub('\:', '', text_round1)
    text_round1 = re.sub('\.', '', text_round1)
    text_round1 = re.sub('\"', '', text_round1)
    text_round1 = re.sub('\w*\d\w*', '', text_round1)
    return text_round1

round1 = lambda x: clean_text_round1(x)# Apply the first round of text cleaning 

In [10]:
# Let's take a look at the updated text from round 1
books_df_clean = pd.DataFrame(books_df.book_text.apply(round1))
books_df_clean['author_name'] = gutenberg_authors
books_df_clean['book_text']

Hell                                                           \r\n\r\n\r\n\r\nproduced by judith smith and natalie salter\r\n\r\n\r\n\r\n\r\nhell\r\n\r\nor the inferno from the divine comedy\r\n\r\nby\r\n\r\n...
The Creative Process in the Individual                         individual\r\n\r\n\r\ne-text prepared by john hagerson kevin handy and project gutenberg\r\ndistributed proofreaders\r\n\r\n\r\n\r\nthe creative p...
Select Speeches of Kossuth                                     \r\n\r\n\r\n\r\nproduced by keren vergon rich magahiz and pg distributed proofreaders\r\n\r\n\r\n\r\n\r\nselect speeches\r\nof\r\nkossuth\r\n\r\n\...
The Apricot Tree                                               \r\n\r\n\r\n\r\nproduced by internet archive university of florida children sjaani\r\nand the online distributed proofreading team\r\n\r\n\r\n\r\n...
The King James Bible                                           \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nthe old testament of the king

In [11]:
# Apply a second round of cleaning
def clean_text_round2(text_round2):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text_round2 = re.sub('[‘’“”…]', '', text_round2)
    text_round2 = re.sub('_', '', text_round2)
    text_round2 = re.sub('\n', ' ', text_round2)
    text_round2 = re.sub('\r', ' ', text_round2)
    return text_round2

round2 = lambda x: clean_text_round2(x) # Apply the second round of text cleaning

In [12]:
# Let's take a look at the updated text from round 2
books_df_clean = pd.DataFrame(books_df_clean.book_text.apply(round2))
books_df_clean['author_name'] = gutenberg_authors
books_df_clean['book_text']

Hell                                                                   produced by judith smith and natalie salter          hell    or the inferno from the divine comedy    by    dante alighieri        transla...
The Creative Process in the Individual                         individual      e-text prepared by john hagerson kevin handy and project gutenberg  distributed proofreaders        the creative process in the in...
Select Speeches of Kossuth                                             produced by keren vergon rich magahiz and pg distributed proofreaders          select speeches  of  kossuth      condensed and abridged  w...
The Apricot Tree                                                       produced by internet archive university of florida children sjaani  and the online distributed proofreading team            the    apricot...
The King James Bible                                                                       the old testament of the king james version of the bible 

In [13]:
# Apply a third round to deal with idiomatic and idiosyncratic deliberate mis-spellings

def clean_text_round3(text_round3):
    text_round3 = re.sub("in\'", 'ing', text_round3)
    text_round3 = re.sub("^\'em", 'them', text_round3)
    text_round3 = re.sub('^.?-', '', text_round3)
    return text_round3

round3 = lambda x: clean_text_round3(x) # Apply the third round of text cleaning

In [14]:
# Let's take a look at the updated text from round 3
books_df_clean = pd.DataFrame(books_df_clean.book_text.apply(round3))
books_df_clean['author_name'] = gutenberg_authors
books_df_clean

Unnamed: 0,book_text,author_name
Hell,produced by judith smith and natalie salter hell or the inferno from the divine comedy by dante alighieri transla...,Dante Alighieri
The Creative Process in the Individual,individual e-text prepared by john hagerson kevin handy and project gutenberg distributed proofreaders the creative process in the in...,Thomas Troward
Select Speeches of Kossuth,produced by keren vergon rich magahiz and pg distributed proofreaders select speeches of kossuth condensed and abridged w...,Kossuth
The Apricot Tree,produced by internet archive university of florida children sjaani and the online distributed proofreading team the apricot...,Unknown
The King James Bible,the old testament of the king james version of the bible the first book of moses called genesis in the...,Unknown Author
King Richard III,this ebook was one of project gutenberg's early files produced at a time when proofing methods and tools were not well developed there is ...,Unknown Author
"The Mirror of Literature, Amusement, and Instruction",produced by jonathan ingram elaine walker and the online distributed proofreading team the mirror of literature amusement and ...,Various
The Lure of San Francisco,produced by david a schwan <davidsch@earthlinknet> the lure of san francisco a romance amid old landmarks by elizabe...,Elizabeth Gray Potter and Mabel Thayer Gray
Robbery Under Arms,produced by alan r light robbery under arms a story of life and adventure in the bush and in the goldfields of australia ...,"Thomas Alexander Browne, AKA Rolf Boldrewood"
Alice's Adventures in Wonderland,alice's adventures in wonderland lewis carroll the millennium fulcrum edition chapter i down the rabbit-hole ...,Lewis Carroll


## Organizing The Data

### Corpus

In [15]:
# Let's take a look at our dataframe
books_df_clean

Unnamed: 0,book_text,author_name
Hell,produced by judith smith and natalie salter hell or the inferno from the divine comedy by dante alighieri transla...,Dante Alighieri
The Creative Process in the Individual,individual e-text prepared by john hagerson kevin handy and project gutenberg distributed proofreaders the creative process in the in...,Thomas Troward
Select Speeches of Kossuth,produced by keren vergon rich magahiz and pg distributed proofreaders select speeches of kossuth condensed and abridged w...,Kossuth
The Apricot Tree,produced by internet archive university of florida children sjaani and the online distributed proofreading team the apricot...,Unknown
The King James Bible,the old testament of the king james version of the bible the first book of moses called genesis in the...,Unknown Author
King Richard III,this ebook was one of project gutenberg's early files produced at a time when proofing methods and tools were not well developed there is ...,Unknown Author
"The Mirror of Literature, Amusement, and Instruction",produced by jonathan ingram elaine walker and the online distributed proofreading team the mirror of literature amusement and ...,Various
The Lure of San Francisco,produced by david a schwan <davidsch@earthlinknet> the lure of san francisco a romance amid old landmarks by elizabe...,Elizabeth Gray Potter and Mabel Thayer Gray
Robbery Under Arms,produced by alan r light robbery under arms a story of life and adventure in the bush and in the goldfields of australia ...,"Thomas Alexander Browne, AKA Rolf Boldrewood"
Alice's Adventures in Wonderland,alice's adventures in wonderland lewis carroll the millennium fulcrum edition chapter i down the rabbit-hole ...,Lewis Carroll


In [16]:
# Let's pickle it for later use - original and cleaned
books_df.to_pickle("/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/pickle/books_corpus.pkl")
books_df_clean.to_pickle("/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/pickle/books_clean_corpus.pkl")

### Document-Term Matrix

In [33]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

## First, make custom stopwords list from previously edited built-in list
books_stopwords = pd.read_csv('/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books_stopwords.csv', header=None, encoding='utf-8')
books_stopwords = list(np.squeeze(books_stopwords.values))
    # with open("/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books_stopwords.csv", 'r') as csvfile:
    #     books_stopwords = []
    #     books_stopwords = list(csv.reader(csvfile))

cv = CountVectorizer(stop_words=frozenset(books_stopwords), strip_accents='ascii', token_pattern=r'(?u)\b\w+\b', ngram_range=(1,1))
books_cv = cv.fit_transform(books_df_clean.book_text)
# print(cv.get_feature_names())

## Save resultant words - called feature names - to file
with open("/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books_featurenames_vocabulary.csv", 'w', newline='') as csvfile:
    vocab = csv.writer(csvfile, delimiter='\n')
    vocab.writerow(cv.get_feature_names())

books_dtm = pd.DataFrame(books_cv.toarray(), columns=cv.get_feature_names())
books_dtm.index = books_df_clean.index
books_dtm


Unnamed: 0,aam,aaron,aaronites,aarons,ab,aback,abaddon,abagtha,abana,abandon,...,zuar,zug,zuph,zur,zuriel,zurishaddai,zuyder,zuzims,zythus,zyxrqpcba
Hell,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
The Creative Process in the Individual,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Select Speeches of Kossuth,0,0,0,0,0,0,0,0,0,14,...,0,0,0,0,0,0,0,0,0,0
The Apricot Tree,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The King James Bible,0,350,2,0,0,0,1,1,1,0,...,5,0,3,5,1,5,0,1,0,0
King Richard III,0,96,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"The Mirror of Literature, Amusement, and Instruction",0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Lure of San Francisco,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Robbery Under Arms,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alice's Adventures in Wonderland,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
try:
    print(books_dtm.loc[:, "p"]) ### Check if certain term exists in vocabulary
except:
    print("\nThat text does not exist\n")

Hell                                                             0
The Creative Process in the Individual                           0
Select Speeches of Kossuth                                       2
The Apricot Tree                                                 0
The King James Bible                                             0
King Richard III                                                 1
The Mirror of Literature, Amusement, and Instruction             0
The Lure of San Francisco                                        1
Robbery Under Arms                                               6
Alice's Adventures in Wonderland                                 0
The Tale of Mrs. Tiggy-Winkle                                    0
At the Earth's Core                                              0
Punch, Or The London Charivari, Vol. 146., January 21, 1914      0
Through the Looking-Glass                                        0
Paul Kelver                                                   

In [36]:
### Let's pickle it for later use
books_dtm.to_pickle("/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/pickle/books_dtm.pkl")

In [37]:
### Let's also pickle the cleaned data in dataframe form (before we had put it in document-term matrix format), as well as the CountVectorizer object
books_df_clean.to_pickle('/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/pickle/books_df_clean.pkl')
import pickle
pickle.dump(cv, open("/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/pickle/books_cv.pkl", "wb"))