# VIK DATA CLEANING

- Remember to set Python kernel to 3 (not later).

In [1]:
# Necessary packages
import os, requests, pandas as pd, pickle
from bs4 import BeautifulSoup

## VIK EXPLORATION: File operations

### Essentially copying a file's contents to another file - this works for text files...

with open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/README.txt','r') as sourcefile:
    with open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/DESTFILE.txt','w') as destfile:
        for line in sourcefile:
            destfile.write(line)


### And for an image file...simply append `b` for *binary mode* to file operation command `r`, `w`, or `a`

with open('/Users/vix/OneDrive/Temp/Portrait_Vikram_Before-After.png','rb') as sourceimage:
    with open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/destimage.png','wb') as destimage:
        for line in sourceimage:
            destimage.write(line)



## VIK EXPLORATION: Get book text from Project Gutenberg, save to file, and populate list object

import os
import re
import urllib # Import `urllib` package - primarily using `request` module with `urlopen` method

### os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Texts')
gutenberg_texts = [] # Initialize list

for counter in range(10,25003): # Loop over each book, which is a reference number

    def get_gutenberg_text():
        url = "https://www.gutenberg.org/files/" + str(counter) + "/" + str(counter) + ".txt"

        try: # Check if URL valid
            webpage = urllib.request.urlopen(url) # Open the webpage containing book text

            # Extract book title and author (author TBD) for file name
            linecount = 1
            for line in webpage:
                m = re.search('Title: ',str(line))
                if m:
                    print("Matched!")
                    text = line.decode()
                    booktitle = text[7 : (len(text) - 2)] # Minus 2 at end critical to remove newline character
                linecount += 1 # Advance line counter
            filename = str(counter)+'_'+booktitle+'.txt'
            
            # Write book text to output file
            print("Currently retrieving: " + booktitle + " -- file name: " + filename)
            with open(str('/Users/vix/Repos/Python-Learning/src/NLP/Texts/' + filename),'w') as file:
                webpage = urllib.request.urlopen(url)
                for line in webpage:
                    text = line.decode() # IMP: Extract only text, discarding non-printing characters
                    file.write(text)
            
            # Write book text to list 
            with open(str('/Users/vix/Repos/Python-Learning/src/NLP/Texts/' + filename),'r') as file:
                text = [file.read().replace('\n','')]
                gutenberg_texts.append(text)
                print("Added list item: " + str(len(gutenberg_texts)) + "\n") # Enumerate list count, which is number of books
            return gutenberg_texts


        except: # If URL invalid, means no book at that webpage
            print("URL Not Valid\n")

    gutenberg_texts = get_gutenberg_text() # Call function


## VIK EXPLORATION: Separate function to populate list object using existing files in given folder

In [41]:
### Extract book title, author, & text

import os
import codecs
import re

### Custom function to create `listdir` command that does not show hidden files
def listdir_nohidden(path):
    import glob
    return glob.glob(os.path.join(path, '*'))

os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books') # Set working folder
print("Adding *all* files in " + os.getcwd() + "\n")
gutenberg_titles = [] # Initialize list of titles
gutenberg_authors = [] # Initialize list of authors
gutenberg_texts = [] # Initialize list of texts

def get_gutenberg_text():
    for file in sorted(listdir_nohidden(".")):
        with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as f:

            # First, take title from file name
            title = re.sub("\.txt","",file)
            title = re.sub("\./","",title)
            title = re.sub("^\d+_","",title)
            gutenberg_titles.append(title) # Put book title in list

            # Then, extract author from book text
            author = ""
            for line in f: 
                if re.search('Author: ',str(line)):
                    author = line[8 : (len(line) - 2)] # Minus 2 at end critical to remove newline character
                    gutenberg_authors.append(author) # Put author in list
                    print("Found author: " + author)
                    break
                elif re.search("^BY ",str(line)):
                    author = line[3 : (len(line) -2)]
                    gutenberg_authors.append(author) # Put author in list
                    print("Found author: " + author)
                    break
            if not author:
                print("No Author Match for " + title)
                gutenberg_authors.append("Unknown Author") # If no author found

            # Finally, read all text, beginning with official "START OF" line

            f.seek(0)
            started = False
            collected_lines = []
            for i, line in enumerate(f.readlines()):
                    if re.search('^\*\*\*.?START OF',str(line)):
                        started = True
                        print ("started at line", i) 
                        continue
                    if started and re.search('^\*\*\*.?END OF',str(line)):
                        print ("end at line", i)
                        break
                    collected_lines.append(line)

            # f.seek(0)
            # textstart = 0
            # for line in f:
            #     if re.search('^\*\*\*.?START OF',str(line)):
            #         textstart = textstart + len(line) + 4 # Find start position
            #         text = f.readline()
            #         if re.search('^\*\*\*.?END OF',str(line)):
            #             textend = textstart + len(line) + 4 # Find end position
            #             break
            #     else:
            #         textstart = textstart + len(line)
            #         f.seek(textstart)
            
            # text = f.read() # Begin reading at current value of `textstart`
            
            # if not text: # In case no starting point found, take it all
            #     f.seek(0)
            #     text = f.read()

            # text = text.replace('\n',' ')
            # text = text.replace('\r','')
            
            # gutenberg_texts.append(text)
            
            print("Added " + str(len(gutenberg_texts)) + ": " + title + "\n") # Enumerate running count of books

get_gutenberg_text() # Call function


Adding *all* files in /Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books

Found author: Dante Alighieri
started at line 19
end at line 5309
Added 0: Hell

Found author: Thomas Troward
started at line 24
end at line 3365
Added 0: The Creative Process in the Individual

Found author: Kossuth
started at line 18
end at line 15210
Added 0: Select Speeches of Kossuth

Found author: Unknown
started at line 19
end at line 803
Added 0: The Apricot Tree

No Author Match for The King James Bible
started at line 27
end at line 99873
Added 0: The King James Bible

No Author Match for King Richard III
Added 0: King Richard III

Found author: Various
started at line 20
end at line 1572
Added 0: The Mirror of Literature, Amusement, and Instruction

Found author: Elizabeth Gray Potter and Mabel Thayer Gray
started at line 20
end at line 2455
Added 0: The Lure of San Francisco

Found author: Thomas Alexander Browne, AKA Rolf Boldrewood
started at line 20
end at line 20267
Added 0: Ro

# Extract book author

import os
import codecs
import re

### Custom function to create `listdir` command that does not show hidden files
def listdir_nohidden(path):
    import glob
    return glob.glob(os.path.join(path, '*'))

os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books') # Set working folder
print("Adding *all* files in " + os.getcwd() + "\n")
gutenberg_authors = [] # Initialize list of book titles

for file in sorted(listdir_nohidden(".")):
    with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as f:
        author = ""
        linecount = 1
        for line in f:
            if re.search('Author: ',str(line)):
                print("Matched!")
                author = line[8 : (len(line) - 2)] # Minus 2 at end critical to remove newline character
                gutenberg_authors.append(author) # Put authors in list
            linecount += 1 # Advance line counter
        if not author:
            print("No Match for " + file)
            gutenberg_authors.append("Unknown Author") # If no author found


### Useful code to display beginnings of each list item as preview
[book[0][:100] for book in gutenberg_texts]

### Similar code for items in dictionary form
{}

### Useful code to convert MS Word document to text file

import docx2txt
converted_text = docx2txt.process(filename.docx)
with open(Filename.txt, 'w') as file:
    file.write(converted_text)


### Custom function to create `listdir` command that does not show hidden files

def listdir_nohidden(path):
    import glob
    return glob.glob(os.path.join(path, '*'))


### Pickle files for later use - alternative to `csv_writer()`?

### Make a new directory to hold the text files
!mkdir transcripts

for i, c in enumerate(comedians):
    with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file) ### Indexing into the `transcripts` array/list

In [None]:
### Load pickled files into dictionary data container
#### - Modified code to simply open file; unsure about need for Pickle

### Create dictionary data container which can hold book text as well as title, from respective list objects
books = {} # `{}` signifies a dictionary
for i, title in enumerate(gutenberg_titles):
    with open(title + ".txt", 'r') as f:
        books[title] = f.read()
# books = {'title': gutenberg_titles, 'text': gutenberg_texts}
# print(len(books))
# for key, value in books.items():
#     print(key, value)

In [None]:
# Double check to make sure data has been loaded properly
books.keys()

In [None]:
# More checks
books['Q and the Magic of Grammar'][:1000]

## Cleaning The Data

When dealing with numerical data, data cleaning often involves removing null values and duplicate data, dealing with outliers, etc. With text data, there are some common data cleaning techniques, which are also known as text pre-processing techniques.

With text data, this cleaning process can go on forever. There's always an exception to every cleaning step. So, we're going to follow the MVP (minimum viable product) approach - start simple and iterate. Here are a bunch of things you can do to clean your data. We're going to execute just the common cleaning steps here and the rest can be done at a later point to improve our results.

**Common data cleaning steps on all text:**
* Make text all lower case
* Remove punctuation
* Remove numerical values
* Remove common non-sensical text (/n)
* Tokenize text
* Remove stop words

**More data cleaning steps after tokenization:**
* Stemming / lemmatization
* Parts of speech tagging
* Create bi-grams or tri-grams
* Deal with typos
* And more...

In [None]:
### Let's take a look at our data again
### next(iter(books.keys()))
books.keys()

In [None]:
### Notice that our dictionary is currently in key: book title, value: list of text format
#### - for some reason our values are not in list of text form - they are already string
### next(iter(books.values()))
books.items()

### - We are going to change this to key: book title, value: string format - NOT USED
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ''.join(list_of_text)
    return combined_text

### Combine it!
books_combined = {key: [combine_text(value)] for (key, value) in books.items()}

In [None]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
pd.set_option('max_colwidth',150)

books_df = pd.DataFrame.from_dict(books,orient='index')
books_df.columns = ['book_text']
books_df = books_df.sort_index()
books_df

In [None]:
# Let's take a look at the text for title Q and the Magic of Grammar
books_df.book_text.loc['Q and the Magic of Grammar']

In [None]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text_round1):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text_round1 = text_round1.lower()
    text_round1 = re.sub('\[.*?\]', '', text_round1)
    text_round1 = re.sub('[%s]' % re.escape(string.punctuation), '', text_round1)
    text_round1 = re.sub('\w*\d\w*', '', text_round1)
    return text_round1

round1 = lambda x: clean_text_round1(x)# Apply a first round of text cleaning techniques

In [None]:
# Let's take a look at the updated text from round 1
books_clean = pd.DataFrame(books_df.book_text.apply(round1))
books_clean

In [None]:
# Apply a second round of cleaning
def clean_text_round2(text_round2):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text_round2 = re.sub('[‘’“”…]', '', text_round2)
    text_round2 = re.sub('\n', ' ', text_round2)
    return text_round2

round2 = lambda x: clean_text_round2(x)

In [None]:
# Let's take a look at the updated text
books_clean = pd.DataFrame(books_clean.book_text.apply(round2))
books_clean

## Organizing The Data

### Corpus

In [None]:
# Let's take a look at our dataframe
books_df

In [None]:
### Let's add the author's name as well

books_df['book_author'] = gutenberg_authors
books_df

In [None]:
# Let's pickle it for later use
books_df.to_pickle("books_corpus.pkl")

### Document-Term Matrix

In [None]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
books_cv = cv.fit_transform(books_clean.book_text)
books_dtm = pd.DataFrame(books_cv.toarray(), columns=cv.get_feature_names())
books_dtm.index = books_clean.index
books_dtm

In [None]:
### Let's pickle it for later use
books_dtm.to_pickle("books_dtm.pkl")

In [None]:
### Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
books_clean.to_pickle('books_clean.pkl')
pickle.dump(cv, open("books_cv.pkl", "wb"))