# VIK DATA CLEANING

- Remember to set Python kernel to 3 (not later).
- Install additional packages `textblob`, `wordcloud`, and `gensim`.

## Import packages for scraping webpage contents and making sense of them

import os, requests
from bs4 import BeautifulSoup

## VIK EXPLORATION: File operations

### Essentially copying a file's contents to another file - this works for text files...

# Copy README.txt into DESTFILE.txt in text mode; both files are closed when the block exits.
with open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/README.txt','r') as sourcefile, \
     open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/DESTFILE.txt','w') as destfile:
    # writelines() consumes the source file one line at a time and writes each line unchanged
    destfile.writelines(sourcefile)


### And for an image file...simply append `b` for *binary mode* to file operation command `r`, `w`, or `a`

import shutil  # imported here so this cell runs on its own

# Copy the image byte-for-byte. Both files are opened in binary mode (`rb` / `wb`).
# Binary data has no real "lines", so the copy is done in fixed-size chunks
# rather than by iterating over the file.
with open('/Users/vix/OneDrive/Temp/Portrait_Vikram_Before-After.png','rb') as sourceimage, \
     open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/destimage.png','wb') as destimage:
    shutil.copyfileobj(sourceimage, destimage)



## VIK EXPLORATION: Get book text from Project Gutenberg, save to file, and populate list object

import os
import re
import urllib # Import `urllib` package - primarily using `request` module with `urlopen` method

### os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Texts')
import urllib.error
import urllib.request  # plain `import urllib` does not guarantee the `request` submodule is loaded

# Folder that receives one text file per downloaded book
GUTENBERG_TEXTS_DIR = '/Users/vix/Repos/Python-Learning/src/NLP/Texts/'

gutenberg_texts = [] # One single-item list per downloaded book


def get_gutenberg_text(counter):
    """Download one Project Gutenberg book, save it to disk and return its text.

    Args:
        counter: Project Gutenberg reference number of the book.

    Returns:
        The book text as one string with line breaks replaced by spaces, or
        ``None`` when the book could not be downloaded or saved.
    """
    url = "https://www.gutenberg.org/files/" + str(counter) + "/" + str(counter) + ".txt"

    # Download once; the same bytes provide the title, the saved file and the list entry.
    try:
        with urllib.request.urlopen(url) as webpage:
            raw = webpage.read()
    except OSError:
        # URLError/HTTPError are OSError subclasses; dropped connections land here too.
        print("URL Not Valid\n")
        return None

    # Undecodable bytes are dropped, as in the file-based loader later in this notebook.
    text = raw.decode('utf-8', errors='ignore')

    # The title is taken from the first line that starts with "Title: ".
    match = re.search(r'^Title: (.*)$', text, flags=re.MULTILINE)
    booktitle = match.group(1).strip() if match else 'Untitled'
    # Characters that are unsafe in file names must not leak into the path.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', booktitle)
    filename = str(counter) + '_' + safe_title + '.txt'

    print("Currently retrieving: " + booktitle + " -- file name: " + filename)
    try:
        # newline='' keeps the downloaded line endings exactly as received
        with open(GUTENBERG_TEXTS_DIR + filename, 'w', encoding='utf-8', newline='') as file:
            file.write(text)
    except OSError as error:
        print("Could not save " + filename + ": " + str(error) + "\n")
        return None

    # Line breaks become spaces so the last word of a line is not glued to the next one.
    return text.replace('\r\n', '\n').replace('\r', '\n').replace('\n', ' ')


for counter in range(10,25003): # Loop over each book, which is a reference number
    book_text = get_gutenberg_text(counter)
    if book_text is None:
        continue  # Nothing retrieved for this number; keep the books collected so far
    gutenberg_texts.append([book_text])
    print("Added list item: " + str(len(gutenberg_texts)) + "\n") # Running count of books


## VIK EXPLORATION: Separate function to populate list object using existing files in given folder

import os
import codecs
import re

### Custom function to create `listdir` command that does not show hidden files
def listdir_nohidden(path):
    """Return the entries of *path* matched by the ``*`` glob pattern.

    The ``*`` wildcard does not match names that begin with a dot, so hidden
    files are left out. Each returned entry is *path* joined with the name.
    """
    from glob import glob

    visible_pattern = os.path.join(path, '*')
    return glob(visible_pattern)

# Make the books folder the working directory; the loader below lists files relative to "."
os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books')
print("Adding *all* files in " + os.getcwd() + "\n")
# Start from empty containers: one for the book texts, one for the book titles
gutenberg_texts, gutenberg_titles = [], []

def get_gutenberg_text():
    """Load every visible file of the working folder into the module-level lists.

    Files are processed in sorted path order, each one read once. For every file:

    * the text of the first line starting with ``Title: `` is appended to
      ``gutenberg_titles`` (a file without such a line adds no title);
    * the whole text, with newlines replaced by spaces, is appended to
      ``gutenberg_texts`` as a one-element list.

    Returns nothing; the results are the two lists it extends.
    """
    title_prefix = 'Title: '

    for file in sorted(listdir_nohidden(".")):
        with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()

        # Only a line that *starts* with the prefix is a title line, and only the
        # first one counts, so one file can never contribute several titles.
        for line in content.split('\n'):
            if line.startswith(title_prefix):
                print("Matched!")
                # strip() removes the trailing "\r" of CRLF files as well as stray blanks
                gutenberg_titles.append(line[len(title_prefix):].strip())
                break

        gutenberg_texts.append([content.replace('\n', ' ')])
        print(str(len(gutenberg_texts)) + ": " + file) # Running count of books

get_gutenberg_text() # Call function: fills gutenberg_titles and gutenberg_texts


# Title added by hand after the automatic pass.
# NOTE(review): presumably this book's file has no "Title: " line for the loader to find --
# confirm, and check that this last position lines up with the book's entry in gutenberg_texts.
gutenberg_titles.append("Q and the Magic of Grammar")

### Useful code to display beginnings of each list item as preview
# Each list item is itself a one-element list, so book[0] is the book's text; show its first 100 characters
[book[0][:100] for book in gutenberg_texts]

### Similar code for items in dictionary form
{}

### Useful code to convert MS Word document to text file

import docx2txt
# "filename.docx" and "Filename.txt" are placeholder paths -- replace them with real ones.
# They must be quoted strings; unquoted they are looked up as variables and raise NameError.
converted_text = docx2txt.process("filename.docx")
# Explicit UTF-8 so characters outside the platform default encoding can be written.
with open("Filename.txt", 'w', encoding='utf-8') as file:
    file.write(converted_text)


### Custom function to create `listdir` command that does not show hidden files

def listdir_nohidden(path):
    """List the non-hidden entries of *path* (same helper as defined earlier in this notebook).

    Uses the ``*`` glob pattern, which skips names starting with a dot; the
    returned entries are *path* joined with each matching name.
    """
    from glob import glob

    pattern = os.path.join(path, '*')
    matches = glob(pattern)
    return matches


## **STOP** - Resume NLP lesson

# Necessary packages
import requests
from bs4 import BeautifulSoup
import pickle

# User function to scrape transcript data from scrapsfromtheloft.com
def url_to_transcript(url):
    '''Return the transcript paragraphs found at *url*.

    Downloads the page, finds the element with class "post-content" and
    returns the text of every <p> inside it as a list of strings. The URL is
    printed once the page has been parsed.

    Raises:
        requests.RequestException: the request failed, timed out or the
            server answered with an error status.
        ValueError: the page has no element with class "post-content".
    '''
    # Without a timeout a stalled server would block the whole scrape indefinitely.
    response = requests.get(url, timeout=30)
    # Fail here on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    content = soup.find(class_="post-content")
    if content is None:
        raise ValueError("No 'post-content' element found at " + url)
    text = [p.text for p in content.find_all('p')]
    print(url)
    return text

# URLs of transcripts in scope
# `urls` and `comedians` are parallel lists: entry i of one belongs to entry i of the other.
# The pickling loop further down pairs them by index, so keep both in the same order.
urls = ['http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/',
        'http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/',
        'http://scrapsfromtheloft.com/2018/03/15/ricky-gervais-humanity-transcript/',
        'http://scrapsfromtheloft.com/2017/08/07/bo-burnham-2013-full-transcript/',
        'http://scrapsfromtheloft.com/2017/05/24/bill-burr-im-sorry-feel-way-2014-full-transcript/',
        'http://scrapsfromtheloft.com/2017/04/21/jim-jefferies-bare-2014-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/02/john-mulaney-comeback-kid-2015-full-transcript/',
        'http://scrapsfromtheloft.com/2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/',
        'http://scrapsfromtheloft.com/2017/09/19/ali-wong-baby-cobra-2016-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/03/anthony-jeselnik-thoughts-prayers-2015-full-transcript/',
        'http://scrapsfromtheloft.com/2018/03/03/mike-birbiglia-my-girlfriends-boyfriend-2013-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/19/joe-rogan-triggered-2016-full-transcript/']

# Comedian first names, used as the base of each output file name
comedians = ['louis', 'dave', 'ricky', 'bo', 'bill', 'jim', 'john', 'hasan', 'ali', 'anthony', 'mike', 'joe']

# Actually perform scrape of contents of scrapsfromtheloft.com

# One result per URL, in the same order as `urls`; every call fetches a page over the network
transcripts = [url_to_transcript(u) for u in urls]

# Pickle files for later use - alternative to `csv_writer()`?

## Make a new directory to hold the text files
# Plain-Python replacement for the `!mkdir` shell escape: runs outside IPython as well
# and does not complain when the folder already exists.
os.makedirs("transcripts", exist_ok=True)

# One file per comedian. The files hold pickled data (binary) despite the .txt extension,
# so they must be read back with pickle.load on a file opened in "rb" mode.
for i, c in enumerate(comedians):
    with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file) # transcripts[i] is the transcript scraped for comedians[i]

### Load pickled files - modified code to simply open file; unsure about need for Pickle
### Create dictionary data container which can hold book text as well as title

# Map each book title to the full text of "<title>.txt" in the working folder.
books = {} # `{}` signifies a dictionary
for title in gutenberg_titles:
    # Same decoding as the loader above: UTF-8, silently dropping undecodable bytes.
    with open(title + ".txt", 'r', encoding='utf-8', errors='ignore') as f:
        books[title] = f.read()

In [None]:
# Alternative container: a two-key dictionary holding the title list and the text list.
# NOTE: this rebinds `books`, replacing the title -> text dictionary built in the previous cell,
# so len(books) is 2 here and the loop prints the two whole lists.
books = {'title': gutenberg_titles, 'text': gutenberg_texts}
print(len(books))
for key, value in books.items():
    print(key, value)

In [12]:
# Double check to make sure data has been loaded properly
books.keys()

dict_keys(["Alice's Adventures in Wonderland", 'Paradise Lost', 'Peter Pan', 'The 1990 CIA World Factbook', 'The Book Of Mormon', 'The Federalist Papers', 'The Hunting of the Snark', 'The King James Bible', 'The Song Of Hiawatha', 'Through the Looking-Glass', 'Q and the Magic of Grammar'])

In [13]:
# More checks
books['The King James Bible'][:100]

'The Project Gutenberg EBook of The King James Bible\n\n\n**********************************************'

## Cleaning The Data

When dealing with numerical data, data cleaning often involves removing null values and duplicate data, dealing with outliers, etc. With text data, there are some common data cleaning techniques, which are also known as text pre-processing techniques.

With text data, this cleaning process can go on forever. There's always an exception to every cleaning step. So, we're going to follow the MVP (minimum viable product) approach - start simple and iterate. Here are a bunch of things you can do to clean your data. We're going to execute just the common cleaning steps here and the rest can be done at a later point to improve our results.

**Common data cleaning steps on all text:**
* Make text all lower case
* Remove punctuation
* Remove numerical values
* Remove common nonsensical text (\n)
* Tokenize text
* Remove stop words

**More data cleaning steps after tokenization:**
* Stemming / lemmatization
* Parts of speech tagging
* Create bi-grams or tri-grams
* Deal with typos
* And more...

In [16]:
# Let's take a look at our data again
next(iter(books.keys()))

"Alice's Adventures in Wonderland"

In [17]:
# Notice that our dictionary is currently in key: book title, value: list of text format
next(iter(books.values()))

he shepherd boy--and the sneeze of the baby, the\nshriek of the Gryphon, and all the other queer noises, would change (she\nknew) to the confused clamour of the busy farm-yard--while the lowing\nof the cattle in the distance would take the place of the Mock Turtle\'s\nheavy sobs.\n\nLastly, she pictured to herself how this same little sister of hers\nwould, in the after-time, be herself a grown woman; and how she would\nkeep, through all her riper years, the simple and loving heart of her\nchildhood: and how she would gather about her other little children, and\nmake THEIR eyes bright and eager with many a strange tale, perhaps even\nwith the dream of Wonderland of long ago: and how she would feel with\nall their simple sorrows, and find a pleasure in all their simple joys,\nremembering her own child-life, and the happy summer days.\n\n              THE END\n\n\n\n\n\nEnd of Project Gutenberg\'s Alice\'s Adventures in Wonderland, by Lewis Carroll\n\n*** END OF THIS PROJECT GUTENBERG EB

In [28]:
# We are going to change this to key: book title, value: string format
def combine_text(list_of_text):
    '''Concatenate the strings in list_of_text, with no separator, into a single string.'''
    separator = ''
    return separator.join(list_of_text)

In [29]:
# Combine it!
# Same keys as `books`; each value becomes a one-element list holding the combined string,
# which is what lets the dataframe cell below end up with a single text column.
books_combined = {key: [combine_text(value)] for (key, value) in books.items()}

In [30]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
# Show up to 150 characters per cell when a dataframe is displayed
pd.set_option('max_colwidth',150)

# The dictionary keys start out as columns; transposing turns every book into a row.
# Rows are then ordered by index and the single column is named.
books_df = pd.DataFrame.from_dict(books_combined).T.sort_index()
books_df.columns = ['book_text']
books_df

Unnamed: 0,book_text
Alice's Adventures in Wonderland,"Project Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost..."
Paradise Lost,"The Project Gutenberg EBook of Paradise Lost, by John Milton\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restri..."
Peter Pan,"The Project Gutenberg EBook of Peter Pan, by James M. Barrie\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restri..."
Q and the Magic of Grammar,Q\n\nand\n\nthe Magic of Grammar\n\n\n\nBy Amal Fabian\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nQ and the Magic of Grammar\n\n\n\nCopyright ©Amal Fa...
The 1990 CIA World Factbook,"The Project Gutenberg EBook of The 1990 CIA World Factbook, by \nUnited States. Central Intelligence Agency\n\nThis eBook is for the use of anyon..."
The Book Of Mormon,"The Project Gutenberg EBook of The Book Of Mormon, by Anonymous\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no res..."
The Federalist Papers,"The Project Gutenberg EBook of The Federalist Papers, by \nAlexander Hamilton and John Jay and James Madison\n\nThis eBook is for the use of anyon..."
The Hunting of the Snark,"The Project Gutenberg EBook of The Hunting of the Snark, by Lewis Carroll\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalm..."
The King James Bible,The Project Gutenberg EBook of The King James Bible\n\n\n**********************************************************************\nEBOOK (#10) WAS O...
The Song Of Hiawatha,"The Project Gutenberg EBook of The Song Of Hiawatha, by Henry W. Longfellow\n\nThis eBook is for the use of anyone anywhere at no cost and with\na..."


In [31]:
# Let's take a look at the text for Amal Fabian
books_df.book_text.loc['Q and the Magic of Grammar']

 spell-checker owls perched on a branch of an oak tree. In the tree, there was a hole. And in the hole, there was a scroll.   \n\n\t“I want to get this over and done with. Don’t look so worried. Soon you will be with your precious flower. Q of Alphabet village… Are you ready for the first question?” asked K. \n\n\t“Yes.”     \n\n\tThe three spell-checker owls nodded at each other. Clutching the edge of the scroll in its beak, an owl pulled the scroll out of the tree. The owl flew to the other side of the river, and it let go of the scroll when it was above K’s eager hands.  \n\n\t“The thirteen questions... These were prepared by a committee of spell-checker owls. I had nothing to do with it. Nothing at all,” said K.\n\n\tHe broke the red seal and unfurled the scroll. \n\n\t“The first question is about the zero article. Is this correct? We believe in peace.” \n\n\tThe word-pecker hovered above K. In its beak was the special grammatica flower. The bird bit hard into the flower’s stem.\n\

In [None]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Apply the first round of text cleaning and return the cleaned string.

    Steps, in order:
    1. lowercase the text;
    2. remove text in square brackets (a bracket pair split across lines is
       not matched, because "." does not match a newline);
    3. remove every character in string.punctuation;
    4. remove words that contain a digit.
    '''
    text = text.lower()
    # Raw strings keep the backslashes for the regex engine and avoid the
    # "invalid escape sequence" warning of plain string literals.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Alias used with DataFrame.apply; calling round1(x) is the same as clean_text_round1(x)
round1 = clean_text_round1

In [None]:
# Let's take a look at the updated text
# Run the first cleaning round over every value of the `transcript` column.
# NOTE(review): `data_df` is not defined in the cells shown here (the dataframe built above is
# `books_df` with a `book_text` column) -- confirm which dataframe and column this should use.
data_clean = pd.DataFrame(data_df.transcript.apply(round1))
data_clean

In [None]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Second cleaning pass: strip curly quotes, the ellipsis character and newlines.'''
    # Patterns are applied one after the other; every match is deleted.
    for pattern in ('[‘’“”…]', '\n'):
        text = re.sub(pattern, '', text)
    return text


def round2(x):
    '''Thin wrapper around clean_text_round2, for use with DataFrame.apply.'''
    return clean_text_round2(x)

In [None]:
# Let's take a look at the updated text
# Run the second cleaning round over the output of the first round (column `transcript`).
# NOTE(review): depends on `data_clean` from the round-one cell, which itself reads the
# undefined `data_df` -- confirm the intended dataframe and column name.
data_clean = pd.DataFrame(data_clean.transcript.apply(round2))
data_clean

## Organizing The Data

### Corpus

In [None]:
# Let's take a look at our dataframe
data_df

In [None]:
# Let's add the comedians' full names as well
# Full names, listed alphabetically by first name.
# NOTE(review): the assignment below is positional, so this order has to match the row order
# of `data_df`; `data_df` is not defined in the cells shown here -- confirm before relying on it.
full_names = ['Ali Wong', 'Anthony Jeselnik', 'Bill Burr', 'Bo Burnham', 'Dave Chappelle', 'Hasan Minhaj',
              'Jim Jefferies', 'Joe Rogan', 'John Mulaney', 'Louis C.K.', 'Mike Birbiglia', 'Ricky Gervais']

data_df['full_name'] = full_names
data_df

In [None]:
# Let's pickle it for later use
# Writes the dataframe to "corpus.pkl", a relative path resolved against the current working folder.
# NOTE(review): `data_df` is not defined in the cells shown here -- confirm the dataframe to save.
data_df.to_pickle("corpus.pkl")