# VIK DATA CLEANING

- Remember to set Python kernel to 3 (not later).
- Install additional packages `textblob`, `wordcloud`, and `gensim`.

## VIK EXPLORATION: Search Engine Results

### Import packages for scraping webpage contents and making sense of them

import os
import requests
from bs4 import BeautifulSoup

### Create variables and lists to serve as argument placeholders for scraping

- #### `get` works in conjunction with `requests`.
- #### `BeautifulSoup` must have a particular HTML element from a webpage to work on.
  + ##### In this case, `class="post-content"`, and the *p* is from `<p style=...>`.
  + ##### Each website might have its own HTML structure, so you might need a different `soup.find` argument for each site being scraped.  
    * ##### E.g., `class_="css-53u6y8"` works for a NYTimes.com article, along with  *p* which is standard in HTML to represent a paragraph of text.
    * ##### E.g., `class_="repo-list"` works for GitHub search results.
    * ##### E.g., `li class_="b_algo"` with *a* works for Bing search results.


### Ask user for input

# Start from an empty terminal so the greeting is the first thing the user sees.
os.system('clear')

# The greeting is assembled from adjacent string literals, one sentence
# fragment per line; the printed text is a single string.
intro_message = (
    "\n\nHello there!  \n\n\n"
    "This tool takes your search query, \n\n"
    "applies it to both major search engines, \n\n"
    "and then displays a simple comparison of the resulting search engine results.  \n\n"
    "Analysis is limited to the first 100 results from each search engine."
)
print(intro_message)

# Free-text query typed by the user; later cells send it to both search engines.
search_query = input("\n\n\nWhat should we search for? ")

### Set page URLs
# Let `requests` build the query string via `params=` so that characters such
# as `&`, `#`, `+` and non-ASCII text in the user's query are percent-encoded
# instead of corrupting the URL.
# Despite their names, `query_url_bing` / `query_url_google` hold the
# downloaded HTML text (`.text`), not a URL.
# NOTE(review): the greeting says analysis is limited to the first 100
# results, but 1000 are requested here - confirm which is intended.
query_url_bing = requests.get('https://www.bing.com/search', params={'q': search_query, 'count': 1000}).text
query_url_google = requests.get('https://www.google.com/search', params={'q': search_query, 'num': 1000}).text

#### Pass the downloaded HTML text (not the URL) into `BeautifulSoup`
# The two pages are parsed with different parsers: lxml for Bing, the
# built-in html.parser for Google.
query_html_bing = BeautifulSoup(query_url_bing, 'lxml')
query_html_google = BeautifulSoup(query_url_google, "html.parser")

### Test page extraction results
#### Display well-formatted HTML results
##### print (source_html.prettify())

#### Extract one result title
##### result_title = source_html.find('div', class_='f4 text-normal').text
#### Extract one result description
##### result_desc = source_html.find('p', class_='mb-1').text

#### Display one (ie, first) result and title
##### print (result_title)
##### print (result_desc)


### Prepare output file

import csv # CSV module
from datetime import datetime

#### Check existing records in file, to which we will append

##### with open('/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/Search_Results_Combined.csv','r') as csv_file:
#####     csv_reader = csv.reader(csv_file)
#####     for line in csv_reader:
#####         print(line)


### Extract each search result's title and append to CSV file - note `'a'` argument for append

#### Because `csv.writer` is used in append mode, no header row is needed; for a new file, first write the header (four columns, matching the rows written below):
##### csv_writer.writerow(['Timestamp','Search_Engine','Search_Query','Result_Title'])

# Append one row per search result: [timestamp, engine, query, result title].
# `newline=''` is what the csv module asks for on files handed to csv.writer,
# so the writer controls line endings itself.
with open('/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/Search_Results_Combined.csv', 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Bing: each organic result is an <li class="b_algo">; its first <a> holds the title.
    for result in query_html_bing.find_all('li', attrs = {'class':'b_algo'}):
        title_link = result.find('a')
        if title_link is None:
            continue # Result item without a link: nothing to record
        result_title_bing = title_link.text
        csv_writer.writerow([datetime.today(),"Bing",search_query,result_title_bing])

    # Google: titles are looked up as <h3 class="zBAuLc"> wrapping a <div>.
    for result in query_html_google.find_all('h3', attrs = {'class':'zBAuLc'}):
        title_div = result.find('div')
        if title_div is None:
            continue # Heading without the expected <div>: nothing to record
        result_title_google = title_div.text
        csv_writer.writerow([datetime.today(),"Google",search_query,result_title_google])

# No explicit close() is needed: the `with` block closes the file on exit.



## VIK EXPLORATION: Compare two text documents

import urllib
import os

# Work inside the folder that holds the two documents to compare.
os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Texts')

# Treat each file as a set of lines and keep the lines that occur in exactly
# one of the two files (order and duplicate lines are not considered).
with open('base.txt', 'r') as file1, open('comparison.txt', 'r') as file2:
    difference = set(file1).symmetric_difference(file2)

# Each line still carries its own trailing newline, so print() leaves a blank
# line after every entry.
for unmatched_line in difference:
    print(unmatched_line)


## VIK EXPLORATION: Retrieve contents of text **file** at any URL and save to disk

## VIK EXPLORATION: File operations

### Essentially copying a file's contents to another file - this works for text files...

# Copy README.txt to DESTFILE.txt line by line (text mode); an existing
# DESTFILE.txt is overwritten.
with open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/README.txt','r') as sourcefile, \
        open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/DESTFILE.txt','w') as destfile:
    # writelines() consumes the source file iterator and writes each line unchanged.
    destfile.writelines(sourcefile)


### And for an image file...simply append `b` for *binary mode* to file operation command `r`, `w`, or `a`

# Same copy as for the text file, but in binary mode (`rb` / `wb`) so the
# bytes are transferred without any text decoding.
with open('/Users/vix/OneDrive/Temp/Portrait_Vikram_Before-After.png','rb') as sourceimage, \
        open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/destimage.png','wb') as destimage:
    # Iterating a binary file yields byte chunks split at b'\n'; writing them
    # back in order reproduces the file.
    destimage.writelines(sourceimage)



## VIK EXPLORATION: Get book text from Project Gutenberg, save to file, and populate list object

In [None]:
import os
import re
import urllib # Import `urllib` package - primarily using `request` module with `urlopen` method

# os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Texts')
import urllib.error
import urllib.request # `import urllib` alone does not guarantee the `request` submodule is loaded

gutenberg_texts = [] # Initialize list


def get_gutenberg_text():
    '''Download one Project Gutenberg book, save it to disk, and record its text.

    The book number is read from the module-level loop variable `counter`.
    The text is downloaded once, written to `<counter>_<title>.txt` in the
    Texts folder, and appended to the module-level `gutenberg_texts` list as a
    one-element list holding the text with all line breaks removed.

    Returns:
        The module-level `gutenberg_texts` list. It is returned on every path,
        including when the book is skipped, so the caller's reassignment can
        never replace the list with `None`.
    '''
    url = "https://www.gutenberg.org/files/" + str(counter) + "/" + str(counter) + ".txt"

    # Download the whole book once; the same bytes serve title lookup, file and list.
    try:
        with urllib.request.urlopen(url) as webpage:
            raw = webpage.read()
    except urllib.error.URLError: # Includes HTTPError, e.g. no book at that address
        print("URL Not Valid\n")
        return gutenberg_texts
    except OSError as err: # Connection problems while reading the response
        print("Download failed for " + url + ": " + str(err) + "\n")
        return gutenberg_texts

    try:
        text = raw.decode() # UTF-8, as in the per-line decode this replaces
    except UnicodeDecodeError:
        print("Skipping " + url + ": text is not valid UTF-8\n")
        return gutenberg_texts

    # Extract book title (author TBD) for the file name.
    booktitle = "Untitled" # Used when the text has no "Title: " line
    for line in text.splitlines():
        if line.startswith("Title: "):
            print("Matched!")
            # strip() removes the line ending whether it is '\r\n' or '\n'
            booktitle = line[len("Title: "):].strip()
            break

    # Path separators inside a title would otherwise point into a missing sub-folder.
    filename = str(counter) + '_' + re.sub(r'[\\/]', '_', booktitle) + '.txt'

    # Write book text to output file
    print("Currently retrieving: " + booktitle + " -- file name: " + filename)
    with open('/Users/vix/Repos/Python-Learning/src/NLP/Texts/' + filename, 'w', encoding='utf-8') as file:
        file.write(text)

    # Write book text to list, with every line break ('\r\n', '\r' or '\n') removed
    gutenberg_texts.append([text.replace('\r', '').replace('\n', '')])
    print("Added list item: " + str(len(gutenberg_texts)) + "\n") # Enumerate list count, which is number of books
    return gutenberg_texts


for counter in range(10,25003): # Loop over each book, which is a reference number
    gutenberg_texts = get_gutenberg_text() # Call function


## VIK EXPLORATION: Separate function to populate list object using existing files in given folder

In [None]:
import os
import codecs

# Every later relative path (including os.listdir() with no argument) is
# resolved against this folder.
os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Texts/')
print("Adding *all* files in " + os.getcwd() + "\n")

# Start from an empty list; the reader function below appends to it.
gutenberg_texts = list()

def get_gutenberg_text():
    '''Read every regular file in the current working directory into `gutenberg_texts`.

    Files are visited in sorted name order. Each file is decoded as UTF-8
    (undecodable bytes are dropped) and appended to the module-level
    `gutenberg_texts` list as a one-element list holding the text with all
    line breaks removed. Sub-directories are skipped.

    Returns:
        The module-level `gutenberg_texts` list (the same object that was
        appended to).
    '''
    for name in sorted(os.listdir()):
        if not os.path.isfile(name):
            continue # Directories cannot be opened as text and would raise
        with codecs.open(name, 'r', encoding='utf-8', errors='ignore') as f:
            # codecs.open does no newline translation, so '\r' has to be
            # removed explicitly as well as '\n'.
            text = [f.read().replace('\r','').replace('\n','')]
        gutenberg_texts.append(text)
        print(str(len(gutenberg_texts)) + ": " + name) # Enumerate list count, which is running count of books
    return gutenberg_texts

# Run the reader; it appends to the module-level list and returns that same list.
gutenberg_texts = get_gutenberg_text() # Call function


In [2]:
print(gutenberg_texts[-1])

In [None]:
# Scratch demo of list mutation: print the list after each change.
testlist = ['one two three','four five six','seven eight nine','ten']
print(testlist)
# append() adds the new item at the end of the list
testlist.append('eleven')
print(testlist)
# insert(0, ...) places the new item at the front, shifting the others right
testlist.insert(0,'Zero')
print(testlist)


## **STOP** - Resume NLP lesson

In [None]:
# Necessary packages
import requests
from bs4 import BeautifulSoup
import pickle

# User function to scrape transcript data from scrapsfromtheloft.com
def url_to_transcript(url):
    '''Return the transcript paragraphs found on the page at `url`.

    Downloads the page, locates the element with class "post-content" and
    returns the text of every <p> inside it, as a list of strings. The URL is
    printed once the page has been processed, as a progress indicator.

    Raises:
        requests.exceptions.RequestException: the download failed, timed out,
            or the server answered with an HTTP error status.
        ValueError: the page has no element with class "post-content".
    '''
    response = requests.get(url, timeout=30) # Do not hang forever on an unresponsive server
    response.raise_for_status() # Do not parse an error page as if it were a transcript
    soup = BeautifulSoup(response.text, "lxml")

    container = soup.find(class_="post-content")
    if container is None:
        # find() returns None when nothing matches; fail with a message that names the page
        raise ValueError("No element with class 'post-content' found at " + url)

    text = [p.text for p in container.find_all('p')]
    print(url)
    return text

In [None]:
# URLs of transcripts in scope
# `urls` and `comedians` are parallel lists: later cells build `transcripts`
# from `urls` in order and then pair `transcripts[i]` with `comedians[i]`,
# so both lists must keep the same length and ordering.
urls = ['http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/',
        'http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/',
        'http://scrapsfromtheloft.com/2018/03/15/ricky-gervais-humanity-transcript/',
        'http://scrapsfromtheloft.com/2017/08/07/bo-burnham-2013-full-transcript/',
        'http://scrapsfromtheloft.com/2017/05/24/bill-burr-im-sorry-feel-way-2014-full-transcript/',
        'http://scrapsfromtheloft.com/2017/04/21/jim-jefferies-bare-2014-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/02/john-mulaney-comeback-kid-2015-full-transcript/',
        'http://scrapsfromtheloft.com/2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/',
        'http://scrapsfromtheloft.com/2017/09/19/ali-wong-baby-cobra-2016-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/03/anthony-jeselnik-thoughts-prayers-2015-full-transcript/',
        'http://scrapsfromtheloft.com/2018/03/03/mike-birbiglia-my-girlfriends-boyfriend-2013-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/19/joe-rogan-triggered-2016-full-transcript/']

# Comedian names (first names, lower case) - used as file names and dictionary keys in later cells
comedians = ['louis', 'dave', 'ricky', 'bo', 'bill', 'jim', 'john', 'hasan', 'ali', 'anthony', 'mike', 'joe']

In [None]:
# Actually perform scrape of contents of scrapsfromtheloft.com

# One transcript (a list of paragraph strings) per URL, in the same order as `urls`.
transcripts = list(map(url_to_transcript, urls))

In [None]:
# Pickle files for later use - alternative to `csv_writer()`?

## Make a new directory to hold the text files
# Create the output folder with plain Python instead of the IPython-only
# `!mkdir` shell escape; `exist_ok=True` makes re-running the cell harmless.
os.makedirs("transcripts", exist_ok=True)

# Write one pickle per comedian. The files carry a `.txt` extension but hold
# binary pickle data, hence the "wb" mode.
for i, c in enumerate(comedians):
    with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file) ### Indexing into the `transcripts` array/list

In [None]:
# Load pickled files - equivalent to `read_csv`?
# Rebuild the comedian -> transcript mapping from the pickled files.
# NOTE: only unpickle files you created yourself; pickle can run arbitrary
# code when loading untrusted data.
data = dict()
for name in comedians:
    pickle_path = "transcripts/" + name + ".txt"
    with open(pickle_path, "rb") as handle:
        data[name] = pickle.load(handle)

In [None]:
# Double check to make sure data has been loaded properly
data.keys()

In [None]:
# More checks
data['louis'][:2]

## Cleaning The Data

When dealing with numerical data, data cleaning often involves removing null values and duplicate data, dealing with outliers, etc. With text data, there are some common data cleaning techniques, which are also known as text pre-processing techniques.

With text data, this cleaning process can go on forever. There's always an exception to every cleaning step. So, we're going to follow the MVP (minimum viable product) approach - start simple and iterate. Here are a bunch of things you can do to clean your data. We're going to execute just the common cleaning steps here and the rest can be done at a later point to improve our results.

**Common data cleaning steps on all text:**
* Make text all lower case
* Remove punctuation
* Remove numerical values
* Remove common nonsensical text (e.g., the newline character `\n`)
* Tokenize text
* Remove stop words

**More data cleaning steps after tokenization:**
* Stemming / lemmatization
* Parts of speech tagging
* Create bi-grams or tri-grams
* Deal with typos
* And more...

In [None]:
# Let's take a look at our data again
next(iter(data.keys()))

In [None]:
# Notice that our dictionary is currently in key: comedian, value: list of text format
next(iter(data.values()))

In [None]:
# We are going to change this to key: comedian, value: string format
def combine_text(list_of_text):
    '''Join the given pieces of text into one string, separated by single spaces.'''
    separator = ' '
    return separator.join(list_of_text)

In [None]:
# Combine it!
# Each value becomes a one-element list holding the comedian's whole transcript as a single string.
data_combined = dict(
    (comedian, [combine_text(paragraphs)])
    for comedian, paragraphs in data.items()
)

In [None]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
# Show up to 150 characters of each cell when a frame is displayed.
pd.set_option('max_colwidth',150)

# from_dict() yields one column per comedian; transposing turns that into one
# row per comedian with a single column, which is then named and sorted by
# comedian name.
transcript_frame = pd.DataFrame.from_dict(data_combined).T
transcript_frame.columns = ['transcript']
data_df = transcript_frame.sort_index()
data_df

In [None]:
# Let's take a look at the transcript for Ali Wong
data_df.transcript.loc['ali']

In [None]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Takes one string and returns the cleaned string. Only the characters in
    `string.punctuation` count as punctuation. Removed pieces are deleted
    without collapsing the whitespace around them, so runs of spaces can remain.
    '''
    text = text.lower()
    # Raw strings keep `\[`, `\w` and `\d` as regex escapes rather than
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text) # Shortest possible [...] spans, e.g. stage directions
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text) # Any word that contains at least one digit
    return text

def round1(x):
    '''Apply `clean_text_round1` to the single string `x` and return the result.'''
    return clean_text_round1(x)
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Takes one string and returns the cleaned string. Only the characters in
    `string.punctuation` count as punctuation. Removed pieces are deleted
    without collapsing the whitespace around them, so runs of spaces can remain.
    '''
    text = text.lower()
    # Raw strings keep `\[`, `\w` and `\d` as regex escapes rather than
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text) # Shortest possible [...] spans, e.g. stage directions
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text) # Any word that contains at least one digit
    return text

def round1(x):
    '''Apply `clean_text_round1` to the single string `x` and return the result.'''
    return clean_text_round1(x)