# VIK EXPLORATION: Search Engine Results

### NOTES & TIPS

- Remember to set Python kernel to 3 (not later).
- Import packages `os`, `requests`, `BeautifulSoup`, `csv`, `datetime`, `textblob`, `wordcloud`, and `gensim`.
- `get` works in conjunction with `requests`.
- `BeautifulSoup` must have a particular HTML element from a webpage to work on.
  E.g., `class="post-content"` identifies the container element, and the *p* is from `<p style=...>`.
  + Each website might have its own HTML structure; so might need different `soup.find` argument for each site being scraped.  
    * E.g., `class_="css-53u6y8"` works for a NYTimes.com article, along with  *p* which is standard in HTML to represent a paragraph of text.
    * E.g., `class_="repo-list"` works for GitHub search results.
    * E.g., `li class_="b_algo"` with *a* works for Bing search result
  + Can be tricky depending on webpage HTML structure; requires careful inspection even for different pages of same website, which might be slightly different.
    * What you see online is **not** an accurate representation of what `BeautifulSoup` sees; you must save the parsed HTML document to file and study that version.


### Import packages for scraping webpage contents and making sense of them

In [1]:
import os
import requests
from bs4 import BeautifulSoup
import html5lib
import csv 
from datetime import datetime


### Set search query values from lists - this needs to be formalized

In [2]:
search_query_job = ['registered nurse','doctor','phone service rep'] # To be expanded from some source of top job titles
search_query_location = ['new york city','dallas','topeka'] # To be expanded from some source of top metros

### Instead, reference GSheet for each of the above items

### Set query URLs, conduct search, parse necessary information, and save to file

In [3]:
def _find_text(parent, tag, css_class=None, strip=False):
    """Return the text of the first matching descendant of *parent*, or "" if absent.

    parent    -- parsed element to search within
    tag       -- HTML tag name to look for
    css_class -- exact value of the `class` attribute to match; None matches on tag alone
    strip     -- when True, surrounding whitespace is removed from the text

    Returning "" (instead of raising, or leaving an earlier value in place) guarantees
    that a field missing from one result can never be filled with the previous
    result's value when the row is written.
    """
    if css_class is None:
        node = parent.find(tag)
    else:
        node = parent.find(tag, attrs={'class': css_class})
    if node is None:
        return ""
    return node.text.strip() if strip else node.text


# Keep output file open at the beginning of all loops.
# newline='' lets the csv module control line endings itself, as its documentation requires.
with open('/Volumes/GoogleDrive/My Drive/Market Insights Library/00_Sources (needs organization)/Labor Market/Supply-Demand Dynamics/Job_Search_Results_Combined.csv', 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    for qj in search_query_job: # For each query's job keyword...
        for ql in search_query_location: # ...and location keyword...
            search_query = qj+" "+ql # ...form combined search query phrase

            # For Google:
            query_url_google = 'https://www.google.com/search?q=' + str(qj) + " jobs " + str(ql) # ...which populates query URL for search engine
            query_results_google = requests.get(query_url_google).text # Actually perform the search, resulting in unparsed HTML page text...
            query_results_parsed_google = BeautifulSoup(query_results_google, 'lxml') # ...which is parsed by BeautifulSoup for structured searching

            for result_google in query_results_parsed_google.find_all('div', attrs={'class': 'ZINbbc xpd O9g5cc uUPGi'}): # Iterate through each result...
                result_google_title = _find_text(result_google, 'div', 'BNeawe vvjwJb AP7Wnd') # ...looking for listing title...
                result_google_source = _find_text(result_google, 'div', 'BNeawe UPmit AP7Wnd') # ...listing source...
                result_google_stub = _find_text(result_google, 'div', 'BNeawe s3v9rd AP7Wnd') # ...and miscellaneous information below listing

                # Save this single result's values to file, before moving to the next result
                csv_writer.writerow([datetime.today(),"Google",search_query,result_google_title,result_google_source,result_google_stub])

            # For Indeed:
            query_url_indeed = 'https://www.indeed.com/jobs?q=' + str(qj) + " jobs " + str(ql)
            query_results_indeed = requests.get(query_url_indeed).text
            query_results_parsed_indeed = BeautifulSoup(query_results_indeed, 'lxml')

            for result_indeed in query_results_parsed_indeed.find_all('div', attrs={'class': 'jobsearch-SerpJobCard unifiedRow row result'}):
                result_indeed_title = _find_text(result_indeed, 'h2', 'title', strip=True)
                result_indeed_employer = _find_text(result_indeed, 'span', 'company', strip=True)
                result_indeed_location = _find_text(result_indeed, 'span', 'location accessible-contrast-color-location', strip=True)
                result_indeed_summary = _find_text(result_indeed, 'li', strip=True) # First list item in the result card
                result_indeed_postingdate = _find_text(result_indeed, 'span', 'date', strip=True)

                # The two empty columns keep Indeed rows aligned with the Google source/stub columns
                csv_writer.writerow([datetime.today(),"Indeed",search_query,result_indeed_title,"","",result_indeed_employer,result_indeed_location,result_indeed_summary,result_indeed_postingdate])


### Reference template to extract result elements for first result check

### Test for Indeed

def _first_text(card, tag, css_class=None):
    """Return the stripped text of the first matching element inside *card*, or "".

    "" is returned when *card* itself is None (no result found on the page) or when
    the requested element is absent, so the printout below never fails on an
    unbound name and never shows a value belonging to a different result.
    """
    if card is None:
        return ""
    if css_class is None:
        node = card.find(tag)
    else:
        node = card.find(tag, attrs={'class': css_class})
    return node.text.strip() if node is not None else ""


query_url_indeed = 'https://www.indeed.com/jobs?q=nurse phlebotomist washington dc'
query_results_indeed = requests.get(query_url_indeed).text
query_results_parsed_indeed = BeautifulSoup(query_results_indeed, 'lxml')

# `find` returns only the FIRST result card (or None). Looping over `find_all`
# without a break would leave the LAST card's values in these variables.
result_indeed = query_results_parsed_indeed.find('div', attrs={'class': 'jobsearch-SerpJobCard unifiedRow row result'})

result_indeed_title = _first_text(result_indeed, 'h2', 'title')
result_indeed_employer = _first_text(result_indeed, 'span', 'company')
result_indeed_location = _first_text(result_indeed, 'span', 'location accessible-contrast-color-location')
result_indeed_summary = _first_text(result_indeed, 'li') # First list item in the result card
result_indeed_postingdate = _first_text(result_indeed, 'span', 'date')


### Print first result's extracted elements as check
print("First Indeed result as check:\n")
print("\tResult title: " + result_indeed_title)
print("\tEmployer: " + result_indeed_employer)
print("\tLocation: " + result_indeed_location)
print("\tSummary: " + result_indeed_summary)
print("\tPosting Date: " + result_indeed_postingdate)
print("\n")


### Reference template to extract result elements and save to ongoing CSV file by appending

import csv # CSV module required for writing to CSV file
from datetime import datetime

#### Note `'a'` argument for CSV file append
#### Since using csv.writer in append mode, no need for header row; but otherwise for new files must use:
#### csv_writer.writerow(['Search_Date_Time','Search_Engine','Search_Query','Result_Title','Result_Source','Result_Stub'])

# Append every Google result on the already-parsed page to the combined CSV.
# newline='' lets the csv module control line endings itself, as its documentation requires.
with open('/Users/vix/Repos/Python-Learning/src/NLP/Labor Market/Search_Results_Combined.csv', 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    for result_google in query_results_parsed_google.find_all('div', attrs={'class': 'ZINbbc xpd O9g5cc uUPGi'}):
        # `find` returns None when an element is missing; write "" for that field
        # instead of letting one incomplete result abort the whole export.
        google_fields = []
        for css_class in ('BNeawe vvjwJb AP7Wnd',   # listing title
                          'BNeawe UPmit AP7Wnd',    # listing source
                          'BNeawe s3v9rd AP7Wnd'):  # stub text below the listing
            node = result_google.find('div', attrs={'class': css_class})
            google_fields.append(node.text if node is not None else "")
        result_google_title, result_google_source, result_google_stub = google_fields

        csv_writer.writerow([datetime.today(),"Google",search_query,result_google_title,result_google_source,result_google_stub])



### Spot-checks and future expansion

# Spot-check extra fields on the current `result_google` element.
# All three statements sit at the same indentation level; the stray indent on the
# last two lines made this cell fail with an IndentationError.
# Each lookup raises AttributeError if the element is not present in the result.
result_google_employer = result_google.find_next_sibling('div', attrs = {'class':'vNEEBe'}).text
result_google_location = result_google.find('div', attrs = {'class':'Qk80Jf'}).text
result_google_postingdate = result_google.find('span', attrs = {'class':'SuWscb'}).text
    
# Run the current search query against Bing and parse the response
query_url_bing = 'https://www.bing.com/search?q=' + str(search_query)
query_results_bing = requests.get(query_url_bing).text
query_results_parsed_bing = BeautifulSoup(query_results_bing, 'html.parser')

# Pull the job fields out of each `ul.b_hList` element.
# The loop variable is `result_bing`; the body previously referenced the undefined
# name `results_bing`, which raised NameError on the first iteration.
for result_bing in query_results_parsed_bing.find_all('ul', attrs = {'class':'b_hList'}):
    result_bing_title = result_bing.find('div', attrs = {'class':'jb_title'}).text
    result_bing_employer = result_bing.find('div', attrs = {'class':'jb_company'}).text
    result_bing_location = result_bing.find('div', attrs = {'class':'jb_loc_jobType'}).text
    result_bing_salary = result_bing.find('div', attrs = {'class':'jb_salary'}).text

#### `BeautifulSoup` metadata
# Inspect the first <div> of the parsed Bing page.
# NOTE(review): the original referenced `query_html_bing`, which is never defined;
# `query_results_parsed_bing` is the parsed Bing page and is assumed to be the
# intended object - confirm.
tag = query_results_parsed_bing.div
type(tag)
print("\n" + str(tag) + "\n")
print(query_results_parsed_bing.div.get_attribute_list('class'))

#### Save parsed HTML page as file for review & testing
# Serialize the parsed Google page so the markup BeautifulSoup actually sees can be studied offline
parsed_html_text = str(query_results_parsed_google)
# `with` closes the file even if the write fails; the explicit encoding keeps
# non-ASCII page content from depending on the machine's locale default.
with open("/Users/vix/Repos/Python-Learning/src/NLP/Labor Market/parsed_html.html", 'w', encoding='utf-8') as parsed_html_file:
    parsed_html_file.write(parsed_html_text)

### Ask user for search query input - Not used

# Start from an empty terminal before greeting the user
os.system('clear')

# Greeting assembled from adjacent string literals, one screen line per literal
welcome_message = (
    "\n\nHello there!  \n\n\n"
    "This tool takes your search query, \n\n"
    "applies it to both major general-purpose search engines (Google & Bing), \n\n"
    "and to one major job search engine (Indeed), \n\n"
    "and then displays a simple comparison of the resulting search engine results.  \n\n"
    "Analysis is limited to the first 100 results from each search engine."
)
print(welcome_message)

# Read the phrase to search for from standard input
search_query = input("\n\n\nWhat should we search for? ")

# VIK EXPLORATION: Compare two text documents

import urllib
import os

# Work relative to the folder that holds both documents
os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Texts')

# Collect the lines that occur in exactly one of the two files (order is not preserved)
with open('base.txt', 'r') as file1, open('comparison.txt', 'r') as file2:
    base_lines = set(file1)
    difference = base_lines.symmetric_difference(file2)

# Each line still carries its own newline, so the output is double-spaced
for line in difference:
    print(line)


# VIK EXPLORATION: Retrieve contents of text **file** at any URL and save to disk

# VIK EXPLORATION: File operations

### Essentially copying a file's contents to another file - this works for text files...

# Stream the README into DESTFILE.txt line by line, overwriting any existing copy
with open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/README.txt','r') as sourcefile, \
        open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/DESTFILE.txt','w') as destfile:
    destfile.writelines(sourcefile)


### And for an image file...simply append `b` for *binary mode* to file operation command `r`, `w`, or `a`

# Same copy as for text, but both files are opened in binary mode so bytes pass through untouched
with open('/Users/vix/OneDrive/Temp/Portrait_Vikram_Before-After.png','rb') as sourceimage, \
        open('/Users/vix/Repos/Python-Learning/src/NLP/SNLI Stanford Corpus/destimage.png','wb') as destimage:
    destimage.writelines(sourceimage)



# VIK EXPLORATION: Get book text from Project Gutenberg, save to file, and populate list object

import os
import re
import urllib # Import `urllib` package - primarily using `request` module with `urlopen` method

# os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Texts')
gutenberg_texts = [] # Initialize list


def get_gutenberg_text():
    """Download one Project Gutenberg book, save it to disk, and add it to the list.

    The book number is read from the module-level loop variable ``counter``.
    The text is fetched once, decoded as UTF-8 (undecodable bytes are dropped),
    written to ``<counter>_<title>.txt`` and appended to ``gutenberg_texts`` as a
    one-element list holding the text with all line breaks removed.

    Returns:
        The module-level ``gutenberg_texts`` list - always, including when the
        book could not be retrieved or saved. Returning None on failure would
        make the caller's ``gutenberg_texts = get_gutenberg_text()`` destroy the
        list and break every later append.
    """
    import urllib.request  # `import urllib` alone does not guarantee the submodule is loaded

    url = "https://www.gutenberg.org/files/" + str(counter) + "/" + str(counter) + ".txt"

    # Only network/HTTP failures mean "no book at this number"; URLError and
    # HTTPError are both subclasses of OSError.
    try:
        with urllib.request.urlopen(url) as webpage:
            raw_book = webpage.read()
    except OSError:
        print("URL Not Valid\n")
        return gutenberg_texts

    book_text = raw_book.decode('utf-8', errors='ignore')

    # Extract book title (author TBD) for file name; fall back when no title line exists
    booktitle = "Untitled"
    for line in book_text.splitlines():
        if line.startswith('Title: '):
            print("Matched!")
            booktitle = line[len('Title: '):].strip() # strip() handles both LF and CRLF endings
            break
    # A path separator inside a title would otherwise point into a non-existent folder
    filename = str(counter) + '_' + booktitle.replace('/', '-') + '.txt'

    # Write book text to output file
    print("Currently retrieving: " + booktitle + " -- file name: " + filename)
    try:
        with open(str('/Users/vix/Repos/Python-Learning/src/NLP/Texts/' + filename), 'w', encoding='utf-8') as file:
            file.write(book_text)
    except OSError as err:
        print("Could not save " + filename + ": " + str(err) + "\n")
        return gutenberg_texts

    # Write book text to list, with every line break (CR and LF) removed
    text = [book_text.replace('\r', '').replace('\n', '')]
    gutenberg_texts.append(text)
    print("Added list item: " + str(len(gutenberg_texts)) + "\n") # Enumerate list count, which is number of books
    return gutenberg_texts


for counter in range(10,25003): # Loop over each book, which is a reference number
    gutenberg_texts = get_gutenberg_text() # Call function


# VIK EXPLORATION: Separate function to populate list object using existing files in given folder

import os
import codecs
import re

### Custom function to create `listdir` command that does not show hidden files
def listdir_nohidden(path):
    """Return the entries of *path* that match the glob pattern ``*``.

    Names beginning with a dot are not matched by that pattern, so hidden
    files are left out. Each returned entry is *path* joined with the name.
    """
    import glob
    visible_pattern = os.path.join(path, '*')
    return list(glob.iglob(visible_pattern))

os.chdir('/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/books') # Set working folder
print("Adding *all* files in " + os.getcwd() + "\n")
gutenberg_texts = [] # Initialize list of texts
gutenberg_titles = [] # Initialize list of book titles

def get_gutenberg_text():
    """Load every non-hidden file of the working folder into the book lists.

    Files are visited in sorted path order. For each regular file exactly one
    entry is appended to ``gutenberg_titles`` and one to ``gutenberg_texts``, so
    the same index in both lists always refers to the same book:

    * the title is taken from the first line starting with ``Title: ``; when a
      file has no such line, its file name without extension is used instead;
    * the text is stored as a one-element list holding the file contents with
      newline characters removed.

    Undecodable bytes are ignored while reading. Nothing is returned; the two
    module-level lists are extended in place.
    """
    for file in sorted(listdir_nohidden(".")):
        if not os.path.isfile(file): # Sub-folders cannot be read as books
            continue

        # Read each file once and reuse the contents for both title and text
        with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as f:
            contents = f.read()

        # Extract book title
        title = None
        for line in contents.splitlines():
            if line.startswith('Title: '):
                print("Matched!")
                title = line[len('Title: '):].strip() # strip() handles both LF and CRLF endings
                break
        if title is None:
            title = os.path.splitext(os.path.basename(file))[0]
        gutenberg_titles.append(title) # Put book titles in list

        # Write book text from file to list
        text = [contents.replace('\n','')]
        gutenberg_texts.append(text)
        print(str(len(gutenberg_texts)) + ": " + file) # Enumerate list count, which is running count of books

get_gutenberg_text() # Call function


### Useful code to display beginnings of each list item as preview
[book[0][:100] for book in gutenberg_texts]

### Similar code for items in dictionary form
{}

### Useful code to convert MS Word document to text file

import docx2txt
# Convert a Word document to plain text and save it.
# The file names are placeholders and must be string literals: unquoted,
# `filename.docx` is an attribute lookup on an undefined name and raises NameError.
converted_text = docx2txt.process("filename.docx")
with open("Filename.txt", 'w') as file:
    file.write(converted_text)


### Custom function to create `listdir` command that does not show hidden files

def listdir_nohidden(path):
    """List what is inside *path*, leaving out dot-prefixed (hidden) names.

    Relies on the glob pattern ``*``, which does not match names that start
    with a dot. Every returned entry is *path* joined with the matched name.
    """
    import glob
    everything_visible = os.path.join(path, '*')
    matches = glob.glob(everything_visible)
    return matches


In [None]:
gutenberg_titles.append("Q and the Magic of Grammar")

# **STOP** - Resume NLP lesson

# Necessary packages
import requests
from bs4 import BeautifulSoup
import pickle

# User function to scrape transcript data from scrapsfromtheloft.com
def url_to_transcript(url):
    '''Return the transcript at *url* as a list of paragraph texts.

    The page is fetched and parsed with lxml; the text of every <p> inside the
    first element whose class is "post-content" is collected, one list item per
    paragraph. The URL is printed as a progress indicator.

    Returns an empty list (after printing a notice) when the page has no
    "post-content" element, so one unexpected page does not abort a batch scrape.
    '''
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    container = soup.find(class_="post-content")
    print(url)
    if container is None: # `find` returns None when nothing matches
        print("No post-content element found at " + url)
        return []
    return [p.text for p in container.find_all('p')]

# URLs of transcripts in scope
urls = ['http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/',
        'http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/',
        'http://scrapsfromtheloft.com/2018/03/15/ricky-gervais-humanity-transcript/',
        'http://scrapsfromtheloft.com/2017/08/07/bo-burnham-2013-full-transcript/',
        'http://scrapsfromtheloft.com/2017/05/24/bill-burr-im-sorry-feel-way-2014-full-transcript/',
        'http://scrapsfromtheloft.com/2017/04/21/jim-jefferies-bare-2014-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/02/john-mulaney-comeback-kid-2015-full-transcript/',
        'http://scrapsfromtheloft.com/2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/',
        'http://scrapsfromtheloft.com/2017/09/19/ali-wong-baby-cobra-2016-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/03/anthony-jeselnik-thoughts-prayers-2015-full-transcript/',
        'http://scrapsfromtheloft.com/2018/03/03/mike-birbiglia-my-girlfriends-boyfriend-2013-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/19/joe-rogan-triggered-2016-full-transcript/']

# Comedian names
comedians = ['louis', 'dave', 'ricky', 'bo', 'bill', 'jim', 'john', 'hasan', 'ali', 'anthony', 'mike', 'joe']

# Actually perform scrape of contents of scrapsfromtheloft.com

transcripts = [url_to_transcript(u) for u in urls]

# Pickle files for later use - alternative to `csv_writer()`?

## Make a new directory to hold the text files
import os  # local import so this cell does not depend on an earlier cell having run

# Create the output folder from Python rather than the `!mkdir` notebook magic:
# this also runs as a plain script and does not fail when the folder already exists.
os.makedirs("transcripts", exist_ok=True)

# One pickle file per comedian; `comedians` and `transcripts` are parallel lists
for i, c in enumerate(comedians):
    with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file) ### Indexing into the `transcripts` array/list

### Load pickled files - modified code to simply open file; unsure about need for Pickle
### Create dictionary data container which can hold book text as well as title

# First attempt: map each book title to the full contents of "<title>.txt",
# opened relative to the current working directory.
books = {} # `{}` signifies a dictionary
for i, title in enumerate(gutenberg_titles):
    with open(title + ".txt", 'r') as f:
        books[title] = f.read()

# NOTE(review): this rebinds `books`, discarding the title -> text dictionary built
# above. From here on `books` has exactly two keys, 'title' and 'text', each holding
# a whole list, so `len(books)` prints 2 rather than the number of books.
books = {'title': gutenberg_titles, 'text': gutenberg_texts}
print(len(books))
for key, value in books.items():
    print(key, value)

# Double check to make sure data has been loaded properly
books.keys()

# More checks
# NOTE(review): with `books` rebound above, a book title is no longer a key, so this
# lookup raises KeyError; it only fits the title -> text dictionary from the first
# attempt. Decide which of the two structures is intended.
books['The King James Bible'][:2]

## Cleaning The Data

When dealing with numerical data, data cleaning often involves removing null values and duplicate data, dealing with outliers, etc. With text data, there are some common data cleaning techniques, which are also known as text pre-processing techniques.

With text data, this cleaning process can go on forever. There's always an exception to every cleaning step. So, we're going to follow the MVP (minimum viable product) approach - start simple and iterate. Here are a bunch of things you can do to clean your data. We're going to execute just the common cleaning steps here and the rest can be done at a later point to improve our results.

**Common data cleaning steps on all text:**
* Make text all lower case
* Remove punctuation
* Remove numerical values
* Remove common nonsensical text (e.g., `\n` newline characters)
* Tokenize text
* Remove stop words

**More data cleaning steps after tokenization:**
* Stemming / lemmatization
* Parts of speech tagging
* Create bi-grams or tri-grams
* Deal with typos
* And more...

# Let's take a look at our data again
next(iter(data.keys()))

# Notice that our dictionary is currently in key: comedian, value: list of text format
next(iter(data.values()))

# We are going to change this to key: comedian, value: string format
def combine_text(list_of_text):
    '''Join the strings in list_of_text into one string, separated by single spaces.'''
    separator = ' '
    return separator.join(list_of_text)

# Combine it!
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}

# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
pd.set_option('max_colwidth',150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
data_df = data_df.sort_index()
data_df

# Let's take a look at the transcript for Ali Wong
data_df.transcript.loc['ali']

# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Takes a string and returns the cleaned string. Whitespace is left untouched,
    so removed spans can leave consecutive spaces behind.
    '''
    text = text.lower()
    # Raw strings keep the regex escapes (\[ \w \d) from being read as invalid
    # string escape sequences, which newer Python versions warn about.
    text = re.sub(r'\[.*?\]', '', text)  # shortest possible [...] spans
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # every ASCII punctuation character
    text = re.sub(r'\w*\d\w*', '', text)  # any word containing at least one digit
    return text


def round1(x):
    '''Apply clean_text_round1 to x; a named one-argument callable for use with apply/map.'''
    return clean_text_round1(x)

# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Takes a string and returns the cleaned string. Whitespace is left untouched,
    so removed spans can leave consecutive spaces behind.
    '''
    text = text.lower()
    # Raw strings keep the regex escapes (\[ \w \d) from being read as invalid
    # string escape sequences, which newer Python versions warn about.
    text = re.sub(r'\[.*?\]', '', text)  # shortest possible [...] spans
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # every ASCII punctuation character
    text = re.sub(r'\w*\d\w*', '', text)  # any word containing at least one digit
    return text


def round1(x):
    '''Apply clean_text_round1 to x; a named one-argument callable for use with apply/map.'''
    return clean_text_round1(x)