# VIK Data Cleaning

- Remember to set Python kernel to 3 (not later).
- Install additional packages `textblob`, `wordcloud`, and `gensim`.

In [None]:
## Import packages for scraping webpage contents and making sense of them

import os
import requests
from bs4 import BeautifulSoup

## Create variables and lists to serve as argument placeholders for scraping

- ### `get` works in conjunction with `requests`.
- ### `BeautifulSoup` must have a particular HTML element from a webpage to work on.
  + #### In this case, that element is the one with `class="post-content"`, and the *p* comes from the `<p style=...>` tags inside it.
  + #### Each website may have its own HTML structure, so each site being scraped might need a different `soup.find` argument.
    * #### E.g., `class_="css-53u6y8"` works for a NYTimes.com article, along with  *p* which is standard in HTML to represent a paragraph of text.
    * #### E.g., `class_="repo-list"` works for GitHub search results.
    * #### E.g., `li` elements with `class_="b_algo"`, together with *a*, work for Bing search results.


In [25]:
#### Greet the user and collect the search query

# Wipe the terminal before showing the banner.
os.system('clear')

# The banner is written as adjacent string literals, which Python joins into
# one string at compile time.
print(
    "\n\nHello there!  \n\n\n"
    "This tool takes your search query, \n\n"
    "applies it to both major search engines, \n\n"
    "and then displays a simple comparison of the resulting search engine results.  \n\n"
    "Analysis is limited to the first 100 results from each search engine."
)

# Whatever the user types here is the query sent to both search engines.
search_query = input("\n\n\nWhat should we search for? ")



Hello there!  


This tool takes your search query, 

applies it to both major search engines, 

and then displays a simple comparison of the resulting search engine results.  

Analysis is limited to the first 100 results from each search engine.


In [26]:
#### Download the result pages for the query from both search engines
# The query is handed to `requests` via `params` so that it gets URL-encoded;
# gluing it into the URL by hand breaks on characters such as `&`, `#` or `+`.
# A timeout keeps the kernel from hanging forever on an unresponsive server.
# Despite their names, the `query_url_*` variables hold the response HTML text.
# NOTE(review): the greeting promises the first 100 results, while both
# requests ask for 1000 -- confirm which limit is intended.
query_url_bing = requests.get(
    'https://www.bing.com/search',
    params={'q': search_query, 'count': 1000},
    timeout=30,
).text
query_url_google = requests.get(
    'https://www.google.com/search',
    params={'q': search_query, 'num': 1000},
    timeout=30,
).text

#### Parse the downloaded HTML with `BeautifulSoup`
query_html_bing = BeautifulSoup(query_url_bing, 'lxml')
query_html_google = BeautifulSoup(query_url_google, "html.parser")

#### Display well-formatted HTML results
##### print (source_html.prettify())

#### Extract one result title
##### result_title = source_html.find('div', class_='f4 text-normal').text
#### Extract one result description
##### result_desc = source_html.find('p', class_='mb-1').text

#### Display one (ie, first) result and title
##### print (result_title)
##### print (result_desc)


In [27]:
#### Prepare output file

import csv # CSV module
from datetime import datetime

##### Check existing records in file, to which we will append

# with open('/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/Search_Results_Combined.csv','r') as csv_file:
#     csv_reader = csv.reader(csv_file)
#     for line in csv_reader:
#         print(line)


In [28]:
#### Extract each search result's title and append to CSV file

# `newline=''` is what the csv module expects of any file handed to csv.writer
# (otherwise extra blank rows can appear where line endings are translated).
# The explicit encoding keeps non-ASCII titles writable regardless of locale.
with open('/Users/vix/Repos/Python-Learning/src/NLP/Intro to NLP Alice Zhao/Search_Results_Combined.csv','a', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    # The file is opened in append mode, so no header row is written.
    # Row layout: timestamp, search engine, search query, result title.

    ##### Specifications for Bing
    for result in query_html_bing.find_all('li', attrs = {'class':'b_algo'}):
        title_link = result.find('a')
        if title_link is None:
            # A result block without a link has no title to record.
            continue
        result_title_bing = title_link.text
        csv_writer.writerow([datetime.today(),"Bing",search_query,result_title_bing])

    ##### Specifications for Google
    for result in query_html_google.find_all('h3', attrs = {'class':'zBAuLc'}):
        title_div = result.find('div')
        if title_div is None:
            # A heading without the inner <div> has no title to record.
            continue
        result_title_google = title_div.text
        csv_writer.writerow([datetime.today(),"Google",search_query,result_title_google])

# Leaving the `with` block closes the file; no explicit close() is needed.



In [29]:
#### STOP

In [None]:
def url_to_transcript(url):
    '''Download a page and return the text of its <title> as a one-item list.

    Parameters:
        url: address of the page to download.

    Returns:
        A list holding the title text, or an empty list when the page has no
        <title> element.

    Side effects:
        Prints the URL once the page has been processed.
    '''
    # A timeout keeps a stalled server from blocking the notebook indefinitely.
    page = requests.get(url, timeout=30).text
    soup = BeautifulSoup(page, "lxml")
    # Earlier selector experiments, kept for reference:
    # text = [p.text for p in soup.find(class_="post-content").find_all('p')]
    # text = [p.text for p in soup.find(class_="mb-1").find_all('my')]
    # Iterating over `soup.title.string` walks the title character by
    # character, and plain characters have no `.text`, so take the title
    # element's text as a whole instead.
    title_tag = soup.title
    text = [title_tag.text] if title_tag is not None else []
    print(url)
    return text

In [None]:
### Pages to scrape, one list entry per URL

# Previously used:
#   'http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/'
urls = [
    'https://github.com/search?q=python',
]

In [None]:
### Scrape every address in `urls`, in order, and show what came back

transcripts = list(map(url_to_transcript, urls))
print(transcripts)

In [None]:
### Pickling: Saving results of some operation for future use, to be referenced by other Python programs.

import pickle