## Notebook Shortcuts

* `Esc` or clicking the left blue bar enters the **command mode**
* `Shift+Enter` to run cells
* `A` adds a cell above
* `B` adds a cell below
* `D`, `D` (press twice) deletes cells

* `M` converts to markdown cells
* `Y` converts to code cells

* `X` cuts cells
* `C` copies cells
* `V` pastes cells
* `Z` undoes the last cell operation

Find out more on https://towardsdatascience.com/jypyter-notebook-shortcuts-bf0101a98330.

## Manage Working Directory

In [None]:
import os

# ask the operating system which directory the notebook is running in
path = os.getcwd()
print(path)

In [None]:
# create a new folder for our course
new_path = "./soc591/"
# exist_ok=True lets this cell be re-run without raising FileExistsError
# when the folder was already created by an earlier run
os.makedirs(new_path, exist_ok=True)

In [None]:
# make the soc591 folder the working directory, then confirm the switch
os.chdir(new_path)
current_wd = os.getcwd()
print("Current wd: ", current_wd)

In [None]:
# let us create a new file in the current WD and write some text into it
# the with-statement closes the file even if one of the writes raises,
# and the explicit encoding keeps the result identical across platforms
col_vars = "id;text\n"
with open("soc591.txt", mode="w+", encoding="utf-8") as f:
    f.write(col_vars)
    f.write("1;This is a demo for writing some texts\n")

In [None]:
# Let us read the soc591.txt file and assign it to variable text_df
# the with-statement closes the file handle as soon as the content is read
with open("soc591.txt", "r") as infile:
    text_df = infile.read()
print(text_df)

In [None]:
# list file content
# "." is the current working directory; os.listdir returns the names of its entries
os.listdir(".")

In [None]:
# Let us remove the soc591.txt file
# os.remove raises FileNotFoundError if the file does not exist (e.g. when this cell is run twice)
os.remove("soc591.txt")

## Scraping Static Webpages

In [None]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import urllib.parse
import urllib.request

The example below partly comes from [Dr. Yongren SHI's](https://clas.uiowa.edu/sociology/people/yongren-shi) tutorial.

### Web Scraping

In [None]:
# page to download
url = "https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc"

# send a browser-like User-Agent header along with the request
userHeader = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/600.7.12 (KHTML, like Gecko) Version/8.0.7 Safari/600.7.12",
}
req = urllib.request.Request(url, headers=userHeader)

In [None]:
# You may need a VPN for web scraping
# open url and read web page; the with-block closes the HTTP connection
# as soon as the body has been read
with urllib.request.urlopen(req) as response:
    the_page = response.read()

In [None]:

# parse the downloaded HTML with the "html.parser" backend and show the result
soup = bs(the_page, features="html.parser")
print(soup)

In [None]:
# inspect the class of the parsed document object
type(soup)

### Extract Elements from Soup

In [None]:
# collect every <div> whose class attribute is "lister-item mode-advanced"
movie_containers = soup.find_all('div', attrs={'class': 'lister-item mode-advanced'})
print(movie_containers)

In [None]:
# how many matching <div> containers were found on this page
len(movie_containers)

In [None]:
# Extract the First Movie
# index 0 is the first matching container; it serves as the worked example in the next cells
first_movie = movie_containers[0]
print(first_movie)

In [None]:
# movie's title: the text of the link inside the container's <h3> heading
title_link = first_movie.h3.a
movie_name = title_link.text
movie_name

### For Loop and List Comprehension

In [None]:
# walk through a short list and print each element incremented by one
numbers = [1, 2, 3]
for i in numbers:
    print(i + 1)

In [None]:
# the same computation as the for loop, written as a list comprehension that builds [2, 3, 4]
[i+1 for i in [1, 2, 3]]

### All Movies in the First Page

In [None]:
# text of the <span> inside the <h3> heading that is looked up for the release year
year_span = first_movie.h3.find('span', {'class': 'lister-item-year text-muted unbold'})
movie_year = year_span.text

In [None]:
# the year the movie was released
import re
movie_year = first_movie.h3.find('span',{'class':'lister-item-year text-muted unbold'}).text
# raw string: "\d" inside a normal string literal is an invalid escape sequence
# (DeprecationWarning, and a SyntaxWarning since Python 3.12)
# keep the first 4-digit run found in the span text and convert it to int
[int(year) for year in re.findall(r"\d{4}", movie_year)][0]

In [None]:
# title of every movie container on the first page, in page order
movie_names = [container.h3.a.text for container in movie_containers]
movie_names

In [None]:
import re
# text of the year <span> for every movie container on the first page
movie_years = [movie.h3.find('span',{'class':'lister-item-year text-muted unbold'}).text for movie in movie_containers]
# raw string: "\d" inside a normal string literal is an invalid escape sequence
# (DeprecationWarning, and a SyntaxWarning since Python 3.12);
# only the first 4-digit match is needed, so only that one is converted to int
movie_years = [int(re.findall(r"\d{4}", movie_year)[0]) for movie_year in movie_years]
movie_years

In [None]:
# one row per movie, with its title and release year as columns
columns = {"name": movie_names, "year": movie_years}
movie_DF = pd.DataFrame(columns)
movie_DF

### All Top 250 Movies

In [None]:
# rank at which each of the five result pages starts: 1, 51, 101, 151, 201
start_numbers = [1 + 50 * page for page in range(5)]
start_numbers

In [None]:
# Use ''.join([]) to join strings
# url1 ends with "start=", so the start number is inserted between url1 and url2;
# str(1) is needed because join only accepts strings
url1="https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start="
url2="&ref_=adv_nxt"
''.join([url1, str(1), url2])

In [None]:
# build one URL per start number by placing it between the two fixed URL parts
urls = [f"{url1}{start}{url2}" for start in start_numbers]
urls

In [None]:
def scrape_movie(url):
    """Download ``url`` and return its movie container elements.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    The result of ``soup.find_all`` for every
    ``<div class="lister-item mode-advanced">`` in the parsed page
    (empty when the page contains no such element).

    Raises
    ------
    urllib.error.URLError
        If the page cannot be opened.
    """
    # send a browser-like User-Agent header along with the request
    userHeader = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/600.7.12 (KHTML, like Gecko) Version/8.0.7 Safari/600.7.12"}
    req = urllib.request.Request(url, headers=userHeader)
    # the with-block closes the HTTP connection once the body has been read,
    # so calling this function for many pages does not leak open sockets
    with urllib.request.urlopen(req) as response:
        the_page = response.read()
    soup = bs(the_page, "html.parser")
    return soup.find_all('div', {'class': 'lister-item mode-advanced'})

# scrape every URL in turn; each element is the list of containers returned for one page
movie_containers_all = [scrape_movie(url) for url in urls]

In [None]:
# number of pages scraped (one entry per URL)
len(movie_containers_all)

In [None]:
# number of movie containers found on the first scraped page
len(movie_containers_all[0])

In [None]:
# Nested list comprehension: the outer loop visits each page's containers,
# the inner loop visits each movie on that page -> one flat list of titles
movie_names = [
    movie.h3.a.text
    for page_containers in movie_containers_all
    for movie in page_containers
]
movie_names

In [None]:
# total number of titles collected across all scraped pages
len(movie_names)

In [None]:
# Nested list comprehension: [f(i) for j in k for i in j]
# text of the year <span> for every movie on every scraped page
movie_years = [
    movie.h3.find('span',{'class':'lister-item-year text-muted unbold'}).text
    for movie_containers in movie_containers_all
    for movie in movie_containers
]
# raw string: "\d" inside a normal string literal is an invalid escape sequence
# (DeprecationWarning, and a SyntaxWarning since Python 3.12);
# only the first 4-digit match is needed, so only that one is converted to int
movie_years = [int(re.findall(r"\d{4}", movie_year)[0]) for movie_year in movie_years]
movie_years

In [None]:
# one row per movie from all scraped pages, with title and release year as columns
columns = {"name": movie_names, "year": movie_years}
movie_DF = pd.DataFrame(columns)
movie_DF

## Dynamic Website

In [None]:
# You may not need to run this if webdriver_manager is already installed.
!pip install webdriver_manager

In [None]:
# Import modules for use
import os
import selenium
from selenium import webdriver
import time
import requests
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import ElementClickInterceptedException
from bs4 import BeautifulSoup as bs

In [None]:
# Install Driver
# Selenium 4 receives the driver binary through a Service object; passing the
# path as the first positional argument of webdriver.Chrome is deprecated there
# and fails on current 4.x releases
from selenium.webdriver.chrome.service import Service
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Open the url and establish a connection
url = "https://elephrame.com/textbook/BLM/chart"
# wait up to 5 seconds when looking up elements that are not there yet
driver.implicitly_wait(5)
driver.maximize_window()
driver.get(url)

In [None]:
# NOTE(review): `soup` is not reassigned in this section until the "First Page" cell,
# so this displays whatever an earlier cell parsed — confirm this is intended
soup

### First Page

In [None]:
# Scroll down to the bottom of the page
# (alternative: scroll by a fixed step with "window.scrollTo(0,window.scrollY+300)")
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")

# Read and parse the first page
first_page = driver.page_source
soup = bs(first_page,"html.parser")

# Use the browser's developer tools (Inspect) to check the page source
# and locate the key info we need:
# the elements collected here are the <div class="item chart"> blocks
items = soup.find_all("div",{"class":"item chart"})
print(items)

In [None]:
# imported here so this cell also works when the earlier cells that import re were not run
import re


def _first_id(item):
    """Return the first id="..." value starting with a digit in the item's HTML, or ''."""
    matches = re.findall(r'id="([0-9].*?)"', str(item))
    return matches[0] if matches else ''


def _field_text(item, tag, css_class, sep=' '):
    """Return the text of the first matching child with whitespace runs replaced by ``sep``.

    Returns '' when the item has no such child, so every column keeps one
    entry per item and the rows stay aligned.
    """
    node = item.find(tag, {"class": css_class})
    if node is None:
        return ''
    # split() drops all whitespace runs; joining with a single space keeps the
    # words apart (joining with '' would glue them together)
    return sep.join(node.text.split())


protest_id = [_first_id(item) for item in items]
protest_location = [_field_text(item, "div", "item-protest-location") for item in items]
protest_start = [_field_text(item, "div", "protest-start") for item in items]
protest_end = [_field_text(item, "div", "protest-end") for item in items]
protest_subject = [_field_text(item, "li", "item-protest-subject") for item in items]
protest_participants = [_field_text(item, "li", "item-protest-participants") for item in items]
protest_time = [_field_text(item, "li", "item-protest-time") for item in items]
protest_description = [_field_text(item, "li", "item-protest-description") for item in items]
# several urls may appear in one entry; they are separated with '##'
protest_urls = [_field_text(item, "li", "item-protest-url", sep='##') for item in items]

In [None]:
# check the current directory: files opened with a relative path are created here
os.getcwd()

In [None]:
# write to a csv file
import csv
from itertools import zip_longest

header = ("protest_id", "protest_location", "protest_start", "protest_end",
          "protest_subject", "protest_participants", "protest_time",
          "protest_description", "protest_urls")
data = [protest_id, protest_location, protest_start, protest_end,
        protest_subject, protest_participants, protest_time,
        protest_description, protest_urls]
# zip_longest turns the column lists into rows; shorter columns are padded with ''
export_data = zip_longest(*data, fillvalue='')
# utf-8 can encode every scraped character, whereas ISO-8859-1 raises
# UnicodeEncodeError on text outside Latin-1 (curly quotes, long dashes, ...);
# newline='' lets the csv module control the line endings itself
with open('blm-data.csv', 'w', encoding="utf-8", newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header)
    writer.writerows(export_data)

### Next Page

In [None]:
import time
from selenium.webdriver.common.by import By

# click the next page: the element is the 4th <li> inside <div class="pagination">
# more on how selenium locates elements:
# https://selenium-python.readthedocs.io/locating-elements.html
next_page_xpath = '//div[@class="pagination"]//li[4]'
next_page = driver.find_element(By.XPATH, next_page_xpath)
next_page.click()
# pause for 5 seconds before continuing
time.sleep(5)
# then we repeat the process to the end
# then we repeat the process to the end

In [None]:
# There are many pages, so we need a loop to automate the process
soup = bs(driver.page_source, "html.parser")
# locate the page id: the value attribute of <input class="page-choice">, converted to int
page_input = soup.find("input", {"class": "page-choice"})
page_id = int(page_input["value"])
print(page_id)

In [None]:
'''
# Please check the number of pages on your computer.
while page_id <=312:
    # do first page scraping 
    # click next page
    # repeat the scraping
    # if page_id>312, then stop
'''

## Coding Challenge

In [None]:
# page to download for the challenge
url = "https://www.tripadvisor.com/Attraction_Review-g187323-d617423-Reviews-The_Holocaust_Memorial_Memorial_to_the_Murdered_Jews_of_Europe-Berlin.html"

# send a browser-like User-Agent header along with the request
userHeader = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/600.7.12 (KHTML, like Gecko) Version/8.0.7 Safari/600.7.12",
}
req = urllib.request.Request(url, headers=userHeader)

In [None]:
# open url and read web page; the with-block closes the HTTP connection
# as soon as the body has been read
with urllib.request.urlopen(req) as response:
    the_page = response.read()

# beautifulsoup parse html
soup = bs(the_page, "html.parser")
print(soup)

Try to follow the code above to solve the challenge.