# JSON: JavaScript Object Notation
- A format for transferring objects over the Internet: serialize (flattening the object) -> encode -> receive -> decode -> deserialize
- Stored as plain (byte strings or utf-8 strings) text
- Contains data type information

json.loads() and json.dumps()

In [8]:
import json
# A JSON document held as a plain Python string
data_string = '[{"b": [2, 4], "3.0": "c", "a": "A"},34]'

# json.loads walks the JSON text recursively and builds the equivalent
# Python objects; json.dumps goes the other way and returns a string again
python_data = json.loads(data_string)
data_string2 = json.dumps(python_data)

# Show the type at each stage of the round trip: str -> list -> str
for value in (data_string, python_data, data_string2):
    print(type(value))

<class 'str'>
<class 'list'>
<class 'str'>


# API: Application Programming Interface
- A protocol containing a set of commands or functions that allow one piece of software to talk to another
- Data from the web is often gotten through an API
- Web APIs usually consist of two parts:
    - request: a well-formed HTTP request sent to a server
    - response: the server's reply, usually either an html page or a JSON object

### Requests library
- The primary mechanism for sending an API request or accessing a web server

In [None]:
import requests
# Fetch a page and look at the status code and at the payload before and
# after decoding
response = requests.get("https://www.allrecipes.com/")
print(response.status_code) #status code 200: the request response cycle was successful

raw_body = response.content  # the undecoded payload
print(type(raw_body))

# Data received from the world wide web is usually encoded in utf-8
decoded_body = raw_body.decode('utf-8')
print(type(decoded_body))


200
<class 'bytes'>
<class 'str'>


### Google Map Geocoding API Example

In [9]:
def get_lat_lng(address_string,api_key):
    """Look up the latitude/longitude of an address with the Google Maps Geocoding API.

    Args:
        address_string: Free-form address to geocode; may contain spaces and
            punctuation.
        api_key: Google Maps API key sent as the ``key`` query parameter.

    Returns:
        A ``(lat, lng)`` tuple taken from the first geocoding result, or
        ``('', '')`` when the request fails, the HTTP status is not 200, or
        the response does not contain a usable first result.
    """
    lat = ''
    lng = ''
    url = "https://maps.googleapis.com/maps/api/geocode/json"
    # Hand the query to requests as params so the address and key are
    # URL-encoded properly instead of being pasted into the URL by hand.
    params = {'address': address_string, 'key': api_key}

    try:
        response = requests.get(url, params=params)
    except requests.exceptions.RequestException:
        print("Something went wrong with requests.get")
        return (lat,lng)

    if response.status_code != 200:
        print("HTTP error",response.status_code)
        return (lat,lng)

    try:
        # requests can automatically decode and convert a json response into
        # a python object using response.json()
        location = response.json()['results'][0]['geometry']['location']
        # Both lookups run before either name is bound, so a partial
        # (lat set, lng empty) result can never be returned.
        lat, lng = location['lat'], location['lng']
    except (ValueError, KeyError, IndexError, TypeError):
        # ValueError: body is not JSON; the others: JSON without a first result
        print("Response not in valid JSON format")

    return (lat,lng)

In [10]:
import os

# API keys are secrets and must not be stored in the notebook source: read the
# key from the GOOGLE_MAPS_API_KEY environment variable instead. If it is not
# set, an empty key is sent and the lookup returns ('', '').
api_key = os.environ.get('GOOGLE_MAPS_API_KEY', '')
get_lat_lng("Columbia University, New York, NY",api_key)

(40.8075355, -73.9625727)

# XML
- eXtensible Markup Language
- data is stored in a tree
- data items are "tagged" with named values
- html is (loosely) similar to XML (both are based on SGML)
- The python library  **lxml** deals with converting an xml string to python objects and vice versa

In [13]:
# Sample XML document used by the examples below: a Bookstore root holding two
# Book elements, each carrying attributes (ISBN, Price, ...) and nested
# Title / Authors / Author / First_Name / Last_Name children.
data_string = """
<Bookstore>
   <Book ISBN="ISBN-13:978-1599620787" Price="15.23" Weight="1.5">
      <Title>New York Deco</Title>
      <Authors>
         <Author Residence="New York City">
            <First_Name>Richard</First_Name>
            <Last_Name>Berenholtz</Last_Name>
         </Author>
      </Authors>
   </Book>
   <Book ISBN="ISBN-13:978-1579128562" Price="15.80">
      <Remark>
      Five Hundred Buildings of New York and over one million other books are available for Amazon Kindle.
      </Remark>
      <Title>Five Hundred Buildings of New York</Title>
      <Authors>
         <Author Residence="Beijing">
            <First_Name>Bill</First_Name>
            <Last_Name>Harris</Last_Name>
         </Author>
         <Author Residence="New York City">
            <First_Name>Jorg</First_Name>
            <Last_Name>Brockmann</Last_Name>
         </Author>
      </Authors>
   </Book>
</Bookstore>
"""

from lxml import etree
# Parse the XML text into an element tree and show the before/after types
root = etree.XML(data_string)
print(type(data_string), "\n", root, type(root), "\n\n")

# etree.tostring returns bytes, so the tree has to be serialized with
# pretty_print and decoded before it prints as readable XML
pretty_xml = etree.tostring(root, pretty_print=True).decode("utf-8")
print(pretty_xml, "\n")
type(pretty_xml)

<class 'str'> 
 <Element Bookstore at 0x2f757ccdc80> <class 'lxml.etree._Element'> 


<Bookstore>
   <Book ISBN="ISBN-13:978-1599620787" Price="15.23" Weight="1.5">
      <Title>New York Deco</Title>
      <Authors>
         <Author Residence="New York City">
            <First_Name>Richard</First_Name>
            <Last_Name>Berenholtz</Last_Name>
         </Author>
      </Authors>
   </Book>
   <Book ISBN="ISBN-13:978-1579128562" Price="15.80">
      <Remark>
      Five Hundred Buildings of New York and over one million other books are available for Amazon Kindle.
      </Remark>
      <Title>Five Hundred Buildings of New York</Title>
      <Authors>
         <Author Residence="Beijing">
            <First_Name>Bill</First_Name>
            <Last_Name>Harris</Last_Name>
         </Author>
         <Author Residence="New York City">
            <First_Name>Jorg</First_Name>
            <Last_Name>Brockmann</Last_Name>
         </Author>
      </Authors>
   </Book>
</Bookstore>
 



str

### Iterate through XML Tree

1. root.iter(): loses the structure of the tree, but goes through every node

In [14]:
# Visit every node of the tree (root included), one per line
for node in root.iter():
    print(node)

<Element Bookstore at 0x2f757ccdc80>
<Element Book at 0x2f757cc8940>
<Element Title at 0x2f75a9c7e80>
<Element Authors at 0x2f758fb4380>
<Element Author at 0x2f757cc8940>
<Element First_Name at 0x2f75a9c7e80>
<Element Last_Name at 0x2f758fb4380>
<Element Book at 0x2f757cc8940>
<Element Remark at 0x2f75a9c7e80>
<Element Title at 0x2f758fb4380>
<Element Authors at 0x2f757cc8940>
<Element Author at 0x2f75a9c7e80>
<Element First_Name at 0x2f758fb4380>
<Element Last_Name at 0x2f757cc8940>
<Element Author at 0x2f75a9c7e80>
<Element First_Name at 0x2f758fb4380>
<Element Last_Name at 0x2f757cc8940>


2. only the children level in the subtree
- no bookstore / book information
- no deeper author name information

In [15]:
# Iterating an element yields its direct children only, so two nested loops
# reach exactly the nodes two levels below the root
for book in root:
    for field in book:
        print(field)

<Element Title at 0x2f758fb4fc0>
<Element Authors at 0x2f75aa23c00>
<Element Remark at 0x2f75a968480>
<Element Title at 0x2f758fb4fc0>
<Element Authors at 0x2f75aa23c00>


### Accessing some attributes
1. Element.tag: access the tag name
2. Element.text: prints the text in a leaf node
3. Element.find('tag_name'): 
    - access the FIRST node's content with specified tag names
    - .find function only looks at the children level, not other descendants
4. Element.findall(): finds ALL elements with a tag which are direct children of the current element. Element.find() finds the first child with a particular tag.
5. Element.get() accesses the element’s attributes

In [16]:
# Use [@attribute='value'] in the path to constrain matches on an attribute:
# last names of the authors of the book priced 15.80
for last_name in root.findall("Book[@Price ='15.80']/Authors/Author/Last_Name"):
    print(last_name.text)

# Full names of every author whose Residence attribute is "New York City"
for author in root.findall('Book/Authors/Author[@Residence = "New York City"]'):
    first = author.find('First_Name').text
    last = author.find('Last_Name').text
    print(first,last)

Harris
Brockmann
Richard Berenholtz
Jorg Brockmann


# Web Scraping

**Python libraries for web scraping**
- requests for handling the request-response cycle
- beautifulsoup4 for extracting data from an html string
- selenium for extracting data from an html string and managing the response process, particularly when a page contains JavaScript or when a button needs to be clicked

### Beautiful Soup
- html and xml parser
- makes use of formatted html tags and css properties to extract data
https://www.crummy.com/software/BeautifulSoup/bs4/doc/

1. find_all():
    - finds all instances of a specified tag
    - returns a bs4 result_set (a list)
2. find():
    - finds the first instance of a specified tag
    - returns a bs4 element
    
***bs4 functions can be recursively applied on bs4 elements***

3. CSS selectors: used in both find_all() and find() functions
    - selector=value
    - dictionary: {"keyword selector": value}
4. get_text(): returns the marked up text (the content, string) enclosed in a tag
5. get(): returns the value of a tag attribute (e.g. "href", "title")

In [37]:
import requests
from bs4 import BeautifulSoup
import re

# get all recipes names and links, and descriptions
def get_recipes(keywords):
    """Search allrecipes.com and collect name, link and description of each hit.

    Args:
        keywords: Search terms; may contain spaces or other characters that
            need URL-encoding.

    Returns:
        A list of ``(recipe_name, recipe_link, recipe_description)`` tuples.
        ``recipe_description`` is ``''`` when a card has no summary.
        Returns ``None`` when the HTTP status is not 200 or when an expected
        tag is missing from the results page.
    """
    # Pass the query through params so requests URL-encodes it; this also
    # handles characters such as '&' or '#' that plain concatenation breaks on.
    response = requests.get("https://www.allrecipes.com/search", params={"q": keywords})
    if response.status_code != 200:
        print("HTTP error",response.status_code)
        return None

    recipe_list = []
    try:
        results_page = BeautifulSoup(response.content,'lxml') # parse the response into readable format
        recipes = results_page.find_all('div',{"class":"comp card-list mntl-document-card-list mntl-card-list mntl-block"})
        for recipe in recipes:
            recipe_link = recipe.find('a').get('href')
            # "card__title-text" is a CSS class, not an attribute name: select
            # the span by class and read its text (Tag.get() would look up an
            # attribute with that name and always yield None).
            recipe_name = recipe.find('span',{"class":"card__title-text"}).get_text().strip()
            # re.compile matches any class that starts with "card__summary"
            summary = recipe.find('div',{"class":re.compile('card__summary.*')})
            if summary is None:
                recipe_description = ''
            else:
                recipe_description = summary.get_text().replace("\n",'').strip()
            recipe_list.append((recipe_name,recipe_link,recipe_description))
        return recipe_list
    except AttributeError:
        # find() returned None for a tag the page layout was expected to have
        print("Something went wrong with nodes tag finding")
        return None

In [38]:
import requests
import re
from bs4 import BeautifulSoup

# get a recipe's information given a recipe's link
def get_recipe_info(recipe_link):
    """Fetch a recipe page and extract its ingredients and preparation steps.

    Args:
        recipe_link: URL of the recipe page.

    Returns:
        A dict with the keys ``'ingredients'`` and ``'preparation'``, each a
        list of strings. An empty dict is returned when the request fails,
        the HTTP status is not 200, or an expected tag is missing.
    """
    recipe_dict = dict()

    try:
        response = requests.get(recipe_link)
    except requests.exceptions.RequestException:
        print("Something went wrong with request.get")
        return recipe_dict

    if response.status_code != 200:
        print("HTTP error",response.status_code)
        return recipe_dict

    # Parsing is handled separately from the request so that a changed page
    # layout is not misreported as a network failure.
    try:
        result_page = BeautifulSoup(response.content,'lxml')

        ingredientsection = result_page.find('ul',{"class":"ingredients-section"})
        ingredient_list = [
            ingredient.find('span',{"class":re.compile('ingredients-item-name.*')}).get_text()
            for ingredient in ingredientsection.find_all('li')
        ]

        stepssection = result_page.find('ul',{"class":"instructions-section"})
        prep_steps_list = [step.find('p').get_text() for step in stepssection.find_all('li')]
    except AttributeError:
        # find() returned None for a tag the page layout was expected to have
        print("Something went wrong with nodes tag finding")
        return recipe_dict

    recipe_dict['ingredients'] = ingredient_list
    recipe_dict['preparation'] = prep_steps_list
    return recipe_dict

In [39]:
# combine the above two functions
def get_all_recipes(keywords):
    """Search for recipes and return the full details of every hit.

    Args:
        keywords: Search terms passed on to ``get_recipes``.

    Returns:
        A list of dicts, one per recipe found. Each dict holds whatever
        ``get_recipe_info`` extracted for the recipe link, plus the keys
        ``'name'`` and ``'description'`` from the search result. The list is
        empty when the search itself fails.
    """
    results = list()
    # get_recipes returns None on failure; treat that as "no recipes" rather
    # than crashing on iteration over None.
    all_recipes = get_recipes(keywords) or []
    for name, link, description in all_recipes:
        recipe_dict = get_recipe_info(link) # details scraped from the recipe's own page
        recipe_dict['name'] = name
        recipe_dict['description'] = description
        results.append(recipe_dict)
    return results

# User Authentication
Login while Web Scraping

In [41]:
import os

# Credentials are secrets and must not be stored in the notebook source: read
# them from the WIKIPEDIA_USERNAME / WIKIPEDIA_PASSWORD environment variables.
# Unset variables yield empty strings, in which case the login will be rejected.
username = os.environ.get('WIKIPEDIA_USERNAME', '')
password = os.environ.get('WIKIPEDIA_PASSWORD', '')

# Construct an object that contains the data to be sent to the login page
payload = {
    'wpName': username,
    'wpPassword': password,
    # NOTE: the form fields 'wploginattempt', 'wpEditToken' and 'title' were
    # tried and left disabled here -- presumably not required; confirm against
    # the login form if the login stops working.
    'authAction': "login",
    }
def get_login_token(response):
    """Extract the login token from a login-page response.

    Parses ``response.text`` as HTML, locates the ``input`` tag whose ``name``
    attribute is "wpLoginToken" and returns that tag's ``value`` attribute.
    """
    login_page = BeautifulSoup(response.text, 'lxml')
    token_input = login_page.find('input',{'name':"wpLoginToken"})
    # "value" is an attribute of the input tag that was selected by its name
    return token_input.get('value')

import requests
from bs4 import BeautifulSoup

# The same URL serves the login form (GET) and receives the credentials (POST)
login_url = 'https://en.wikipedia.org/w/index.php?title=Special:UserLogin&returnto=Main+Page'

with requests.Session() as s: # create a new session object; cookies persist across its requests
    # Fetch the login page first: the server sends back the login session
    # together with the login token embedded in the form
    response = s.get(login_url)
    # The token is read from the login page and must be sent back with the credentials
    payload['wpLoginToken'] = get_login_token(response)
    # post(url, login_data)
    # Send the login request, send Server the data
    # If data is not what the Server wants, it will ignore it
    response_post = s.post(login_url, data=payload)
    # Get another page and check if we're still logged in, Server controls the time of each session
    # The with-block closes the session on exit; session.close() does the same manually
    response = s.get('https://en.wikipedia.org/wiki/Special:Watchlist')
    data = BeautifulSoup(response.content,'lxml')
    watchlist_details = data.find('div',class_='watchlistDetails')
    if watchlist_details is None:
        # The watchlist block is missing from the page -- most likely the
        # login was rejected, so report that instead of raising AttributeError
        print("Watchlist details not found; the login probably failed")
    else:
        print(watchlist_details.get_text())

You have 1 page on your watchlist (excluding talk pages). Changes to pages since you last visited them are shown in bold with solid markers. 
