# Coding Temple's Data Analytics Course
---
## Python for Data Analysis: Working with APIs and Web Scraping

## What is an API?

API stands for Application Programming Interface. APIs are a way for two applications to communicate with one another and are commonly used to retrieve and send data using code. An API follows this format: ![](https://www.altexsoft.com/media/2019/06/1.png)


The API sits between the server and the internet and is in constant communication with both. When we create our application, or the web app in the browser, we make a request to that API through the internet for information, and the API responds with that data if we are allowed to access it.

Another way to think of it could be: 

![](https://images.ctfassets.net/wqwerb01q4v1/5KgFJ5hSoJqT5NPkfOI0Fb/2bf6e6d5776c91ce5f2cc6ca1eb40dd0/How_Do_APIs_Work_5.png)


To talk to the API, we need to import a package called `requests` to handle the connection and communication for us. We also will want to import a package called `json` to work with the data that is returned by the API. 

In [None]:
#import the packages we need:
import requests, json

In [None]:
# What does requests do?
help(requests)

In [None]:
# What does json do?
help(json)

We will be connecting to the Ergast F1 Racer API today. You can look at the API docs here:
http://ergast.com/mrd/



In [None]:
# help(requests)
url = 'http://ergast.com/api/f1/2008/5/driverStandings.json'

In [None]:
#response object
response = requests.get(url)
print(response)

In [None]:
#check and make sure we got a successful response from the API
response.ok

In [None]:
#view the response as a JSON
response.json()

In [None]:
type(response.json())

In [None]:
# We only want the Driver Standings
my_racer_data = response.json()['MRData']['StandingsTable']['StandingsLists'][0]['DriverStandings']
my_racer_data

In [None]:
# What if I only wanted the driver's name?
# my_racer_data already holds the DriverStandings list, so we loop over it directly
# instead of re-parsing the whole JSON response once for every driver.
driver = [racer['Driver']['familyName'] for racer in my_racer_data]
driver

In [None]:
# Create a function to get racer info
def get_racer_info(data):
    """
    Input: iterable of driver-standings entries (one dictionary per racer) taken from the API response
    Builds a summary dictionary for every racer, keyed by the racer's full name ("<givenName> <familyName>")
    Output: List object containing one single-key dictionary per racer, in the same order as the input
    """
    def summarize(entry):
        # Grab the nested driver record once so the field mapping below stays readable
        person = entry["Driver"]
        full_name = f'{person["givenName"]} {person["familyName"]}'

        details = {
            'first_name' : person['givenName'],
            'last_name' : person['familyName'],
            'position' : entry['position'],
            'wins' : entry['wins'],
            'DOB' : person['dateOfBirth'],
            'nationality' : person['nationality'],
            # Only the first constructor listed for the racer is kept
            'constructor' : entry['Constructors'][0]['name']
        }
        return {full_name: details}

    return [summarize(entry) for entry in data]

get_racer_info(my_racer_data)

## In-Class Exercise:

Create a function that allows a user to see the driver standings for a specific year and round

In [None]:
# Create a function that allows a user to see a specific year and round
def get_driver_info_by_year_and_round(year:int, r:int):
    """
    Input: Year -> int, r(round) -> int
    How it works: Use the input variables to grab a response object from the API for the year and round that is specified by the user.
    Takes the DriverStandings list out of the response and inputs it into the get_racer_info function
    Output: List object containing the dictionary for each racer in the response object,
    or an error message (str) when no standings could be loaded for that year and round
    """
    error_message = 'We had an error loading your data. It is likely that the year or round is not present in the API object'

    # Step 1: Edit the URL to include year and round
    url = f'http://ergast.com/api/f1/{year}/{r}/driverStandings.json'
    # Step 2: Create a response object using the requests library
    # The timeout stops the call from waiting forever if the server never answers
    response = requests.get(url, timeout=10)

    # A failed request (4xx/5xx status) has no standings to read, so report it like missing data
    if not response.ok:
        return error_message

    # Parse the body a single time and reuse it below.
    # A body that is not valid JSON raises a ValueError subclass, which we also report as missing data
    try:
        payload = response.json()
    except ValueError:
        return error_message

    # I am going to set up a base case. What if the round or year is not in the API object?
    standings_lists = payload['MRData']['StandingsTable']['StandingsLists']
    if not standings_lists:
        return error_message

    data = standings_lists[0]['DriverStandings']
    return get_racer_info(data)

get_driver_info_by_year_and_round(2025,5)

## APIs Part 2: Connecting to an API and returning an image

### Imports

In [None]:
import requests
import pandas as pd
from PIL import Image
from io import BytesIO

### Connect to the API

In [None]:
api_url = 'https://api.scryfall.com/bulk-data'

response = requests.get(api_url)

# Build the table of available bulk files once and reuse it for both the filter and the lookup
bulk_files = pd.DataFrame.from_dict(response.json()['data'])

# Keep only the 'oracle_cards' row(s) and take the first match BY POSITION with .iloc[0].
# A plain [0] after filtering looks for the index label 0, which only exists
# when the matching row happens to be the very first row of the table.
oracle_uri = bulk_files.loc[bulk_files['type'] == 'oracle_cards', 'download_uri'].iloc[0]

# Download the bulk file itself and load it into a DataFrame
return_frame = pd.DataFrame.from_dict(requests.get(oracle_uri).json())

In [None]:
return_frame

### Return an Image object

In [None]:
# Filter down to the card we want, then take the first match BY POSITION with .iloc[0].
# A plain [0] after filtering looks for the index label 0, which raises a KeyError
# unless the matching card happens to sit in the very first row of return_frame.
img_str = return_frame.loc[return_frame['name'] == 'Static Orb', 'image_uris'].iloc[0]['normal']
img_str

In [None]:
# Step 1: Get a response object using requests.get()
response = requests.get(img_str)

# Step 2 Use the PIL and BytesIO to open the image in Python!
img = Image.open(BytesIO(response.content))
img

In [None]:
# Install Beautiful Soup
!python -m pip install beautifulsoup4

## Beautiful Soup

### Using Requests

In [None]:
# Connect to URL
page = requests.get('https://www.cs.cmu.edu/~rgs/alice-VII.html')

In [None]:
# display result response
page

#### .text and .content

In [None]:
# View the body of the response as text (the raw HTML of the page)
page.text

### Importing

In [None]:
from bs4 import BeautifulSoup

### Using Beautiful Soup

In [None]:
# Instantiate BeautifulSoup class
soup = BeautifulSoup(page.content, 'html.parser')

soup

### .prettify()

In [None]:
#NOTE: .prettify() works on the full document or on a single tag (such as the result of .find()), not on the list returned by .find_all()
print(soup.prettify())

### Converting to a List

In [None]:
# Tags may contain strings and other tags. These elements are the tag’s children.
print(list(soup.children))
print(f'Count of children: {len(list(soup.children))}')

### Extracting Beautiful Soup Elements

In [None]:
# We can traverse through an HTML page and extract other tags and text
# The below example shows the types of iterables available in the object created from the HTML Document
# .Tag allows us to dive deeper into the document i.e we can look for HTML attributes like .class and if needed go deeper into the document from there
[type(item) for item in list(soup.children)]

### Assigning Variables from Beautiful Soup

In [None]:
import pprint

html = list(soup.children)[0] # Selecting the HTML element child from the soup object
body = list(html.children)[3] # Selecting the body element from the HTML child
center = list(body.children)[2]
table = list(center.children)[5]
table.prettify()

### .find() <br>
<p>Find a specific instance of the parameter passed in</p>

In [None]:
table.find('b')

### .find_all() <br>
<p>Similar to .find(), except this will return all of them instead of one</p>

In [None]:
# Collect the text of every <b> tag found inside the table
text_corpus = [bold_tag.get_text() for bold_tag in table.find_all('b')]

text_corpus

## Exercise <br>
<p>Using the Beautiful Soup library, grab the data from the following link: https://www.nbastuffer.com/2019-2020-nba-player-stats/. After getting the data, display each player's name and team inside of a pandas DataFrame.</p>

In [None]:
# Step 1: Imports
import pandas as pd

# Step 2: Create a request object using the .get(<url>) method on the url
page_nba = requests.get('https://www.nbastuffer.com/2019-2020-nba-player-stats/')

# Step 3: We are going to make some soup!
# Using the BeautifulSoup class, we can take the text of the page and parse it with an HTML parser
# Parsing means we will read over the page
soup = BeautifulSoup(page_nba.text, 'html.parser')

# Now that we have the data, we can begin working with it!
# Step 4: Instantiate two empty list objects to hold the names and teams of each player:
names = []
teams = []

# Step 5a: Iterate through the document, and get the info we need
# We can use find_all to create an iterable: soup.find_all(<tag>), where tag = 'tr'
# We can then append each value for name and team to their respective lists using indexing
# We can verify by printing and exploring the find_all BEFORE adding it to a list
# print(soup.find_all('tr'))
for node in soup.find_all('tr'):
    names.append(node.find_all(string=True)[1])
    teams.append(node.find_all(string=True)[2])
    
# Step 5b: Remove the column titles from the list of values
names.pop(0)
teams.pop(0)

# Step 6: Create a dataframe object from a dictionary using the two lists that we created above!
player_data = pd.DataFrame.from_dict({
    'player_name' : names,
    'team' : teams
})

# View the data
player_data

# Bonus Example: Pulling Vegas Odds from PFR.com

<h3> Use this example for further reference</h3>
<p> This is an example that shows what we get back when accessing an HTML document with Beautiful Soup</p>

In [None]:
page = requests.get('https://www.pro-football-reference.com/boxscores/201810140nwe.htm')
# print(page.status_code)

soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
print(soup.prettify())

In [None]:
for section in list(soup.children):
    print(section)
    print('1\n2\n3\n')

In [None]:
html = list(soup.children)[3]

html

In [None]:
body = list(html.children)[3]

for el in list(body.children):
    print(el)
    print('\n\n\n\n123\n\n\n\n')

In [None]:
table = body.find_all('div')

print(table)

In [None]:
from bs4 import Comment

# The game_info table is searched for inside HTML comments, so collect every comment node first
comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Re-parse the comment's text as HTML so the markup inside it becomes searchable.
    # We name the parser explicitly (the same one used for the page above) so the result
    # does not depend on which parsers happen to be installed and no parser warning is shown.
    comment_soup = BeautifulSoup(str(comment), 'html.parser')
    log = comment_soup.find('table', {'id':'game_info'}) #search as ordinary tag
    if log:
        print(log)

In [None]:
# Ask the API for the full list of pokemon (each entry has a name and a url to its details)
response = requests.get('https://pokeapi.co/api/v2/pokemon?limit=100000&offset=0')
new_data = []
# The loop variable is deliberately NOT called `object`: at the top level of a notebook that
# name would replace Python's builtin `object` for every cell that runs afterwards.
for entry in response.json()['results']:
    # One follow-up request per pokemon to download its detail record
    new_data.append(requests.get(entry['url']).json())

new_data

In [None]:
import requests
from PIL import Image
from io import BytesIO

In [None]:
requests.get('https://pokeapi.co/api/v2/pokemon/ditto').json()

In [None]:
Image.open(BytesIO(requests.get("https://raw.githubusercontent.com/PokeAPI/sprites/master/sprites/pokemon/shiny/132.png").content))


In [None]:
def return_img(con_str:str):
    """Download whatever is served at con_str and return it opened as an Image object."""
    downloaded = requests.get(con_str)
    # Wrap the raw bytes in an in-memory file so Image.open can read them
    raw_bytes = BytesIO(downloaded.content)
    return Image.open(raw_bytes)

return_img(
    'https://raw.githubusercontent.com/PokeAPI/sprites/master/sprites/pokemon/shiny/132.png'
)

In [None]:
def get_pokemon(pokemon_name:str):
    """Request the pokemon endpoint for the lower-cased pokemon_name and return the parsed JSON body."""
    endpoint = 'https://pokeapi.co/api/v2/pokemon/' + pokemon_name.lower()
    return requests.get(endpoint).json()
for x in range(1,3):
    print(get_pokemon(str(x)))

In [None]:
Image.open(BytesIO(requests.get(requests.get('https://pokeapi.co/api/v2/pokemon/ditto').json()['sprites']['front_shiny']).content))


In [None]:
requests.get('https://pokeapi.co/api/v2/pokemon/ditto').json()['sprites']['front_shiny']


In [None]:
pokemon = []
for x in range(1,100):
    url = f"https://pokeapi.co/api/v2/pokemon/{x}/"
    pokemon.append(requests.get(url).json())

In [None]:
def get_poke_api(poke_name:str):
    """
    Input: poke_name -> str, placed at the end of the pokemon endpoint URL
    Output: dictionary keyed by poke_name whose value holds the pokemon's 'name'
    and its 'ability' (only the first ability listed in the response is kept)
    """
    endpoint = f'https://pokeapi.co/api/v2/pokemon/{poke_name}'
    payload = requests.get(endpoint).json()

    summary = {
        'name': payload['name'],
        'ability': payload['abilities'][0]['ability']['name']
    }
    return {poke_name: summary}


poke_names = ['ditto', 'bulbasaur', 'charmander', 'squirtle']
poke_data = []
for name in poke_names:
    poke_data.append(get_poke_api(name))
poke_data

In [None]:
requests.get('https://pokeapi.co/api/v2/type/ground/').json()
