In [1]:
import pandas as pd
import numpy as np

# colab-specific file access
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Colab Notebooks

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks


The code below produces the data frames used in the examples:

In [None]:
heroes = pd.DataFrame(
    data={'color': ['red', 'green', 'black',
                    'blue', 'black', 'red'],
          'first_seen_on': ['a', 'a', 'f', 'a', 'a', 'f'],
          'first_season': [2, 1, 2, 3, 3, 1]},
    index=['flash', 'arrow', 'vibe',
           'atom', 'canary', 'firestorm']
)

identities = pd.DataFrame(
    data={'ego': ['barry allen', 'oliver queen', 'cisco ramon',
                  'ray palmer', 'sara lance',
                  'martin stein', 'ronnie raymond'],
          'alter-ego': ['flash', 'arrow', 'vibe', 'atom',
                        'canary', 'firestorm', 'firestorm']}
)

teams = pd.DataFrame(
    data={'team': ['flash', 'arrow', 'flash', 'legends',
                   'flash', 'legends', 'arrow'],
          'hero': ['flash', 'arrow', 'vibe', 'atom',
                   'killer frost', 'firestorm', 'speedy']})

## Pandas and Wrangling

In [None]:
heroes

Unnamed: 0,color,first_seen_on,first_season
flash,red,a,2
arrow,green,a,1
vibe,black,f,2
atom,blue,a,3
canary,black,a,3
firestorm,red,f,1


In [None]:
identities

Unnamed: 0,ego,alter-ego
0,barry allen,flash
1,oliver queen,arrow
2,cisco ramon,vibe
3,ray palmer,atom
4,sara lance,canary
5,martin stein,firestorm
6,ronnie raymond,firestorm


In [None]:
teams

Unnamed: 0,team,hero
0,flash,flash
1,arrow,arrow
2,flash,vibe
3,legends,atom
4,flash,killer frost
5,legends,firestorm
6,arrow,speedy


### Slice and Dice

#### Column selection by label


In [None]:
heroes.loc[:, 'color']

Unnamed: 0,color
flash,red
arrow,green
vibe,black
atom,blue
canary,black
firestorm,red


In [None]:
heroes.loc[:, ['color', 'first_season']]

Unnamed: 0,color,first_season
flash,red,2
arrow,green,1
vibe,black,2
atom,blue,3
canary,black,3
firestorm,red,1


In [None]:
heroes['first_seen_on']

Unnamed: 0,first_seen_on
flash,a
arrow,a
vibe,f
atom,a
canary,a
firestorm,f


#### Row Selection by Label


In [None]:
heroes.loc[['flash', 'vibe'], :]

Unnamed: 0,color,first_seen_on,first_season
flash,red,a,2
vibe,black,f,2


In [None]:
heroes.loc[['flash', 'vibe']]

Unnamed: 0,color,first_seen_on,first_season
flash,red,a,2
vibe,black,f,2


#### General Selection by Label



In [None]:
heroes.loc['flash':'atom', :'first_seen_on']

Unnamed: 0,color,first_seen_on
flash,red,a
arrow,green,a
vibe,black,f
atom,blue,a


#### Selection by Integer Index

In [None]:
heroes.iloc[:4,:2]

Unnamed: 0,color,first_seen_on
flash,red,a
arrow,green,a
vibe,black,f
atom,blue,a


### Filtering with boolean arrays

In [None]:
heroes[(heroes['first_season']==3) & (heroes['first_seen_on']=='a')]


Unnamed: 0,color,first_seen_on,first_season
atom,blue,a,3
canary,black,a,3


#### Problem Solving Strategy


In [None]:
heroes[heroes['first_season'].isin([1,3])]

Unnamed: 0,color,first_seen_on,first_season
arrow,green,a,1
atom,blue,a,3
canary,black,a,3
firestorm,red,f,1


### Counting Rows

In [None]:
heroes['color'].value_counts()

Unnamed: 0_level_0,count
color,Unnamed: 1_level_1
red,2
black,2
green,1
blue,1


In [None]:
heroes.groupby(['color', 'first_season']).size()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
color,first_season,Unnamed: 2_level_1
black,2,1
black,3,1
blue,3,1
green,1,1
red,1,1
red,2,1


In [None]:
heroes.groupby(['color', 'first_season']).size().reset_index(name='count')

Unnamed: 0,color,first_season,count
0,black,2,1
1,black,3,1
2,blue,3,1
3,green,1,1
4,red,1,1
5,red,2,1


### Joining Tables on One Column


In [None]:
heroes['hero'] = heroes.index
heroes

Unnamed: 0,color,first_seen_on,first_season,hero
flash,red,a,2,flash
arrow,green,a,1,arrow
vibe,black,f,2,vibe
atom,blue,a,3,atom
canary,black,a,3,canary
firestorm,red,f,1,firestorm


#### Inner Join


In [None]:
pd.merge(heroes, teams, how='inner', on='hero')

Unnamed: 0,color,first_seen_on,first_season,hero,team
0,red,a,2,flash,flash
1,green,a,1,arrow,arrow
2,black,f,2,vibe,flash
3,blue,a,3,atom,legends
4,red,f,1,firestorm,legends


#### Left and right join


In [None]:
pd.merge(heroes, teams, how='left', on='hero')

Unnamed: 0,color,first_seen_on,first_season,hero,team
0,red,a,2,flash,flash
1,green,a,1,arrow,arrow
2,black,f,2,vibe,flash
3,blue,a,3,atom,legends
4,black,a,3,canary,
5,red,f,1,firestorm,legends


#### Outer join


In [None]:
pd.merge(heroes, teams, how='outer', on='hero')

Unnamed: 0,color,first_seen_on,first_season,hero,team
0,red,a,2.0,flash,flash
1,green,a,1.0,arrow,arrow
2,black,f,2.0,vibe,flash
3,blue,a,3.0,atom,legends
4,black,a,3.0,canary,
5,red,f,1.0,firestorm,legends
6,,,,killer frost,flash
7,,,,speedy,arrow


#### More than one match?


In [None]:
pd.merge(heroes, identities, how='inner',
         left_on='hero', right_on='alter-ego')

Unnamed: 0,color,first_seen_on,first_season,hero,ego,alter-ego
0,red,a,2,flash,barry allen,flash
1,green,a,1,arrow,oliver queen,arrow
2,black,f,2,vibe,cisco ramon,vibe
3,blue,a,3,atom,ray palmer,atom
4,black,a,3,canary,sara lance,canary
5,red,f,1,firestorm,martin stein,firestorm
6,red,f,1,firestorm,ronnie raymond,firestorm


### Missing Values

In [None]:
x = np.nan
y = pd.merge(heroes, teams, how='outer', on='hero')['first_season']
y

Unnamed: 0,first_season
0,2.0
1,1.0
2,2.0
3,3.0
4,3.0
5,1.0
6,
7,


In [None]:
pd.isnull(x)

True

In [None]:
y.isnull()

Unnamed: 0,first_season
0,False
1,False
2,False
3,False
4,False
5,False
6,True
7,True


In [None]:
pd.isnull(y)

Unnamed: 0,first_season
0,False
1,False
2,False
3,False
4,False
5,False
6,True
7,True


In [None]:
y.notnull()

Unnamed: 0,first_season
0,True
1,True
2,True
3,True
4,True
5,True
6,False
7,False


In [None]:
y[y.notnull()]

Unnamed: 0,first_season
0,2.0
1,1.0
2,2.0
3,3.0
4,3.0
5,1.0


"On-Time" flight data for all flights originating from SFO or OAK in January 2016. Information about the airports and airlines are contained in the comma-delimited files `airports.dat` and `airlines.dat`, respectively.  Both were sourced from https://openflights.org/data.php.

In [None]:
flights = pd.read_csv("flights.dat", dtype={'sched_dep_time': 'f8', 'sched_arr_time': 'f8', 'origin': 'string', 'destination': 'string'})
flights.head()

Unnamed: 0,year,month,day,date,carrier,tailnum,flight,origin,destination,sched_dep_time,actual_dep_time,sched_arr_time,actual_arr_time
0,2016,1,1,2016-01-01,AA,N3FLAA,208,SFO,MIA,630.0,628.0,1458.0,1431.0
1,2016,1,2,2016-01-02,AA,N3APAA,208,SFO,MIA,600.0,553.0,1428.0,1401.0
2,2016,1,3,2016-01-03,AA,N3DNAA,208,SFO,MIA,630.0,626.0,1458.0,1431.0
3,2016,1,4,2016-01-04,AA,N3FGAA,208,SFO,MIA,630.0,626.0,1458.0,1444.0
4,2016,1,5,2016-01-05,AA,N3KUAA,208,SFO,MIA,640.0,632.0,1458.0,1439.0


In [None]:
airports_cols = [
    'openflights_id',
    'name',
    'city',
    'country',
    'iata',
    'icao',
    'latitude',
    'longitude',
    'altitude',
    'tz',
    'dst',
    'tz_olson',
    'type',
    'airport_dsource'
]

airports = pd.read_csv("airports.dat", names=airports_cols, dtype={'iata': 'string', 'city':'string'})
airports.head(3)

Unnamed: 0,openflights_id,name,city,country,iata,icao,latitude,longitude,altitude,tz,dst,tz_olson,type,airport_dsource
0,1,Goroka,Goroka,Papua New Guinea,GKA,AYGA,-6.081689,145.391881,5282,10.0,U,Pacific/Port_Moresby,,
1,2,Madang,Madang,Papua New Guinea,MAG,AYMD,-5.207083,145.7887,20,10.0,U,Pacific/Port_Moresby,,
2,3,Mount Hagen,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.826789,144.295861,5388,10.0,U,Pacific/Port_Moresby,,


In [None]:
airports.head(10)

Unnamed: 0,openflights_id,name,city,country,iata,icao,latitude,longitude,altitude,tz,dst,tz_olson,type,airport_dsource
0,1,Goroka,Goroka,Papua New Guinea,GKA,AYGA,-6.081689,145.391881,5282,10.0,U,Pacific/Port_Moresby,,
1,2,Madang,Madang,Papua New Guinea,MAG,AYMD,-5.207083,145.7887,20,10.0,U,Pacific/Port_Moresby,,
2,3,Mount Hagen,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.826789,144.295861,5388,10.0,U,Pacific/Port_Moresby,,
3,4,Nadzab,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569828,146.726242,239,10.0,U,Pacific/Port_Moresby,,
4,5,Port Moresby Jacksons Intl,Port Moresby,Papua New Guinea,POM,AYPY,-9.443383,147.22005,146,10.0,U,Pacific/Port_Moresby,,
5,6,Wewak Intl,Wewak,Papua New Guinea,WWK,AYWK,-3.583828,143.669186,19,10.0,U,Pacific/Port_Moresby,,
6,7,Narsarsuaq,Narssarssuaq,Greenland,UAK,BGBW,61.160517,-45.425978,112,-3.0,E,America/Godthab,,
7,8,Nuuk,Godthaab,Greenland,GOH,BGGH,64.190922,-51.678064,283,-3.0,E,America/Godthab,,
8,9,Sondre Stromfjord,Sondrestrom,Greenland,SFJ,BGSF,67.016969,-50.689325,165,-3.0,E,America/Godthab,,
9,10,Thule Air Base,Thule,Greenland,THU,BGTL,76.531203,-68.703161,251,-4.0,E,America/Thule,,


In [None]:
def iata_to_city(iata_codes, airports):
    """
    Converts iata codes to city names.

    Args:
        iata_codes (string): series of iata codes given in string format.
        airports (Pandas Dataframe): dataframe containing information about airports.

    Returns:
        array (string): series of input dimension with names of cities.
    """
    iata = iata_codes.to_frame(name='iata')
    not_nan = iata[iata['iata'].notna()]
    city = pd.merge(not_nan, airports[['iata', 'city']], how='left', on='iata')
    ans = pd.merge(iata,city[['iata', 'city']], how='left', on='iata')
    return ans['city']


In [None]:

import unittest

class TestIataToCity(unittest.TestCase):
  def test_iata_to_city(self):
    ser = pd.Series(["ORD", "DFW", "SBN", np.nan], dtype='string')
    self.assertTrue(iata_to_city(ser, airports).equals(pd.Series(["Chicago", "Dallas-Fort Worth", "South Bend", np.nan], dtype='string')))

if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)

..
----------------------------------------------------------------------
Ran 2 tests in 0.022s

OK


In [None]:
def minutes_passed(time):
    """
    Converts military time to the number of minutes passed in the day

    Args:
        time (float64): series of time given in military format.
          Takes on values in 0.0-2359.0 due to float64 representation.

    Returns:
        array (float64): series of input dimension with number of minutes passed in the day

    Example: 1:03pm is converted to 783.0
    """

    hr=(time // 100)
    min=(time % 100)
    cond = (time >= 0) & (time < 2400) & (min<60)
    min_pass=(hr*60)+min

    return min_pass.where(cond,np.nan)


ser = pd.Series([1303,1200, 2400], dtype='float64')
minutes_passed(ser)

Unnamed: 0,0
0,783.0
1,720.0
2,


In [None]:

import unittest

class TestMinutesPassed(unittest.TestCase):
  def test_minutes_passed(self):
    ser = pd.Series([743, 954, 2157, 300, 12, np.nan], dtype="float64")
    self.assertTrue(minutes_passed(ser).equals(pd.Series([463, 594, 1317, 180, 12, np.nan], dtype='float64')))

if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)

..
----------------------------------------------------------------------
Ran 2 tests in 0.016s

OK


In [None]:
def get_time_diff(x, y):
    """
    Calculates delay times y - x

    Args:
        x (float64): series of scheduled time given in military format.
          Takes on values in 0.0-2359.0 due to float64 representation.
        y (float64): series of same dimensions giving actual time

    Returns:
        array (float64): series of input dimension with delay time in minutes
    """

    return ((minutes_passed(y)-minutes_passed(x))*1)/1



sched = pd.Series([1303, 1210], dtype='float64')
actual = pd.Series([1304, 1215], dtype='float64')
get_time_diff(sched, actual)



Unnamed: 0,0
0,1.0
1,5.0


In [None]:

#delay_time = ... # Series object showing delay time for all flights
delay_time =  get_time_diff(flights['sched_arr_time'],flights['actual_arr_time'])
#on_time = ... # Dataframe showing flights that arrive on time
flights['delay_time']= delay_time
on_time = flights[flights['delay_time']<=10].iloc[:,:-1]
#delayed = ... # Dataframe showing flights that do NOT arrive on time
delayed = flights[flights['delay_time']>10].iloc[:,:-1]
print(delay_time)
print(on_time)
print(delayed)

0         -27.0
1         -27.0
2         -27.0
3         -14.0
4         -19.0
          ...  
16856      59.0
16857      68.0
16858   -1254.0
16859   -1264.0
16860     130.0
Length: 16861, dtype: float64
       year  month  day        date carrier tailnum  flight origin  \
0      2016      1    1  2016-01-01      AA  N3FLAA     208    SFO   
1      2016      1    2  2016-01-02      AA  N3APAA     208    SFO   
2      2016      1    3  2016-01-03      AA  N3DNAA     208    SFO   
3      2016      1    4  2016-01-04      AA  N3FGAA     208    SFO   
4      2016      1    5  2016-01-05      AA  N3KUAA     208    SFO   
...     ...    ...  ...         ...     ...     ...     ...    ...   
16852  2016      1   11  2016-01-11      F9  N227FR     660    SFO   
16854  2016      1   20  2016-01-20      F9  N952FR     756    SFO   
16855  2016      1    3  2016-01-03      F9  N208FR    1124    SFO   
16858  2016      1    3  2016-01-03      F9  N910FR     662    SFO   
16859  2016      1    3 

In [None]:
sfo_oak = on_time[on_time['origin'].isin(['SFO', 'OAK'])]
ser=sfo_oak['destination'][:]
cities=iata_to_city(ser,airports)

#on_time_airports = ... # first get a Dataframe showing airports where on-time flights arrive
on_time_airports = airports[airports['city'].isin(cities)]
print(on_time_airports)
#on_time_destinations = ... # then get the unique and sorted destination cities
on_time_destinations= on_time_airports['city'].drop_duplicates().sort_values()
print(on_time_destinations)


      openflights_id                               name             city  \
1065            1085                        Saint Louis        St. Louis   
1695            6880          Montrose Regional Airport      Montrose CO   
1838            6877            North Las Vegas Airport        Las Vegas   
2584            2654                      Maria Dolores      Los Angeles   
2694            2766               Santa Ana Del Yacuma        Santa Ana   
...              ...                                ...              ...   
8032            9467                    Port Everglades  Fort Lauderdale   
8068            9503             Holmesburg Jct Station     Philadelphia   
8071            9506           B Street Cruise Terminal        San Diego   
8077            9512                    Port of Seattle          Seattle   
8106            9541  San Diego Old Town Transit Center        San Diego   

            country  iata  icao   latitude   longitude  altitude   tz dst  \
1065      

In [None]:

sfo_oak = delayed[delayed['origin'].isin(['SFO', 'OAK'])]
ser=sfo_oak['destination'][:]
cities=iata_to_city(ser,airports)
#late_airports = ... # first get a Dataframe showing airports where late flights arrive
late_airports = airports[airports['city'].isin(cities)]
print(late_airports)
#late_destinations = ... # then get the unique and sorted destination cities
late_destinations= on_time_airports['city'].drop_duplicates().sort_values()
print(late_destinations)


      openflights_id                               name             city  \
1065            1085                        Saint Louis        St. Louis   
1695            6880          Montrose Regional Airport      Montrose CO   
1838            6877            North Las Vegas Airport        Las Vegas   
2584            2654                      Maria Dolores      Los Angeles   
2694            2766               Santa Ana Del Yacuma        Santa Ana   
...              ...                                ...              ...   
8032            9467                    Port Everglades  Fort Lauderdale   
8068            9503             Holmesburg Jct Station     Philadelphia   
8071            9506           B Street Cruise Terminal        San Diego   
8077            9512                    Port of Seattle          Seattle   
8106            9541  San Diego Old Town Transit Center        San Diego   

            country  iata  icao   latitude   longitude  altitude   tz dst  \
1065      

###Web scraping and data collection


* Using HTTP to fetch the content of a website
* HTTP Requests (and lifecycle)
* RESTful APIs
    * Authentication (OAuth)
    * Pagination
    * Rate limiting
* JSON vs. HTML
* HTML traversal (CSS selectors)


[link text](https://)

In [None]:
import io, time, json
import requests
from bs4 import BeautifulSoup

## Authentication and working with APIs


In [None]:
def retrieve_html(url):
    """
    Return the raw HTML at the specified URL.

    Args:
        url (string):

    Returns:
        status_code (integer):
        raw_html (string): the raw HTML content of the response, properly encoded according to the HTTP headers.
    """

    response=requests.get(url)
    return (response.status_code,response.text)

In [None]:

youtube_article = retrieve_html('https://apnews.com/article/north-america-technology-business-journalism-media-a3b9b5a518f247b8a2ebbf4fb5c2d9ed')
print(youtube_article)




## Yelp API Access
###Authenticated HTTP Request with the Yelp API


In [None]:
with open('/content/yelp_api_key.txt', 'r') as f:
    api_key = f.read().replace('\n','')
    print(api_key)


ZwaDNPmJ63AwlQjgp7a6du0T0pysCKH6eXmD_OQlt87zKAkOzpmR8_7oZXQlfWOYj674cUQJAutz-i8JzqdOEVx5TCMm2Zh5XMkNXjX9Q5e7fdcnzubNrRUbeGjrZnYx 


In [None]:
def read_api_key(filepath):
    """
    Read the Yelp API Key from file.

    Args:
        filepath (string): File containing API Key
    Returns:
        api_key (string): The API Key
    """

    with open(filepath, 'r') as f:
        return f.read().replace('\n','')

In [None]:
read_api_key('/content/yelp_api_key.txt')

'ZwaDNPmJ63AwlQjgp7a6du0T0pysCKH6eXmD_OQlt87zKAkOzpmR8_7oZXQlfWOYj674cUQJAutz-i8JzqdOEVx5TCMm2Zh5XMkNXjX9Q5e7fdcnzubNrRUbeGjrZnYx '

Using the Yelp API, fill in the following function stub to make an authenticated request to the [search](https://docs.developer.yelp.com/reference/v3_business_search) endpoint. Remember Yelp allows you to pass the API Key via a special HTTP Header: `Authorization: Bearer <API_KEY>`. Check out the [docs](https://docs.developer.yelp.com/docs/fusion-authentication) for more information.

In [None]:
def location_search_params(api_key, location, **kwargs):
    """
    Construct url, headers and url_params. Reference API docs (link above) to use the arguments
    """
    url = 'https://api.yelp.com/v3/businesses/search'
    headers =  {
        'Authorization': 'Bearer ' + api_key
    }
    url_params = {
        'location': location.replace(' ','+')
    }
    url_params.update(kwargs)

    return url, headers, url_params


In [None]:
api_key = "test_api_key_xyz"
location = "Chicago"
url, headers, url_params = location_search_params(api_key, location, offset=0, limit=50)
url, headers, url_params


('https://api.yelp.com/v3/businesses/search',
 {'Authorization': 'Bearer test_api_key_xyz'},
 {'location': 'Chicago', 'offset': 0, 'limit': 50})

In [None]:
def api_get_request(url, headers, url_params):
    """
    Send a HTTP GET request and return a json response

    Args:
        url (string): API endpoint url
        headers (dict): A python dictionary containing HTTP headers including Authentication to be sent
        url_params (dict): The parameters (required and optional) supported by endpoint

    Returns:
        results (json): response as json
    """
    http_method = 'GET'
    response = requests.request(http_method, url, headers=headers, params=url_params)

    return response.json()


def yelp_search(api_key, location, offset=0):
    """
    Make an authenticated request to the Yelp API.

    Args:
        api_key (string): Your Yelp API Key for Authentication
        location (string): Business Location
        offset (int): param for pagination

    Returns:
        total (integer): total number of businesses on Yelp corresponding to the location
        businesses (list): list of dicts representing each business
    """
    url, headers, url_params = location_search_params(api_key, location, offset=0)
    response_json = api_get_request(url, headers, url_params)
    return response_json["total"], list(response_json["businesses"])

api_key = read_api_key('/content/yelp_api_key.txt')
num_records, data = yelp_search(api_key, 'Chicago')
print(num_records)

print(len(data))
print(list(map(lambda x: x['name'], data)))


11100
20
['Girl & The Goat', 'Au Cheval', 'The Purple Pig', 'Wildberry Pancakes and Cafe', 'Cafe Ba-Ba-Reeba!', 'Aba', 'etta - Bucktown', 'Penumbra', 'The Dearborn', 'Rose Mary', 'Boka', 'The Gage', 'Alinea', 'S.K.Y.', "Joe's Seafood, Prime Steak & Stone Crab", 'Kasama', "Bavette's Bar & Boeuf", 'KAI ZAN', "Cindy's Rooftop", 'Il Porcellino']



## Parameterization and Pagination




###All of the restaurants in Chicago on Yelp


In [None]:
def paginated_restaurant_search_requests(api_key, location, total):
    """
    Returns a list of tuples (url, headers, url_params) for paginated search of all restaurants
    Args:
        api_key (string): Your Yelp API Key for Authentication
        location (string): Business Location
        total (int): Total number of items to be fetched
    Returns:
        results (list): list of tuple (url, headers, url_params)
    """

    url, headers, url_params = location_search_params(api_key,location)
    offset = 0
    limit = 10
    res = []
    if total % limit == 0:
      num_res= int(total/limit)
    else:
      num_res= int(total//limit)+1

    for offset in range(0, num_res * limit, limit):
        url_params = {
            'location': location,
            'offset': offset,
            'limit': limit,
            'categories': 'restaurants'
        }
        res.append((url, headers, url_params))
        time.sleep(0.2)

    return  res

api_key = "test_api_key_xyz"
location = "Chicago"
all_restaurants_requests = paginated_restaurant_search_requests(api_key, location, 15)
all_restaurants_requests


[('https://api.yelp.com/v3/businesses/search',
  {'Authorization': 'Bearer test_api_key_xyz'},
  {'location': 'Chicago',
   'offset': 0,
   'limit': 10,
   'categories': 'restaurants'}),
 ('https://api.yelp.com/v3/businesses/search',
  {'Authorization': 'Bearer test_api_key_xyz'},
  {'location': 'Chicago',
   'offset': 10,
   'limit': 10,
   'categories': 'restaurants'})]

In [None]:
def all_restaurants(api_key, location):
    """
    Construct the pagination requests for ALL the restaurants on Yelp for a given location.

    Args:
        api_key (string): Your Yelp API Key for Authentication
        location (string): Business Location

    Returns:
        results (list): list of dicts representing each restaurant
    """
    url, headers, url_params = location_search_params(api_key, location,limit=10,offset=0,categories='restaurants')
    response_json = api_get_request(url, headers, url_params)
    total_items = response_json["total"]
    all_restaurants_request = paginated_restaurant_search_requests(api_key, location, total_items)

    res_data = []
    for req_url, req_headers, req_url_params in all_restaurants_request:
      response_res = api_get_request(req_url, req_headers, req_url_params)
      res_data += response_res["businesses"]
      time.sleep(0.2)

    return res_data

In [None]:
api_key = read_api_key('/content/yelp_api_key.txt')
data = all_restaurants(api_key, 'Greektown, Chicago, IL')
print(len(data))

print(list(map(lambda x:x['name'], data)))


116
['Greek Islands Restaurant', 'Girl & The Goat', 'Athena Greek Restaurant', 'Monteverde', 'Meli Cafe & Juice Bar', 'Rye Deli & Drink', 'Xi’an Cuisine', 'Green Street Smoked Meats', 'CityBird', 'Trivoli Tavern', "Formento's", 'SUSHI DOKKU Japanese Restaurant', 'El Che Steakhouse & Bar', 'Viaggio Ristorante & Lounge', 'The Allis', 'Zeus Restaurant', 'Tamashii Ramen', "Nancy's Pizza Chicago - West Loop", 'Sepia', 'Green Street Local', 'Suenos x Soho House', 'High Five Ramen', 'Primos Chicago Pizza', 'Spectrum Bar and Grill', '9 Muses', 'Booze Box', 'Omakase Yume', 'J.P. Graziano Grocery', 'Lola’s Restaurant & Bar', 'TenGoku Aburiya', 'Mr Greek Gyros', 'Ciao! Cafe & Wine Lounge', "Nando's Peri-Peri", 'Jubilee Juice & Grill', 'Taco Lulú', 'Swadesi Cafe', "Giordano's", "Lou Mitchell's", 'SGD Dubu So Gong Dong Tofu & Korean BBQ', "Philly's Best", 'Parlor Pizza Bar', "Nonna's Pizza & Sandwiches", 'Slightly Toasted', 'Stelios Bottles & Bites', 'Taco Burrito King - Greektown', 'Tamu', 'Blaze 

##Parsing the API Responses and Extract the URLs

In [None]:
data

[{'id': '_UJ3KqtHIHjExbAg-iIGUA',
  'alias': 'greek-islands-restaurant-chicago',
  'name': 'Greek Islands Restaurant',
  'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/E9eB7tElrGp8h4IhsS6JHg/o.jpg',
  'is_closed': False,
  'url': 'https://www.yelp.com/biz/greek-islands-restaurant-chicago?adjust_creative=JN4t62CTo5UGWhBoAuyd7w&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=JN4t62CTo5UGWhBoAuyd7w',
  'review_count': 2184,
  'categories': [{'alias': 'greek', 'title': 'Greek'},
   {'alias': 'mediterranean', 'title': 'Mediterranean'}],
  'rating': 4.0,
  'coordinates': {'latitude': 41.87896, 'longitude': -87.64763},
  'transactions': ['pickup', 'delivery'],
  'price': '$$',
  'location': {'address1': '200 S Halsted St',
   'address2': '',
   'address3': '',
   'city': 'Chicago',
   'zip_code': '60661',
   'country': 'US',
   'state': 'IL',
   'display_address': ['200 S Halsted St', 'Chicago, IL 60661']},
  'phone': '+13127829855',
  'display_phone': '(312) 782-

In [None]:
def parse_api_response(data):
    """
    Parse Yelp API results to extract restaurant URLs.

    Args:
        data (string): String of properly formatted JSON.

    Returns:
        (list): list of URLs as strings from the input JSON.
    """

    data = json.loads(data)
    res_url=list(map(lambda x:x['url'], data['businesses']))
    return res_url

url, headers, url_params = location_search_params(api_key, "Bridgeport, Chicago, IL", offset=0)
response_text = requests.request('GET', url, headers=headers, params=url_params).text

parse_api_response(response_text)


['https://www.yelp.com/biz/the-duck-inn-chicago?adjust_creative=JN4t62CTo5UGWhBoAuyd7w&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=JN4t62CTo5UGWhBoAuyd7w',
 'https://www.yelp.com/biz/francos-ristorante-chicago?adjust_creative=JN4t62CTo5UGWhBoAuyd7w&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=JN4t62CTo5UGWhBoAuyd7w',
 'https://www.yelp.com/biz/mins-noodle-house-%E6%B8%94%E5%AE%B6%E9%87%8D%E5%BA%86%E5%B0%8F%E9%9D%A2-chicago-32?adjust_creative=JN4t62CTo5UGWhBoAuyd7w&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=JN4t62CTo5UGWhBoAuyd7w',
 'https://www.yelp.com/biz/kimski-chicago?adjust_creative=JN4t62CTo5UGWhBoAuyd7w&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=JN4t62CTo5UGWhBoAuyd7w',
 'https://www.yelp.com/biz/stix-n-brix-pizza-chicago-2?adjust_creative=JN4t62CTo5UGWhBoAuyd7w&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=JN4t62CTo5UGWhBoAuyd7w',
 'https://www.yelp.c

## Working with Web Pages (and HTML)

In [None]:
url_lookup = {
"https://www.yelp.com/biz/the-jibarito-stop-chicago-2?start=225":"parse_page_test1.html",
"https://www.yelp.com/biz/the-jibarito-stop-chicago-2?start=245":"parse_page_test2.html"
}

def html_fetcher(url):
    """
    Return the raw HTML at the specified URL.
    Args:
        url (string):

    Returns:
        status_code (integer):
        raw_html (string): the raw HTML content of the response, properly encoded according to the HTTP headers.
    """
    html_file = url_lookup.get(url)
    with open(html_file, 'rb') as file:
        html_text = file.read()
        return 200, html_text


def parse_page(html):
    """
    Parse the reviews on a single page of a restaurant.

    Args:
        html (string): String of HTML corresponding to a Yelp restaurant

    Returns:
        tuple(list, string): a tuple of two elements
            first element: list of dictionaries corresponding to the extracted review information
            second element: URL for the next page of reviews (or None if it is the last page)
    """
    soup = BeautifulSoup(html,'html.parser')
    url_next = soup.find('link',rel='next')
    if url_next:
        url_next = url_next.get('href')
    else:
        url_next = None

    reviews = soup.find_all('div', itemprop="review")
    reviews_list = []
    for rev in reviews:
      author =rev.find('meta',itemprop='author')['content']
      rating =rev.find('meta',itemprop='ratingValue')['content']
      rating=float(rating)
      date =rev.find('meta',itemprop='datePublished')['content']
      description=rev.find('p',itemprop='description').text.strip()

      reviews_list.append({
          'author':author,
          'rating':rating,
          'date':date,
          'description':description
      })

    return reviews_list, url_next

code, html = html_fetcher("https://www.yelp.com/biz/the-jibarito-stop-chicago-2?start=225")
reviews_list, url_next = parse_page(html)
print(len(reviews_list)) # 20
print(url_next) #https://www.yelp.com/biz/the-jibarito-stop-chicago-2?start=245

20
https://www.yelp.com/biz/the-jibarito-stop-chicago-2?start=245


###Extracting  all Yelp reviews for a Single Restaurant




In [None]:
def extract_reviews(url, html_fetcher):
    """
    Retrieve ALL of the reviews for a single restaurant on Yelp.

    Parameters:
        url (string): Yelp URL corresponding to the restaurant of interest.
        html_fetcher (function): A function that takes url and returns html status code and content

    Returns:
        reviews (list): list of dictionaries containing extracted review information
    """
    reviews = []

    cond = True
    while cond:
      code, html = html_fetcher(url)
      reviews_list, url_next = parse_page(html)
      reviews += reviews_list
      if url_next:
        url =url_next
      else:
        cond =False

    return reviews

You can test your function with this code:

In [None]:
data = extract_reviews('https://www.yelp.com/biz/the-jibarito-stop-chicago-2?start=225', html_fetcher=html_fetcher)
print(len(data))
print(data[0])


35
{'author': 'Jason S.', 'rating': 5.0, 'date': '2016-05-02', 'description': "This was one of my favorite food trucks but as of last fall they've opened a brick and mortar restaurant in the Pilsen neighborhood...the perfect success story of how a person can start out with a food truck and grow their business into a restaurant. The food is always delicious and the service is great!"}
