In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
html_doc = '''<!DOCTYPE html>
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</html>'''

In [3]:
# select by class
soup = BeautifulSoup(html_doc,'html.parser')

In [4]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


# html to select elements of the code

In [5]:
soup.title

<title>The Dormouse's story</title>

In [6]:
soup.body

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>

In [7]:
p_tags=soup.find_all('p')
p_tags

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [8]:
# find_all
a_tags=soup.find_all('a')
a_tags

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [9]:
# get_text
for p in p_tags:
    print(p.get_text())

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...


In [10]:
for a in a_tags:
    print(a.get_text())

Elsie
Lacie
Tillie


In [11]:
# get('href')
for a in a_tags:
    print(a.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [12]:
soup.title

<title>The Dormouse's story</title>

In [13]:
# parent hierarchy
soup.title.parent.string

"The Dormouse's story"

In [14]:
soup.title.parent.name

'head'

In [15]:
soup.title.parent.parent.name

'html'

In [16]:
# text.count
soup.text.count('were')

2

In [17]:
# simple website query
import re
re.findall(r'\w+', requests.get('https://www.ironhack.com/en').text).count('bootcamp')

63

# css method

In [18]:
soup

<!DOCTYPE html>

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [19]:
soup.select('.sister') # . for class name

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [20]:
soup.select('#link1') # for id

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [21]:
soup.select('a') # for class type

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [22]:
# iterate through select of a and get text
for a in soup.select('a'):
    print(a.get_text())

Elsie
Lacie
Tillie


In [23]:
# use index  to find position in results
print(soup.select('a')[0].get_text())

Elsie


In [24]:
soup.select('p.story')

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [25]:
soup.select('p.story')[0].get_text()

'Once upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.'

In [26]:
soup.select('p.story')[1].get_text()

'...'

# activity
- 1) all the 'fun facts'
- 2) names of all the places
- 3) the content (name plus fact) of ONLY cities
- 4) the names of only cities (not facts)

In [27]:
geography = '''<!DOCTYPE html>
<html>
<head> Geography</head>
<body>
<div class='city'>
  <h2>London</h2>
  <p>London is the most popular tourist destination in the world.</p>
</div>
<div class='city'>
  <h2>Paris</h2>
  <p>Paris was originally a Roman City called Lutetia.</p>
</div>
<div class='country'>
  <h2>Spain</h2>
  <p>Spain produces 43,8% of all the world’s Olive Oil.</p>
</div>
</body>
</html>'''

In [28]:
soup = BeautifulSoup(geography,'html.parser')

In [29]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  Geography
 </head>
 <body>
  <div class="city">
   <h2>
    London
   </h2>
   <p>
    London is the most popular tourist destination in the world.
   </p>
  </div>
  <div class="city">
   <h2>
    Paris
   </h2>
   <p>
    Paris was originally a Roman City called Lutetia.
   </p>
  </div>
  <div class="country">
   <h2>
    Spain
   </h2>
   <p>
    Spain produces 43,8% of all the world’s Olive Oil.
   </p>
  </div>
 </body>
</html>


# 1

In [30]:
p_tags=soup.find_all('p')
p_tags

[<p>London is the most popular tourist destination in the world.</p>,
 <p>Paris was originally a Roman City called Lutetia.</p>,
 <p>Spain produces 43,8% of all the world’s Olive Oil.</p>]

In [31]:
for p in p_tags:
    print(p.get_text())

London is the most popular tourist destination in the world.
Paris was originally a Roman City called Lutetia.
Spain produces 43,8% of all the world’s Olive Oil.


# 2

In [32]:
h2_tags=soup.find_all('h2')
h2_tags

[<h2>London</h2>, <h2>Paris</h2>, <h2>Spain</h2>]

In [33]:
for h2 in h2_tags:
    print(h2.get_text())

London
Paris
Spain


# 3

In [34]:
soup.find_all('div',{'class':'city'})

[<div class="city">
 <h2>London</h2>
 <p>London is the most popular tourist destination in the world.</p>
 </div>,
 <div class="city">
 <h2>Paris</h2>
 <p>Paris was originally a Roman City called Lutetia.</p>
 </div>]

In [35]:
for i in soup.select('div.city'):
    print(i.get_text())


London
London is the most popular tourist destination in the world.


Paris
Paris was originally a Roman City called Lutetia.



# 4

In [36]:
for i in soup.select('div.city'):
    print(i.h2.get_text())

London
Paris


# IMDB 250 scraping

In [37]:
url='https://www.imdb.com/chart/top/'

In [38]:
results=requests.get(url, headers ={'Accept-Language':'en-US'})

#locale of IP - set to english language

In [39]:
results.status_code

200

In [40]:
soup = BeautifulSoup(results.content,'html.parser')

In [None]:
print(soup.prettify())

In [None]:
# get title
soup.select('td.titleColumn')

In [43]:
soup.select('td.titleColumn a')[0].text

'The Shawshank Redemption'

In [44]:
# get directors, stars
soup.select('td.titleColumn a')[0]['title']

'Frank Darabont (dir.), Tim Robbins, Morgan Freeman'

In [45]:
# get the rank
soup.select('td.titleColumn \n')[0].text

'\n      1.\n      The Shawshank Redemption\n(1994)\n'

In [46]:
# get the year
soup.select('td.titleColumn span.secondaryInfo')[0].get_text()

'(1994)'

### For loop to collect movie title, dirs and actors, years

In [47]:
title=[]
year=[]
dir_actor=[]

len_movies=len(soup.select('td.titleColumn'))

In [48]:
len(title)

0

In [49]:
len(year)

0

In [50]:
len(dir_actor)

0

In [51]:
len_movies

250

In [52]:
for i in range(len_movies):
    title.append(soup.select('td.titleColumn a')[i].text)
    year.append(soup.select('td.titleColumn span.secondaryInfo')[i].text)
    dir_actor.append(soup.select('td.titleColumn a')[i]['title'])
    

In [53]:
from tqdm.notebook import tqdm

In [54]:
# hint : install tqdm.notebook

# Create dataframe from movies data

In [60]:
import pandas as pd
import numpy as np

In [56]:
year_cl=[yr.strip(')').strip('(') for yr in year] # list comprehension instead of for loop

In [62]:
# for loop to split out directors from actors and clean the data slightly
director=[]
star1=[]
star2=[]
for movie in dir_actor:
    split_list=movie.split(', ')
    director.append(split_list[0].replace(' (dir.)', ''))
    star1.append(split_list[1])
    star2.append(split_list[2])

In [None]:
star2

In [72]:
movies=pd.DataFrame({'Movie_title':title, 'director':director, 'actor1':star1, 'actor2':star2, 'rank':np.arange(len(movies))+1})
movies.head()

Unnamed: 0,Movie_title,director,actor1,actor2,rank
0,The Shawshank Redemption,Frank Darabont,Tim Robbins,Morgan Freeman,1
1,The Godfather,Francis Ford Coppola,Marlon Brando,Al Pacino,2
2,The Dark Knight,Christopher Nolan,Christian Bale,Heath Ledger,3
3,The Godfather: Part II,Francis Ford Coppola,Al Pacino,Robert De Niro,4
4,12 Angry Men,Sidney Lumet,Henry Fonda,Lee J. Cobb,5


# Day 2 scrape multiple pages

In [64]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tqdm.notebook import tqdm
from time import sleep
from random import randint

In [65]:
# start with website and we will use the second page

In [66]:
url= 'https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1995-01-01&user_rating=7.0,&start=51&ref_=adv_prv'

In [67]:
results=requests.get(url)
results.status_code # 200 is a status that we can proceed

200

In [68]:
soup=BeautifulSoup(results.content,'html.parser')

In [69]:
print(soup.prettify())

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   Feature Film,
Released between 1990-01-01 and 1995-01-01,
User Rating at least 7
(Sorted by Popularity Ascending) - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
  <

## Build an iterator

In [73]:
# Divide the total by the interval of items to get the number of pages

In [74]:
iterations=range(1,2283,50)

In [78]:
for i in iterations:
    start_at=str(i)
    url='https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1995-01-01&user_rating=7.0,&start='+start_at+'&ref_=adv_prv'

## Respectful scraping - use sleep and randint to separate our scraper pings

In [81]:
# example
for i in range(5):
    print(i)
    wait_time=randint(1,4)
    print('I will sleep now for ' +str(wait_time) + ' seconds')
    sleep(wait_time)

0
I will sleep now for 3 seconds
1
I will sleep now for 3 seconds
2
I will sleep now for 4 seconds
3
I will sleep now for 3 seconds
4
I will sleep now for 3 seconds


## Assemble the scraper

In [82]:
pages=[]
for i in iterations:
    start_at=str(i)
    url='https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1995-01-01&user_rating=7.0,&start='+start_at+'&ref_=adv_prv'
    response=requests.get(url)
    #just for monitoring
    print('statu='+str(response.status_code))
    pages.append(response)
    #take a short sleep
    wait_time=randint(1,4)
    print('I will sleep now for ' + str(wait_time) + ' seconds')
    sleep(wait_time)

statu=200
I will sleep now for 1 seconds
statu=200
I will sleep now for 4 seconds
statu=200
I will sleep now for 1 seconds
statu=200
I will sleep now for 4 seconds
statu=200
I will sleep now for 3 seconds
statu=200
I will sleep now for 4 seconds
statu=200
I will sleep now for 3 seconds
statu=200
I will sleep now for 2 seconds
statu=200
I will sleep now for 1 seconds
statu=200
I will sleep now for 2 seconds
statu=200
I will sleep now for 2 seconds
statu=200
I will sleep now for 4 seconds
statu=200
I will sleep now for 1 seconds
statu=200
I will sleep now for 4 seconds
statu=200
I will sleep now for 3 seconds
statu=200
I will sleep now for 2 seconds
statu=200
I will sleep now for 4 seconds
statu=200
I will sleep now for 4 seconds
statu=200
I will sleep now for 4 seconds
statu=200
I will sleep now for 1 seconds
statu=200
I will sleep now for 4 seconds
statu=200
I will sleep now for 4 seconds
statu=200
I will sleep now for 4 seconds
statu=200
I will sleep now for 4 seconds
statu=200
I will

In [83]:
# create the soup from one page
soup=BeautifulSoup(pages[0].content, 'html.parser')

In [84]:
len(pages)

46

## title, synopsis

In [85]:
#title
#main > div > div.lister.list.detail.sub-list > div > div:nth-child(1) > div.lister-item-content > h3 > a
#main > div > div.lister.list.detail.sub-list > div > div:nth-child(2) > div.lister-item-content > h3 > a

soup.select('h3 > a')

[<a href="/title/tt0111161/">Cadena perpetua</a>,
 <a href="/title/tt0110912/">Pulp Fiction</a>,
 <a href="/title/tt0107290/">Jurassic Park (Parque Jurásico)</a>,
 <a href="/title/tt0108052/">La lista de Schindler</a>,
 <a href="/title/tt0108358/">Tombstone: La leyenda de Wyatt Earp</a>,
 <a href="/title/tt0102926/">El silencio de los corderos</a>,
 <a href="/title/tt0109830/">Forrest Gump</a>,
 <a href="/title/tt0099685/">Uno de los nuestros</a>,
 <a href="/title/tt0110413/">El profesional (Léon)</a>,
 <a href="/title/tt0106677/">Movida del 76</a>,
 <a href="/title/tt0110357/">El rey león</a>,
 <a href="/title/tt0103064/">Terminator 2: El juicio final</a>,
 <a href="/title/tt0106611/">Elegidos para el triunfo</a>,
 <a href="/title/tt0103776/">Batman vuelve</a>,
 <a href="/title/tt0108399/">Amor a quemarropa</a>,
 <a href="/title/tt0105236/">Reservoir Dogs</a>,
 <a href="/title/tt0099785/">Solo en casa</a>,
 <a href="/title/tt0103874/">Drácula de Bram Stoker</a>,
 <a href="/title/tt009

In [88]:
#description
#main > div > div.lister.list.detail.sub-list > div > div:nth-child(2) > div.lister-item-content > p:nth-child(4)
#main > div > div.lister.list.detail.sub-list > div > div:nth-child(4) > div.lister-item-content > p:nth-child(4)

soup.select('p:nth-child(4)')

[<p class="text-muted">
 Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.</p>,
 <p class="text-muted">
 The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.</p>,
 <p class="text-muted">
 A pragmatic paleontologist touring an almost complete theme park on an island in Central America is tasked with protecting a couple of kids after a power failure causes the park's cloned dinosaurs to run loose.</p>,
 <p class="text-muted">
 In German-occupied Poland during World War II, industrialist <a href="/name/nm0771861">Oskar Schindler</a> gradually becomes concerned for his Jewish workforce after witnessing their persecution by the Nazis.</p>,
 <p class="text-muted">
 A successful lawman's plans to retire anonymously in Tombstone, Arizona are disrupted by the kind of outlaws he was famous for eliminating.</p>,
 <p class="text-muted">
 A 

## final iterator
- each page responses from the urls
- then parse each page
- capture the block of relevant text for each page
- then for each block of code - get the title, and synopsis
- clean as needed, make into dataframe

In [92]:
pages_parsed=[]
synopsis=[]
title=[]

for i in tqdm(range(len(pages))):
              pages_parsed.append(BeautifulSoup(pages[i].content, 'html.parser'))
              movies_html=pages_parsed[i].select('div.lister-item-content')
              for a in range(len(movies_html)):
                  title.append(movies_html[a].select('h3 > a')[0].get_text())
                  synopsis.append(movies_html[a].select('p:nth-child(4)')[0].get_text().strip())

  0%|          | 0/46 [00:00<?, ?it/s]

In [93]:
print(len(title))

2283


In [94]:
print(len(synopsis))

2283


In [95]:
title

['Cadena perpetua',
 'Pulp Fiction',
 'Jurassic Park (Parque Jurásico)',
 'La lista de Schindler',
 'Tombstone: La leyenda de Wyatt Earp',
 'El silencio de los corderos',
 'Forrest Gump',
 'Uno de los nuestros',
 'El profesional (Léon)',
 'Movida del 76',
 'El rey león',
 'Terminator 2: El juicio final',
 'Elegidos para el triunfo',
 'Batman vuelve',
 'Amor a quemarropa',
 'Reservoir Dogs',
 'Solo en casa',
 'Drácula de Bram Stoker',
 'Bailando con lobos',
 'Pretty Woman',
 'Sin perdón',
 'Instinto básico',
 'Entrevista con el vampiro',
 'The Sandlot: Historia de un verano',
 'La caza del Octubre Rojo',
 'El piano',
 'El padrino: Parte III',
 'Le llaman Bodhi',
 "Wayne's World: ¡Qué desparrame!",
 'Mujercitas',
 'Aladdín',
 'Misery',
 'Stargate: Puerta a las estrellas',
 'Ellas dan el golpe',
 'La bella y la bestia',
 'Sra. Doubtfire: Papá de por vida',
 'Desafío total',
 'Leyendas de pasión',
 'Atrapado en el tiempo',
 'Mi primo Vinny',
 'Dos tontos muy tontos',
 'El fugitivo',
 'El ú

In [96]:
synopsis

['Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
 'The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.',
 "A pragmatic paleontologist touring an almost complete theme park on an island in Central America is tasked with protecting a couple of kids after a power failure causes the park's cloned dinosaurs to run loose.",
 'In German-occupied Poland during World War II, industrialist Oskar Schindler gradually becomes concerned for his Jewish workforce after witnessing their persecution by the Nazis.',
 "A successful lawman's plans to retire anonymously in Tombstone, Arizona are disrupted by the kind of outlaws he was famous for eliminating.",
 'A young F.B.I. cadet must receive the help of an incarcerated and manipulative cannibal killer to help catch another serial killer, a madman who skins his victims.',
 'The presidencies 

In [97]:
movies=pd.DataFrame({'Movie_title':title, 'Synopsis':synopsis})
movies.head()

Unnamed: 0,Movie_title,Synopsis
0,Cadena perpetua,Two imprisoned men bond over a number of years...
1,Pulp Fiction,"The lives of two mob hitmen, a boxer, a gangst..."
2,Jurassic Park (Parque Jurásico),A pragmatic paleontologist touring an almost c...
3,La lista de Schindler,"In German-occupied Poland during World War II,..."
4,Tombstone: La leyenda de Wyatt Earp,A successful lawman's plans to retire anonymou...


# Example - wikipedia - presidents

In [99]:
url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'

In [100]:
# get the first page - then extract the names for the subpages
response=requests.get(url)
response.status_code

200

In [101]:
soup=BeautifulSoup(response.content, 'html.parser')

In [106]:
soup

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of presidents of the United States - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"847d6f45-1542-4878-b17b-53778a7e4936","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_presidents_of_the_United_States","wgTitle":"List of presidents of the United States","wgCurRevisionId":1072751244,"wgRevisionId":1072751244,"wgArticleId":19908980,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wik

In [103]:
presidentslist=soup.select('td:nth-child(3) > b > a')

In [105]:
len(presidentslist)

46

In [108]:
soup.select('td:nth-child(3) > b > a')[0]['href']

'/wiki/George_Washington'

In [107]:
# copy selector

#mw-content-text > div.mw-parser-output > table.wikitable.sortable.jquery-tablesorter > tbody > tr:nth-child(1) > td:nth-child(3) > b > a
#mw-content-text > div.mw-parser-output > table.wikitable.sortable.jquery-tablesorter > tbody > tr:nth-child(81) > td:nth-child(3) > b > a

In [115]:
presi_soup=[]
for p in presidentslist:
    url='https://en.wikipedia.org/' + p['href']
    response=requests.get(url)
    print(p.get_text(), response.status_code)
    soup=BeautifulSoup(response.content, 'html.parser')
    presi_soup.append(soup.find('table',{'class':'infobox vcard'}))
    wait_time=randint(1,3)
    print('I will sleep now for ' + str(wait_time) + ' seconds')
    sleep(wait_time)
    

George Washington 200
I will sleep now for 2 seconds
John Adams 200
I will sleep now for 2 seconds
Thomas Jefferson 200
I will sleep now for 1 seconds
James Madison 200
I will sleep now for 2 seconds
James Monroe 200
I will sleep now for 1 seconds
John Quincy Adams 200
I will sleep now for 1 seconds
Andrew Jackson 200
I will sleep now for 3 seconds
Martin Van Buren 200
I will sleep now for 1 seconds
William Henry Harrison 200
I will sleep now for 3 seconds
John Tyler 200
I will sleep now for 2 seconds
James K. Polk 200
I will sleep now for 2 seconds
Zachary Taylor 200
I will sleep now for 2 seconds
Millard Fillmore 200
I will sleep now for 3 seconds
Franklin Pierce 200
I will sleep now for 1 seconds
James Buchanan 200
I will sleep now for 3 seconds
Abraham Lincoln 200
I will sleep now for 1 seconds
Andrew Johnson 200
I will sleep now for 1 seconds
Ulysses S. Grant 200
I will sleep now for 3 seconds
Rutherford B. Hayes 200
I will sleep now for 1 seconds
James A. Garfield 200
I will slee

In [None]:
#mw-content-text > div.mw-parser-output > table.infobox.vcard

In [116]:
len(presi_soup)

46

In [117]:
presi_soup

[<table class="infobox vcard"><tbody><tr><th class="infobox-above" colspan="2" style="font-size: 100%;"><div class="fn" style="font-size:125%;">George Washington</div></th></tr><tr><td class="infobox-image" colspan="2"><a class="image" href="/wiki/File:Gilbert_Stuart_Williamstown_Portrait_of_George_Washington.jpg"><img alt="Head and shoulders portrait of George Washington" data-file-height="5615" data-file-width="4626" decoding="async" height="267" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Gilbert_Stuart_Williamstown_Portrait_of_George_Washington.jpg/220px-Gilbert_Stuart_Williamstown_Portrait_of_George_Washington.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Gilbert_Stuart_Williamstown_Portrait_of_George_Washington.jpg/330px-Gilbert_Stuart_Williamstown_Portrait_of_George_Washington.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Gilbert_Stuart_Williamstown_Portrait_of_George_Washington.jpg/440px-Gilbert_Stuart_Williamstown_Portrait_of_Geo

## get information from the table/soup of results

In [125]:
# birthday
presi_soup[45].find('span',{'class':'bday'}).get_text()

'1942-11-20'

In [127]:
# political party # 45 is Obama
presi_soup[45].find('th',string='Political party').parent.find('a').get_text()

'Democratic'

In [131]:
# no of sons/daughters
len(presi_soup[45].find('th',string='Children').parent.find_all('li'))

4

In [138]:
president_name=[]
birth_date=[]
party=[]
no_children=[]

# try and except for children

for presi in presi_soup:
    birth_date.append(presi.find('span',{'class':'bday'}).get_text())
    party.append(presi.find('th',string='Political party').parent.find('a').get_text())
    president_name.append(presi.find('div',{'class':'fn'}).get_text())
    try:
        no_children.append(len(presi.find('th',string='Children').parent.find_all('li')))
    except:
        'NA'

In [141]:
# create a dataframe

presidents=pd.DataFrame({'president': president_name, 'birthday':birth_date, 'party':party})
presidents.head()

Unnamed: 0,president,birthday,party
0,George Washington,1732-02-22,Independent
1,John Adams,1735-10-30,Pro-Administration
2,Thomas Jefferson,1743-04-13,Democratic-Republican
3,James Madison,1751-03-16,Democratic-Republican
4,James Monroe,1758-04-28,Democratic-Republican


## API - easier way to getdata

In [142]:
# eg hockey api

# hockey=requests.get('')
# print('hockey: ', heckey.status_code)

In [143]:
# github url

response=requests.get('https://api.github.com/SergiGuasch')
response.status_code

404

In [144]:
response.json()

{'message': 'Not Found', 'documentation_url': 'https://docs.github.com/rest'}

In [146]:
# iss (international space station)
response=requests.get('http://api.open-notify.org/iss-now.json')
response.status_code

200

In [148]:
response.json()

{'message': 'success',
 'iss_position': {'latitude': '-36.0807', 'longitude': '35.6378'},
 'timestamp': 1646133689}

In [149]:
coordinates = {'lat':41.4161920371042,'lon':2.190552938670964,'n':20}
response=requests.get('http://api.open-notify.org/iss-pass.json', params=coordinates)
response.status_code

200

In [150]:
response.json()

{'message': 'success',
 'request': {'altitude': 100,
  'datetime': 1646134044,
  'latitude': 41.4161920371042,
  'longitude': 2.190552938670964,
  'passes': 20},
 'response': [{'duration': 555, 'risetime': 1646180725},
  {'duration': 656, 'risetime': 1646186458},
  {'duration': 603, 'risetime': 1646192318},
  {'duration': 577, 'risetime': 1646198195},
  {'duration': 634, 'risetime': 1646204020},
  {'duration': 635, 'risetime': 1646209828},
  {'duration': 304, 'risetime': 1646215750},
  {'duration': 466, 'risetime': 1646264306},
  {'duration': 653, 'risetime': 1646269982},
  {'duration': 620, 'risetime': 1646275820},
  {'duration': 574, 'risetime': 1646281703},
  {'duration': 618, 'risetime': 1646287541},
  {'duration': 651, 'risetime': 1646293346},
  {'duration': 464, 'risetime': 1646299209},
  {'duration': 306, 'risetime': 1646347924},
  {'duration': 638, 'risetime': 1646353514},
  {'duration': 637, 'risetime': 1646359323},
  {'duration': 578, 'risetime': 1646365205},
  {'duration': 6

In [152]:
from datetime import datetime
datetime.fromtimestamp(1646376866)

datetime.datetime(2022, 3, 4, 7, 54, 26)

In [157]:
response=requests.get('http://api.agify.io?name=sergi')

In [158]:
response.content

b'{"name":"sergi","age":51,"count":8460}'

In [162]:
response=requests.get('http://api.nationalize.io?name=sergi')

In [163]:
response.content

b'{"name":"sergi","country":[{"country_id":"AD","probability":0.7599570413205197},{"country_id":"ES","probability":0.21919021775884256},{"country_id":"","probability":0.009849230461593866}]}'

In [164]:
response=requests.get('http://universities.hipolabs.com/search?country=Spain')

In [165]:
response.content

b'[{"web_pages": ["http://www.barcelonagse.eu/"], "country": "Spain", "state-province": null, "name": "Barcelona Graduate School of Economics", "domains": ["barcelonagse.eu"], "alpha_two_code": "ES"}, {"web_pages": ["http://www.bircham.edu/"], "country": "Spain", "state-province": null, "name": "Bircham International University", "domains": ["bircham.edu"], "alpha_two_code": "ES"}, {"web_pages": ["http://www.ceu.es/"], "country": "Spain", "state-province": null, "name": "Universidad de San Pablo CEU", "domains": ["ceu.es"], "alpha_two_code": "ES"}, {"web_pages": ["http://www.deusto.es/"], "country": "Spain", "state-province": null, "name": "Universidad de Deusto", "domains": ["deusto.es"], "alpha_two_code": "ES"}, {"web_pages": ["http://www.ehu.es/"], "country": "Spain", "state-province": null, "name": "Universidad del Pa\\u00eds Vasco", "domains": ["ehu.es", "ehu.eus"], "alpha_two_code": "ES"}, {"web_pages": ["http://www.esic.es/"], "country": "Spain", "state-province": null, "name": 

In [166]:
# openweathermap - optional activity

response=requests.get('https://knasmueller.net/using-the-open-weather-map-api-with-python')
response.content

b'<!DOCTYPE html>\n<html lang="en-US">\n<head>\n<meta charset="UTF-8">\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\n<meta name="viewport" content="width=device-width, initial-scale=1">\n<link rel="profile" href="https://gmpg.org/xfn/11">\n<meta name=\'robots\' content=\'index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1\' />\n\n<title>Using the Open Weather Map API with Python - Bernhard Knasm\xc3\xbcller on Software Development</title>\n<meta name="description" content="Learn how to obtain hourly 48-hour weather forecasts for free using only Python and the Open Weather Map API - no external packages needed." />\n<link rel="canonical" href="https://knasmueller.net/using-the-open-weather-map-api-with-python" />\n<meta property="og:locale" content="en_US" />\n<meta property="og:type" content="article" />\n<meta property="og:title" content="Using the Open Weather Map API with Python - Bernhard Knasm\xc3\xbcller on Software Development" />\n<meta proper