# HTTP Requests

## The Requests Library

In [1]:
import requests

url = 'https://www.senate.gov/general/contact_information/senators_cfm.cfm'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
content = r.text
print(content)

<!--KEEP THIS APPLICATION PAGE TOGETHER WITH THE CONTACT_LIST.CFM SO 
THAT WE CAN USE APPLICATION VARIABLES--> 
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- [if lt IE 7]> <html class="ie6 oldie"> <![endif] --><!-- [if IE 7]>    <html class="ie7 oldie"> <![endif] --><!-- [if IE 8]>    <html class="ie8 oldie"> <![endif] --><!-- [if gt IE 8]> <! --><html class="">
<!-- <![endif] -->
<head>
<META http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="object" content="senators_cfm.xml">
<meta name="version" content="1023.0">
<meta name="path" content="/Company Home/Sites/senategov/documentLibrary/Senate.gov/general/contact_information">
<meta name="date" content="Tuesday, January 12, 2021">
<meta name="time" content="3:17:29 PM EST">
<meta name="keywords" content="">
<meta name="bucket" content="

### Passing Parameters

In [2]:
rIL = requests.get(url, headers=headers, params={'State': 'IL'} )
illinois = rIL.text
print(illinois)

<!--KEEP THIS APPLICATION PAGE TOGETHER WITH THE CONTACT_LIST.CFM SO 
THAT WE CAN USE APPLICATION VARIABLES--> 
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- [if lt IE 7]> <html class="ie6 oldie"> <![endif] --><!-- [if IE 7]>    <html class="ie7 oldie"> <![endif] --><!-- [if IE 8]>    <html class="ie8 oldie"> <![endif] --><!-- [if gt IE 8]> <! --><html class="">
<!-- <![endif] -->
<head>
<META http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="object" content="senators_cfm.xml">
<meta name="version" content="1023.0">
<meta name="path" content="/Company Home/Sites/senategov/documentLibrary/Senate.gov/general/contact_information">
<meta name="date" content="Tuesday, January 12, 2021">
<meta name="time" content="3:17:29 PM EST">
<meta name="keywords" content="">
<meta name="bucket" content="

## Beautiful Soup

In [3]:
from bs4 import BeautifulSoup
import re

soup = BeautifulSoup(content, 'html.parser')

#### Find all &lt;a&gt; tags

In [4]:
links = soup.find_all('a')
print(len(links))

351


#### Find all tags that begin with "t"

In [5]:
t_tags = soup.find_all(re.compile('^t'))
print(len(t_tags))

6


#### Find all &lt;img&gt; tags that have border="0" and alt="" attributes

In [6]:
imgs = soup.find_all('img', {'border': '0', 'alt': ''})
print(len(imgs))

0


#### Find all &lt;img&gt; tags that are missing the alt attribute

In [7]:
imgs_no_alt = soup.find_all('img', {'alt': False})
print(len(imgs_no_alt))

0


#### Find the first three phone numbers formatted as "(###) ###-####".

In [8]:
phones = soup.find_all(text = re.compile('\(\d{3}\) \d{3}-\d{4}'), limit=3 )
print(phones)

['(202) 224-5653', '(202) 224-6441', '(202) 224-5852']


#### Find the first three &lt;td&gt; tags that contain a phone number

In [9]:
phones_in_tds = soup.find_all('td', text = re.compile('\(\d{3}\) \d{3}-\d{4}'), limit=3)
print(phones_in_tds)

[]


#### Find all &lt;a&gt; tags that have an href attribute

In [10]:
links_with_href = soup.find_all('a', href=True)
print(len(links_with_href))

348


#### Find all &lt;a&gt; tags that have an href attribute that contains "senate.gov/"

In [11]:
internal_links = soup.find_all('a', href = re.compile('senate.gov/'))
print(len(internal_links))

128


#### Find all &lt;a&gt; tags that have an href attribute that ends with "senate.gov/"

In [12]:
senator_links = soup.find_all('a', href = re.compile('senate.gov/$'))
print(len(senator_links))

36


#### Find all &lt;a&gt; tags that have an href attribute that ends with "senate.gov/" or "senate.gov"

In [13]:
senator_links = soup.find_all('a', href = re.compile('senate.gov/?$'))
print(len(senator_links))

106


### The select() Method
Get elements based on CSS selectors

In [16]:
top_nav_links = soup.select('ul.topnav li a')
for link in top_nav_links:
    print(link.text.strip())

## Put it all Together: Output List of Senators

In [17]:
import requests
from bs4 import BeautifulSoup
import re

url = 'https://www.senate.gov/general/contact_information/senators_cfm.cfm'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
content = r.text

soup = BeautifulSoup(content, 'html.parser')

senators = soup.find_all('a', href = re.compile('senate.gov/?$'))

for i, senator in enumerate(senators, 1):
    print(i, senator.text.strip())

1 Baldwin, Tammy
2 Barrasso, John
3 Bennet, Michael F.
4 Blackburn, Marsha
5 Blumenthal, Richard
6 Blunt, Roy
7 Booker, Cory A.
8 Boozman, John
9 Braun, Mike
10 Brown, Sherrod
11 Burr, Richard
12 Cantwell, Maria
13 Capito, Shelley Moore
14 Cardin, Benjamin L.
15 Carper, Thomas R.
16 Casey, Robert P., Jr.
17 Cassidy, Bill
18 Collins, Susan M.
19 Coons, Christopher A.
20 Cornyn, John
21 Cortez Masto, Catherine
22 Cotton, Tom
23 Cramer, Kevin
24 Crapo, Mike
25 Cruz, Ted
26 Daines, Steve
27 Duckworth, Tammy
28 Durbin, Richard J.
29 Ernst, Joni
30 Feinstein, Dianne
31 Fischer, Deb
32 Gillibrand, Kirsten E.
33 Graham, Lindsey
34 Grassley, Chuck
35 Hagerty, Bill
36 www.hagerty.senate.gov
37 Harris, Kamala D.
38 Hassan, Margaret Wood
39 Hawley, Josh
40 Heinrich, Martin
41 Hickenlooper, John W.
42 www.hickenlooper.senate.gov
43 Hirono, Mazie K.
44 Hoeven, John
45 Hyde-Smith, Cindy
46 Inhofe, James M.
47 Johnson, Ron
48 Kaine, Tim
49 Kelly, Mark
50 www.kelly.senate.gov
51 Kennedy, John
52 King, 

## XML

In [18]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.senate.gov/general/contact_information/senators_cfm.xml'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
content = r.text

soup = BeautifulSoup(content, 'xml')

senators = soup.find_all('member')

for i, senator in enumerate(senators, 1):
    print(i, senator.first_name.text, senator.last_name.text)

1 Tammy Baldwin
2 John Barrasso
3 Michael F. Bennet
4 Marsha Blackburn
5 Richard Blumenthal
6 Roy Blunt
7 Cory A. Booker
8 John Boozman
9 Mike Braun
10 Sherrod Brown
11 Richard Burr
12 Maria Cantwell
13 Shelley Moore Capito
14 Benjamin L. Cardin
15 Thomas R. Carper
16 Robert P., Jr. Casey
17 Bill Cassidy
18 Susan M. Collins
19 Christopher A. Coons
20 John Cornyn
21 Catherine Cortez Masto
22 Tom Cotton
23 Kevin Cramer
24 Mike Crapo
25 Ted Cruz
26 Steve Daines
27 Tammy Duckworth
28 Richard J. Durbin
29 Joni Ernst
30 Dianne Feinstein
31 Deb Fischer
32 Kirsten E. Gillibrand
33 Lindsey Graham
34 Chuck Grassley
35 Bill Hagerty
36 Kamala D. Harris
37 Margaret Wood Hassan
38 Josh Hawley
39 Martin Heinrich
40 John W. Hickenlooper
41 Mazie K. Hirono
42 John Hoeven
43 Cindy Hyde-Smith
44 James M. Inhofe
45 Ron Johnson
46 Tim Kaine
47 Mark Kelly
48 John Kennedy
49 Angus S., Jr. King
50 Amy Klobuchar
51 James Lankford
52 Patrick J. Leahy
53 Mike Lee
54 Kelly Loeffler
55 Ben Ray LujÃ¡n
56 Cynthia M.