# HTTP Requests

## The Requests Library

In [1]:
import requests

url = 'https://www.senate.gov/general/contact_information/senators_cfm.cfm'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
content = r.text
print(content)

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- [if lt IE 7]> <html class="ie6 oldie"> <![endif] --><!-- [if IE 7]>    <html class="ie7 oldie"> <![endif] --><!-- [if IE 8]>    <html class="ie8 oldie"> <![endif] --><!-- [if gt IE 8]> <! --><html class="">
<!-- <![endif] -->
<head>
<META http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="object" content="senators-contact.xml">
<meta name="version" content="30.7">
<meta name="path" content="/Company Home/Sites/senategov/documentLibrary/Senate.gov/senators">
<meta name="date" content="Tuesday, February 4, 2025">
<meta name="time" content="4:25:27 PM EST">
<meta name="keywords" content="how to">
<meta name="bucket" content="senators">
<meta name="description" content="">
<title>U.S. Senate: Contacting U.S. Senators</title>
<link type="image/x-ic

### Passing Parameters

In [2]:
rIL = requests.get(url, headers=headers, params={'State': 'IL'} )
illinois = rIL.text
print(illinois)

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- [if lt IE 7]> <html class="ie6 oldie"> <![endif] --><!-- [if IE 7]>    <html class="ie7 oldie"> <![endif] --><!-- [if IE 8]>    <html class="ie8 oldie"> <![endif] --><!-- [if gt IE 8]> <! --><html class="">
<!-- <![endif] -->
<head>
<META http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="object" content="senators-contact.xml">
<meta name="version" content="30.7">
<meta name="path" content="/Company Home/Sites/senategov/documentLibrary/Senate.gov/senators">
<meta name="date" content="Tuesday, February 4, 2025">
<meta name="time" content="4:25:27 PM EST">
<meta name="keywords" content="how to">
<meta name="bucket" content="senators">
<meta name="description" content="">
<title>U.S. Senate: Contacting U.S. Senators</title>
<link type="image/x-ic

## Beautiful Soup

In [3]:
from bs4 import BeautifulSoup
import re

soup = BeautifulSoup(content, 'html.parser')

In [5]:
soup

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<!-- [if lt IE 7]> <html class="ie6 oldie"> <![endif] --><!-- [if IE 7]>    <html class="ie7 oldie"> <![endif] --><!-- [if IE 8]>    <html class="ie8 oldie"> <![endif] --><!-- [if gt IE 8]> <! --><html class="">
<!-- <![endif] -->
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="senators-contact.xml" name="object"/>
<meta content="30.7" name="version"/>
<meta content="/Company Home/Sites/senategov/documentLibrary/Senate.gov/senators" name="path"/>
<meta content="Tuesday, February 4, 2025" name="date"/>
<meta content="4:25:27 PM EST" name="time"/>
<meta content="how to" name="keywords"/>
<meta content="senators" name="bucket"/>
<meta content="" name="description"/>
<title>U.S. Senate: Contacting U.S. Senators</title>
<link href="/re

#### Find all &lt;a&gt; tags

In [6]:
links = soup.find_all('a')
print(len(links))

140


#### Find all tags that begin with "t"

In [7]:
t_tags = soup.find_all(re.compile('^t'))
print(len(t_tags))

1


#### Find all &lt;img&gt; tags that have border="0" and alt="" attributes

In [8]:
imgs = soup.find_all('img', {'border': '0', 'alt': ''})
print(len(imgs))

0


#### Find all &lt;img&gt; tags that are missing the alt attribute

In [9]:
imgs_no_alt = soup.find_all('img', {'alt': False})
print(len(imgs_no_alt))

0


#### Find the first three phone numbers formatted as "(###) ###-####".

In [10]:
phones = soup.find_all(text = re.compile('\(\d{3}\) \d{3}-\d{4}'), limit=3 )
print(phones)

['A U.S. Capitol Switchboard operator can also connect you directly with the Senate office. \n(202) 224-3121']


#### Find the first three &lt;td&gt; tags that contain a phone number

In [11]:
phones_in_tds = soup.find_all('td', text = re.compile('\(\d{3}\) \d{3}-\d{4}'), limit=3)
print(phones_in_tds)

[]


#### Find all &lt;a&gt; tags that have an href attribute

In [12]:
links_with_href = soup.find_all('a', href=True)
print(len(links_with_href))

138


#### Find all &lt;a&gt; tags that have an href attribute that contains "senate.gov/"

In [13]:
internal_links = soup.find_all('a', href = re.compile('senate.gov/'))
print(len(internal_links))

0


#### Find all &lt;a&gt; tags that have an href attribute that ends with "senate.gov/"

In [14]:
senator_links = soup.find_all('a', href = re.compile('senate.gov/$'))
print(len(senator_links))

0


#### Find all &lt;a&gt; tags that have an href attribute that ends with "senate.gov/" or "senate.gov"

In [15]:
senator_links = soup.find_all('a', href = re.compile('senate.gov/?$'))
print(len(senator_links))

0


### The select() Method
Get elements based on CSS selectors

In [16]:
top_nav_links = soup.select('ul.topnav li a')
for link in top_nav_links:
    print(link.text.strip())

## Put it all Together: Output List of Senators

In [17]:
import requests
from bs4 import BeautifulSoup
import re

url = 'https://www.senate.gov/general/contact_information/senators_cfm.cfm'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
content = r.text

soup = BeautifulSoup(content, 'html.parser')

senators = soup.find_all('a', href = re.compile('senate.gov/?$'))

for i, senator in enumerate(senators, 1):
    print(i, senator.text.strip())

In [21]:
import requests
from bs4 import BeautifulSoup
import re

url = 'https://www.senate.gov/general/contact_information/senators_cfm.cfm'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
content = r.text

soup = BeautifulSoup(content, 'html.parser')

states = soup.find_all('option', value = re.compile('/intro.htm$'))

# dont know why we have 100 states, so we are truncating
states = states[:50]
i=1
for state in states:
        state_url = 'https://www.senate.gov'+state.attrs['value']
        req = requests.get(state_url,headers=headers)
        state_content = req.text
        soup = BeautifulSoup(state_content,'html.parser')
        senators = soup.find_all('a',href = re.compile('senate.gov/?$'),target="_blank")
                        
        for senator in senators:
            person = senator.text.strip()
            if person != 'Contact' and person !="Web Form":  # Pesky row that we need to ignore
                print("\t",i, person,state.attrs)
                i += 1

	 1 Tommy Tuberville {'value': '/states/AL/intro.htm'}
	 2 Katie Boyd Britt {'value': '/states/AL/intro.htm'}
	 3 Lisa Murkowski {'value': '/states/AK/intro.htm'}
	 4 Dan Sullivan {'value': '/states/AK/intro.htm'}
	 5 Mark Kelly {'value': '/states/AZ/intro.htm'}
	 6 Ruben Gallego {'value': '/states/AZ/intro.htm'}
	 7 John Boozman {'value': '/states/AR/intro.htm'}
	 8 Tom Cotton {'value': '/states/AR/intro.htm'}
	 9 Alex Padilla {'value': '/states/CA/intro.htm'}
	 10 Adam B. Schiff {'value': '/states/CA/intro.htm'}
	 11 Michael F. Bennet {'value': '/states/CO/intro.htm'}
	 12 John W. Hickenlooper {'value': '/states/CO/intro.htm'}
	 13 Richard Blumenthal {'value': '/states/CT/intro.htm'}
	 14 Christopher Murphy {'value': '/states/CT/intro.htm'}
	 15 Christopher A. Coons {'value': '/states/DE/intro.htm'}
	 16 Lisa Blunt Rochester {'value': '/states/DE/intro.htm'}
	 17 Rick Scott {'value': '/states/FL/intro.htm'}
	 18 Ashley Moody {'value': '/states/FL/intro.htm'}
	 19 Jon Ossoff {'value':

## XML

In [22]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.senate.gov/general/contact_information/senators_cfm.xml'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
content = r.text

soup = BeautifulSoup(content, 'xml')

senators = soup.find_all('member')

for i, senator in enumerate(senators, 1):
    print(i, senator.first_name.text, senator.last_name.text)

1 Angela D. Alsobrooks
2 Tammy Baldwin
3 Jim Banks
4 John Barrasso
5 Michael F. Bennet
6 Marsha Blackburn
7 Richard Blumenthal
8 Lisa Blunt Rochester
9 Cory A. Booker
10 John Boozman
11 Katie Boyd Britt
12 Ted Budd
13 Maria Cantwell
14 Shelley Moore Capito
15 Bill Cassidy
16 Susan M. Collins
17 Christopher A. Coons
18 John Cornyn
19 Catherine Cortez Masto
20 Tom Cotton
21 Kevin Cramer
22 Mike Crapo
23 Ted Cruz
24 John R. Curtis
25 Steve Daines
26 Tammy Duckworth
27 Richard J. Durbin
28 Joni Ernst
29 John Fetterman
30 Deb Fischer
31 Ruben Gallego
32 Kirsten E. Gillibrand
33 Lindsey Graham
34 Chuck Grassley
35 Bill Hagerty
36 Margaret Wood Hassan
37 Josh Hawley
38 Martin Heinrich
39 John W. Hickenlooper
40 Mazie K. Hirono
41 John Hoeven
42 Jon Husted
43 Cindy Hyde-Smith
44 Ron Johnson
45 James C. Justice
46 Tim Kaine
47 Mark Kelly
48 John Kennedy
49 Andy Kim
50 Angus S., Jr. King
51 Amy Klobuchar
52 James Lankford
53 Mike Lee
54 Ben Ray LujÃ¡n
55 Cynthia M. Lummis
56 Edward J. Markey
57 