# HTTP Requests

## The Requests Library

In [2]:
import requests

url = 'https://www.senate.gov/general/contact_information/senators_cfm.cfm'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
content = r.text
print(content)

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- [if lt IE 7]> <html class="ie6 oldie"> <![endif] --><!-- [if IE 7]>    <html class="ie7 oldie"> <![endif] --><!-- [if IE 8]>    <html class="ie8 oldie"> <![endif] --><!-- [if gt IE 8]> <! --><html class="">
<!-- <![endif] -->
<head>
<META http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="object" content="senators-contact.xml">
<meta name="version" content="30.7">
<meta name="path" content="/Company Home/Sites/senategov/documentLibrary/Senate.gov/senators">
<meta name="date" content="Tuesday, February 4, 2025">
<meta name="time" content="4:25:27 PM EST">
<meta name="keywords" content="how to">
<meta name="bucket" content="senators">
<meta name="description" content="">
<title>U.S. Senate: Contacting U.S. Senators</title>
<link type="image/x-ic

### Passing Parameters

In [3]:
rIL = requests.get(url, headers=headers, params={'State': 'IL'} )
illinois = rIL.text
print(illinois)

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- [if lt IE 7]> <html class="ie6 oldie"> <![endif] --><!-- [if IE 7]>    <html class="ie7 oldie"> <![endif] --><!-- [if IE 8]>    <html class="ie8 oldie"> <![endif] --><!-- [if gt IE 8]> <! --><html class="">
<!-- <![endif] -->
<head>
<META http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="object" content="senators-contact.xml">
<meta name="version" content="30.7">
<meta name="path" content="/Company Home/Sites/senategov/documentLibrary/Senate.gov/senators">
<meta name="date" content="Tuesday, February 4, 2025">
<meta name="time" content="4:25:27 PM EST">
<meta name="keywords" content="how to">
<meta name="bucket" content="senators">
<meta name="description" content="">
<title>U.S. Senate: Contacting U.S. Senators</title>
<link type="image/x-ic

## Beautiful Soup

In [4]:
from bs4 import BeautifulSoup
import re

soup = BeautifulSoup(content, 'html.parser')

#### Find all &lt;a&gt; tags

In [5]:
links = soup.find_all('a')
print(len(links))

140


#### Find all tags that begin with "t"

In [6]:
t_tags = soup.find_all(re.compile('^t'))
print(len(t_tags))

1


#### Find all &lt;img&gt; tags that have border="0" and alt="" attributes

In [7]:
imgs = soup.find_all('img', {'border': '0', 'alt': ''})
print(len(imgs))

0


#### Find all &lt;img&gt; tags that are missing the alt attribute

In [8]:
imgs_no_alt = soup.find_all('img', {'alt': False})
print(len(imgs_no_alt))

0


#### Find the first three phone numbers formatted as "(###) ###-####".

In [9]:
phones = soup.find_all(text = re.compile('\(\d{3}\) \d{3}-\d{4}'), limit=3 )
print(phones)

['A U.S. Capitol Switchboard operator can also connect you directly with the Senate office. \n(202) 224-3121']


#### Find the first three &lt;td&gt; tags that contain a phone number

In [10]:
phones_in_tds = soup.find_all('td', text = re.compile('\(\d{3}\) \d{3}-\d{4}'), limit=3)
print(phones_in_tds)

[]


#### Find all &lt;a&gt; tags that have an href attribute

In [11]:
links_with_href = soup.find_all('a', href=True)
print(len(links_with_href))

138


#### Find all &lt;a&gt; tags that have an href attribute that contains "senate.gov/"

In [12]:
internal_links = soup.find_all('a', href = re.compile('senate.gov/'))
print(len(internal_links))

0


#### Find all &lt;a&gt; tags that have an href attribute that ends with "senate.gov/"

In [13]:
senator_links = soup.find_all('a', href = re.compile('senate.gov/$'))
print(len(senator_links))

0


#### Find all &lt;a&gt; tags that have an href attribute that ends with "senate.gov/" or "senate.gov"

In [14]:
senator_links = soup.find_all('a', href = re.compile('senate.gov/?$'))
print(len(senator_links))

0


### The select() Method
Get elements based on CSS selectors

In [15]:
top_nav_links = soup.select('ul.topnav li a')
for link in top_nav_links:
    print(link.text.strip())

## Put it all Together: Output List of Senators

In [16]:
import requests
from bs4 import BeautifulSoup
import re

url = 'https://www.senate.gov/general/contact_information/senators_cfm.cfm'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
content = r.text

soup = BeautifulSoup(content, 'html.parser')

senators = soup.find_all('a', href = re.compile('senate.gov/?$'))

for i, senator in enumerate(senators, 1):
    print(i, senator.text.strip())

In [17]:
import requests
from bs4 import BeautifulSoup
import re

url = 'https://www.senate.gov/general/contact_information/senators_cfm.cfm'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
content = r.text

soup = BeautifulSoup(content, 'html.parser')

states = soup.find_all('option', value = re.compile('/intro.htm$'))

# dont know why we have 100 states, so we are truncating
states = states[:50]
i=1
for state in states:
        state_url = 'https://www.senate.gov'+state.attrs['value']
        req = requests.get(state_url,headers=headers)
        state_content = req.text
        soup = BeautifulSoup(state_content,'html.parser')
        senators = soup.find_all('a',href = re.compile('senate.gov/?$'),target="_blank")
                        
        for senator in senators:
            person = senator.text.strip()
            if person != 'Contact':  # Pesky row that we need to ignore
                print("\t",i, person)
                i += 1

	 1 Tommy Tuberville
	 2 Katie Boyd Britt
	 3 Lisa Murkowski
	 4 Dan Sullivan
	 5 Kyrsten Sinema
	 6 Mark Kelly
	 7 John Boozman
	 8 Tom Cotton
	 9 Dianne Feinstein
	 10 Alex Padilla
	 11 Michael F. Bennet
	 12 John W. Hickenlooper
	 13 Richard Blumenthal
	 14 Christopher Murphy
	 15 Thomas R. Carper
	 16 Christopher A. Coons
	 17 Marco Rubio
	 18 Rick Scott
	 19 Jon Ossoff
	 20 Raphael G. Warnock
	 21 Brian Schatz
	 22 Mazie K. Hirono
	 23 Mike Crapo
	 24 James E. Risch
	 25 Richard J. Durbin
	 26 Tammy Duckworth
	 27 Todd Young
	 28 Mike Braun
	 29 Chuck Grassley
	 30 Joni Ernst
	 31 Jerry Moran
	 32 Roger Marshall
	 33 Mitch McConnell
	 34 Rand Paul
	 35 Bill Cassidy
	 36 John Kennedy
	 37 Susan M. Collins
	 38 Angus S. King,  Jr.
	 39 Benjamin L. Cardin
	 40 Chris Van Hollen
	 41 Elizabeth Warren
	 42 Edward J. Markey
	 43 Debbie Stabenow
	 44 Gary C. Peters
	 45 Amy Klobuchar
	 46 Tina Smith
	 47 Roger F. Wicker
	 48 Cindy Hyde-Smith
	 49 Josh Hawley
	 50 Eric Schmitt
	 51 Jon Tes

## XML

In [18]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.senate.gov/general/contact_information/senators_cfm.xml'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
content = r.text

soup = BeautifulSoup(content, 'xml')

senators = soup.find_all('member')

for i, senator in enumerate(senators, 1):
    print(i, senator.first_name.text, senator.last_name.text)

1 Tammy Baldwin
2 John Barrasso
3 Michael F. Bennet
4 Marsha Blackburn
5 Richard Blumenthal
6 Cory A. Booker
7 John Boozman
8 Mike Braun
9 Katie Boyd Britt
10 Sherrod Brown
11 Ted Budd
12 Maria Cantwell
13 Shelley Moore Capito
14 Benjamin L. Cardin
15 Thomas R. Carper
16 Robert P., Jr. Casey
17 Bill Cassidy
18 Susan M. Collins
19 Christopher A. Coons
20 John Cornyn
21 Catherine Cortez Masto
22 Tom Cotton
23 Kevin Cramer
24 Mike Crapo
25 Ted Cruz
26 Steve Daines
27 Tammy Duckworth
28 Richard J. Durbin
29 Joni Ernst
30 Dianne Feinstein
31 John Fetterman
32 Deb Fischer
33 Kirsten E. Gillibrand
34 Lindsey Graham
35 Chuck Grassley
36 Bill Hagerty
37 Margaret Wood Hassan
38 Josh Hawley
39 Martin Heinrich
40 John W. Hickenlooper
41 Mazie K. Hirono
42 John Hoeven
43 Cindy Hyde-Smith
44 Ron Johnson
45 Tim Kaine
46 Mark Kelly
47 John Kennedy
48 Angus S., Jr. King
49 Amy Klobuchar
50 James Lankford
51 Mike Lee
52 Ben Ray LujÃ¡n
53 Cynthia M. Lummis
54 Joe, III Manchin
55 Edward J. Markey
56 Roger