# Web scrapping intro

In [1]:
# import necessary libraries
import requests # pip install if not already
from bs4 import BeautifulSoup as bs # pip install BeautifulSoup if not already installed


In [2]:
# Load our first page

In [3]:
url = 'https://keithgalli.github.io/web-scraping/example.html'
r = requests.get(url)

#what does r look like?
r, type(r)

(<Response [200]>, requests.models.Response)

In [4]:
str(r.content)

'b\'<html>\\n<head>\\n<title>HTML Example</title>\\n</head>\\n<body>\\n\\n<div align="middle">\\n<h1>HTML Webpage</h1>\\n<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>\\n</div>\\n\\n<h2>A Header</h2>\\n<p><i>Some italicized text</i></p>\\n\\n<h2>Another header</h2>\\n<p id="paragraph-id"><b>Some bold text</b></p>\\n\\n</body>\\n</html>\\n\''

In [5]:
# convert r to a beautiful soup object
soup = bs(r.content)
soup

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>

In [None]:
print(soup.prettify())

In [None]:
# start scrapping!
# start with find and find_all

soup.find('h2'), soup.find_all('h2')

In [None]:
# storing as variables
first_header=soup.find('h2')
all_headers=soup.find_all('h2')
first_header, all_headers

.find will only find the first occurence of a tag \
.find_all will find all occurences of the taga and place each item in a list

In [None]:
# pass in a list of elements to look for
first_header=soup.find(['h1','h2'])
first_header

order does not matter when you pass a list into .find it will still return the first tag in the list it finds.

In [None]:
headers=soup.find_all(['h1','h2'])
headers

In [None]:
# you can pass in attributes to the find/find functions.
paragraph=soup.find_all("p")
paragraph

In [None]:
# say we want to find the paragraph that has the attribute id='paragraph-id', use the parameter attr={}
# to find this attribute with the paragraph tag.

soup.find_all("p", attrs={"id": "paragraph-id"} )

In [None]:
# you can nest find and find all calls.
body = soup.find('body')
body

now lets say you want to find a 'div' tag, you can nest this with the body object just created

In [None]:
div =body.find('div')
div

now lets do the same process for the header

In [None]:
header=div.find('h1')
header

In [None]:
# we can search for specific strings in find_all calls.
# say we wanted to find paragraphs that contained the word 'some'

print(body.prettify())

In [None]:
some_paragraphs=body.find_all('p', string='Some')
some_paragraphs

oh no! oh no! Oh no no no no!

In [None]:
# it doesn't quite work...what if we put in 'Some bold text'??
some_paragraphs=body.find_all('p', string='Some bold text')
some_paragraphs

In [None]:
# this situation is not ideal.
# let's leverage our friend regex :)

import re

some_paragraphs=body.find_all('p', string=re.compile('Some'))
some_paragraphs

In [None]:
# another example find all headers that have the word "header" in them

headers=body.find_all('h2', string=re.compile('header'))
headers

only finds one result because regex is looking for 'header'. we can change it by giving it a different pattern to capture the capital "H", string=re.compile('(H|h)eader')

In [None]:
headers=body.find_all('h2', string=re.compile('(H|h)eader'))
headers

### select (CSS selector)

In [None]:
print(soup.body.prettify())

In [None]:
soup.select('p')

looks the same as find_all. Let's try finding all paragraph tags inside of div.

In [None]:
soup.select('div p')

In [None]:
# select all the paragraphs that are also preceeded by a header.
soup.select('h2 ~ p')


In [None]:
# let's do some more of this. It's useful to grab elements with specific id's

In [None]:
soup.select('p#paragraph-id b')

In [None]:
paragraphs=soup.select('body > p')
paragraphs

In [None]:
# we can loop through and make select calls with these objects that we just made. Let's take paragraphs for example
#first look at the type(paragraphs)

type(paragraphs)

In [None]:
# it's a list so we can loop. Let's inspect the elements in the list for their types
type(paragraphs[0])

They are beautiful soup objects so we can use the select and find_all calls. Let's do an example of finding the elements with the id tag.

In [None]:
paragraphs_i=[]
[paragraphs_i.append(x.select('i')) for x in paragraphs]
paragraphs_i

In [None]:
# Grab an element with a specific property
soup.select("[align='middle']")

# Get different properties of the HTML

In [None]:
# want to grab only the string from an element, use the .string property
header = soup.find('h2')
header, header.string

In [None]:
# let's try with div.
div = soup.find('div')
print(div.prettify())
print(div.string)

We got an answer of None. This is because there are children elements inside of the div, so the .string property is not sure which text to grab.

In [None]:
# Let's try with .get_text()
print(div.get_text())
div.get_text()

In [None]:
# Get a specific property from an element.
# Let's get the href link tag
link = soup.find('a')
link

In [None]:
link['href']

In [None]:
# try to get the paragraph id tag.
paragraphs = soup.select('p#paragraph-id')
paragraphs

In [None]:
type(paragraphs)

In [None]:
paragraphs[0], type(paragraphs[0])

In [None]:
paragraphs[0]['id']

# Code Navigation

In [None]:
soup

In [None]:
soup.body

In [None]:
soup.div

In [None]:
soup.h1

In [None]:
soup.h1.string

In [None]:
# know the terms parent, sibling, child
# start with a pretty print of the body.

print(soup.body.prettify())

The body has a nested structure. The body is the parent. Everything nested inside the body is the children. If elements are on the same level they are considered siblings like `<div>` and `<h2>`.

In [None]:
# start with looking at div
div=soup.find('div')
# looks like there are 4 elements that are siblings of the div

In [None]:
div.find_next_siblings()

In [None]:
len(div.find_next_siblings())

In [22]:
# Grab all the social link from the webpage.
# webpage: https://keithgalli.github.io/web-scraping/webpage.html

url = 'https://keithgalli.github.io/web-scraping/webpage.html'
request=requests.get(url)
request

<Response [200]>

In [7]:
# look at the content
request.content

b'<head>\n  <title>Keith Galli\'s Page</title>\n  <style>\n  table {\n    border-collapse: collapse;\n  }\n  th {\n    padding:5px;\n  }\n  td {\n    border: 1px solid #ddd;\n    padding: 5px;\n  }\n  tr:nth-child(even) {\n    background-color: #f2f2f2;\n  }\n  th {\n    padding-top: 12px;\n    padding-bottom: 12px;\n    text-align: left;\n    background-color: #add8e6;\n    color: black;\n  }\n  .block {\n  width: 100px;\n  /*float: left;*/\n    display: inline-block;\n    zoom: 1;\n  }\n  .column {\n  float: left;\n  height: 200px;\n  /*width: 33.33%;*/\n  padding: 5px;\n  }\n\n  .row::after {\n    content: "";\n    clear: both;\n    display: table;\n  }\n</style>\n</head>\n<body>\n  <h1>Welcome to my page!</h1>\n  <img src="./images/selfie1.jpg" width="300px">\n  <h2>About me</h2>\n  <p>Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!</p>\n  <p>Here is a link to my channel: <a href="https://www.youtube.com/kg

In [8]:
# create a variable 'webpage' that takes request.content and makes it a beautiful soup object.
webpage=bs(request.content)


In [9]:
print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

# starting on my own.

In [10]:
socials = webpage.find('ul', attrs={'class': 'socials'})
socials

<ul class="socials">
<li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
<li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
<li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
<li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>
</ul>

In [11]:
type(socials)

bs4.element.Tag

In [12]:
social_str = str(socials)
social_str

'<ul class="socials">\n<li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>\n<li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>\n<li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>\n<li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>\n</ul>'

In [13]:
social_str.split('\n')

['<ul class="socials">',
 '<li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>',
 '<li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>',
 '<li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>',
 '<li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>',
 '</ul>']

In [14]:
# another way - find_all
socials = webpage.find_all('ul', attrs={'class': 'socials'})
socials

[<ul class="socials">
 <li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
 <li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
 <li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
 <li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>
 </ul>]

In [15]:
# using select
links = webpage.select('ul > li > a[href*="keith"]')

In [16]:
links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [17]:
links[0]['href']

'https://www.instagram.com/keithgalli/'

In [18]:
links[1]['href']

'https://twitter.com/keithgalli'

In [29]:
actual_links = [x['href'] for x in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [30]:
[requests.get(x) for x in actual_links]

[<Response [429]>, <Response [400]>, <Response [999]>, <Response [200]>]

# following video solutions

In [None]:
# what happens when I select all of the a elements on the page?
webpage.select('a')

This gave us too much... \
On the page all of the socials are stored in an unordered list (ul) with class="socials"

In [None]:
# NOTE in select '#' goes with id names, '.' goes with class names
links=webpage.select('ul.socials')
links

In [None]:
# now we just want the a elements with the href in them.
links=webpage.select('ul.socials a')
links

In [None]:
# we hant to get only the links - https://..... 
#since the variable 'links' is a list we can do a list comprehension to grab these.

[link['href'] for link in links]


In [None]:
# another way
# this time lets use find
# a starting point. let's see the first '<a>' tag

links=webpage.find('a')
links

this is not what we are looking for :(

In [None]:
# let's try doing something similar to our first approach.
links=webpage.find('ul', attrs={'class': 'socials'})
links

This is the point that I got to when I worked on my own! \
Let's do a step further and grab the actual links.

In [None]:
# from the 'links' object grab all the 'a' tags
a_tags = links.find_all('a')
a_tags

In [None]:
# similar to befor we can now do a list comprehension to grab all the href=''
[link['href'] for link in a_tags]

In [None]:
# one more way
links = webpage.select('li.social a')
links

In [None]:
# and lets loop through one more time
actual_links = [link['href'] for link in links]
actual_links