In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd

### Loading a page as a beautifulsoup object

In [2]:
# Loading page
r= requests.get('https://keithgalli.github.io/web-scraping/example.html')

# Convert to Beautifulsoup object
soup=bs(r.content)

print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



### Scraping using BeautifulSoup

In [3]:
# Locating specific elements
header=soup.find('h2') #returns first elememt
headers=soup.find_all('h2') #returns a list of all elements
print(headers)
# locating multiple elements
soup.find(['h1','h2'])

# Passing attributes to find/find_all calls
paragraph=soup.find_all('p',attrs={'id':'paragraph-id'})
paragraph

# Nesting find/find_all calls
body=soup.find('body')
div=body.find('div')
header=div.find('h1')
header

# Locating specific strings in find/find_all calls
print(soup.find_all('p',string=re.compile('Some')))
print(soup.find_all('h2',string=re.compile('(H|h)eader')))

[<h2>A Header</h2>, <h2>Another header</h2>]
[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<h2>A Header</h2>, <h2>Another header</h2>]


In [40]:
link='https://ke.jumia.is/unsafe/fit-in/680x680/filters:fill(white)/product/48/3623261/1.jpg?0632'
r=requests.get(link)
cont=bs(r.content)




In [6]:
soup.find_all(['h1','h2'])

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

### Selectors (CSS)

In [13]:
# Locating elements using select
content=soup.select('div p')
print(content)
paragraph=soup.select('h2~p')
print(paragraph)
bold_text=soup.select('p#paragraph-id b')
print(bold_text)
paragraph1=soup.select('body>p')
print(paragraph1)

# Nested slect calls
for paragraph in paragraph1:
    print(paragraph.select('i'))

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]
[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<b>Some bold text</b>]
[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


### Getting properties of HTML

In [14]:
# strings
header=soup.find('h2')
header.string
# if multiple child elements use get_text
div=soup.find('div')
print(div.get_text())

# Specific property
link=soup.find('a')
print(link['href'])

paragraph=soup.select('p#paragraph-id')
print(paragraph[0]['id'])

# Path syntax
print(soup.body.div.h1.string)


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html

https://keithgalli.github.io/web-scraping/webpage.html
paragraph-id
HTML Webpage


### Examples

In [16]:
s=requests.get('https://keithgalli.github.io/web-scraping/webpage.html')
webpage=bs(s.content)

print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

In [17]:
# Getting social media links
links0=webpage.select('ul.socials a')
actual_links0=[link['href'] for link in links0]
actual_links0

ulist=webpage.find('ul',attrs={'class':'socials'})
links1=ulist.find_all('a')
actual_links1=[link['href'] for link in links1]
actual_links1

links2=webpage.select('li.social a')
actual_links2=[link['href'] for link in links2]
actual_links2


['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [22]:
#Scraping a table
table=webpage.select('table.hockey-stats')[0]
cols=table.find('thead').find_all('th')
col_names=[c.string for c in cols]

rows= table.find('tbody').find_all('tr')
l = []
for tr in rows:
    td = tr.find_all('td')
    row = [str(tr.text).strip() for tr in td]
    l.append(row)
df=pd.DataFrame(l, columns=col_names)
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [23]:
# Grabing all fun facts that have the word 'is'
fun_fact=webpage.select('ul.fun-facts li')
fun_fact_with_is=[fact.find_all(string=re.compile('is')) for fact in fun_fact]
fun_fact_with_is=[fact for fact in fun_fact_with_is if fact]
fun_fact_with_is

[['Middle name is Ronald'],
 ['Dunkin Donuts coffee is better than Starbucks'],
 ['A favorite book series of mine is '],
 ['Current video game of choice is '],
 ["The band that I've seen the most times live is the "]]

In [34]:
print('niaje \n'.replace('\n','').strip(''))

niaje 


In [26]:
# Downloading images
url='https://keithgalli.github.io/web-scraping/'
img=webpage.select('div.column img')
url1=img[0]['src']
img_url=url+url1

img_data = requests.get(img_url).content
with open('lake como.jpg', 'wb') as handler:
    handler.write(img_data)

In [27]:
# Solvin mistery challenge
files=webpage.select('div.block a')
relative_files=[f['href'] for f in files]
for i in relative_files:
    t=requests.get(url+i)
    contents=bs(t.content)
    print(contents.select('p#secret-word').string())


AttributeError: ResultSet object has no attribute 'string'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?