## Load necessary libraries

In [2]:
import requests # pip install requests
from bs4 import BeautifulSoup as bs # pip install beautifulsoup4

## Load our first page

In [4]:
# Load the web page and stop right away on an HTTP error, so an error page
# is never parsed as if it were the example document.
r = requests.get('https://keithgalli.github.io/web-scraping/example.html')
r.raise_for_status()
# Convert to a BeautifulSoup object. The parser is named explicitly so the
# result does not depend on which optional parsers happen to be installed.
soup = bs(r.content, 'html.parser')
# Print out our HTML, indented for readability
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Start scraping by using Beautiful Soup Library

### find and find_all

In [5]:
# find() returns only the first matching tag -- here, the first <h2>.
first_header = soup.find("h2")
first_header

<h2>A Header</h2>

In [9]:
# find_all() gathers every <h2> tag in the document into a list-like result.
headers = soup.find_all("h2")
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [10]:
# A list of tag names is accepted as well; find() still returns just the
# first tag in the document that matches any of the names.
first_header = soup.find(["h1", "h2"])
first_header

<h1>HTML Webpage</h1>

In [12]:
# With find_all(), every tag matching any name in the list is returned.
headers = soup.find_all(["h1", "h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [23]:
# You can pass attributes to find/find_all as a dict of name -> value.
# Here: every <p> whose id attribute is "paragraph-id".
paragraph = soup.find_all('p', attrs={"id": "paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [24]:
# find/find_all calls can be nested: search inside a tag you already found.
body = soup.find("body")
div = body.find("div")
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [25]:
# Continue the nested search: the first <h1> inside the <div> held in `div`.
header = div.find("h1")
header

<h1>HTML Webpage</h1>

In [34]:
# find/find_all can also filter on a tag's text.
# A plain string has to match the text exactly.
paragraphs = soup.find_all("p", string="Some bold text")
print(paragraphs)

[<p id="paragraph-id"><b>Some bold text</b></p>]


In [35]:
# A regular expression is more flexible than an exact string: it matches any
# text that contains the pattern (capitalisation still has to agree).
import re

paragraphs = soup.find_all("p", string=re.compile("Some"))
print(paragraphs)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [37]:
# Because the filter is a regex, (H|h) accepts either capitalisation of "header".
headers = soup.find_all("h2", string=re.compile("(H|h)eader"))
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]


## Select (CSS selector)

In [39]:
# select() takes a CSS selector; a bare tag name behaves much like find_all().
content = soup.select("p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [40]:
# Descendant selector: every <p> nested inside a <div>.
content1 = soup.select("div p")
content1

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [41]:
# "~" is the sibling combinator: <p> tags that come after an <h2> and share
# its parent (they do not have to follow it immediately).
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [48]:
# "p#paragraph-id b": <b> tags inside the <p> whose CSS id is "paragraph-id".
bold_text = soup.select("p#paragraph-id b")
print(bold_text[0])

<b>Some bold text</b>


In [50]:
# ">" selects direct children only: <p> tags sitting immediately under <body>.
paragraphs = soup.select("body > p")
print(paragraphs)

# select() also works on an individual tag, searching only inside it.
for paragraph in paragraphs:
    print(paragraph.select("i"))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [51]:
# Attribute selector: any element whose align attribute equals "middle".
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

## Get different properties of the HTML

In [54]:
# .string gives the text held inside the header tag.
header = soup.find("h2")
header.string

'A Header'

In [62]:
div = soup.find("div")
print(div.prettify())
# .string is not usable on a larger tag with several children;
# get_text() collects the text of everything nested inside it instead.
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



## Get links 

In [68]:
# A tag's attributes are read with dictionary-style indexing.
link = soup.find("a")
print(link["href"])

https://keithgalli.github.io/web-scraping/webpage.html


In [69]:
# The same indexing works on tags returned by select(): read the id attribute
# of the first match.
paragraphs = soup.select("p#paragraph-id")
paragraphs[0]["id"]

'paragraph-id'

### Code navigation

In [73]:
# Path syntax: each dotted name steps down to the first tag of that name
# beneath the previous one; .string then reads the <h1> text.
soup.body.div.h1.string

'HTML Webpage'

In [77]:
# Terms to know: parent, sibling, child.
# find_next_siblings() returns all following sibling tags;
# find_next_sibling() returns only the first of them.
# The same lookup can be spelled soup.body.find('div').find_next_siblings().
soup.body.div.find_next_siblings()


[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

## Exercises!

Go to https://keithgalli.github.io/web-scraping/webpage.html

## Load the webpage

In [81]:
# Download the exercise page and stop on an HTTP error, so an error page is
# never parsed as if it were the real document.
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html')
r.raise_for_status()
# Name the parser explicitly so the parsed tree does not depend on which
# optional parsers are installed on the machine running the notebook.
webpage = bs(r.content, 'html.parser')
print(webpage.prettify())

<head>
 <title>
  Keith Galli's Page
 </title>
 <style>
  table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
 </style>
</head>
<body>
 <h1>
  Welcome to my page!
 </h1>
 <img src="./images/selfie1.jpg" width="300px"/>
 <h2>
  About me
 </h2>
 <p>
  Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
 </p>
 <p>
  Here is a link to my channel:
  <a href="https://www.youtube.com/kgmit">
   youtube.com/kgmit
  </

## Grab all of the social links from webpage

Do this in at least 3 different ways


In [89]:
# A plain string given as `attrs` is matched against the CSS class, so this
# finds the <ul class="socials"> list.
sociallinks = webpage.find("ul", attrs="socials")
sociallinks

<ul class="socials">
<li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
<li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
<li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
<li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>
</ul>

In [93]:
# Inside the socials list, pick the <li> whose class attribute is
# "social instagram", then print the href of the link it contains.
instagram = sociallinks.find('li', attrs='social instagram')
instagram_link = instagram.find('a')
print(instagram_link['href'])

https://www.instagram.com/keithgalli/


In [96]:
# Same lookup for the Twitter entry: find its <li>, then print the link's href.
# The variables are named after the network they actually hold.
twitter = sociallinks.find('li', attrs='social twitter')
twitter_link = twitter.find('a')
print(twitter_link['href'])

https://twitter.com/keithgalli


In [98]:
# Same lookup for the TikTok entry: find its <li>, then print the link's href.
tiktok = sociallinks.find("li", attrs="social tiktok")
tiklink = tiktok.find("a")
print(tiklink["href"])

https://www.tiktok.com/@keithgalli


### Keith Galli's Answer

In [102]:
# Way 1: a single CSS selector -- every <a> inside <ul class="socials">.
links = webpage.select("ul.socials a")
actual_links = [anchor["href"] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [111]:
# Way 2: find the socials <ul> first, then collect every <a> inside it.
ulist = webpage.find("ul", attrs="socials")
links = ulist.find_all("a")
actual_links = [anchor["href"] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [113]:
# Way 3: select the <a> inside every <li> carrying the "social" class.
links = webpage.select("li.social a")
actual_links = [anchor["href"] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Scrape the table

In [115]:
# Collect the href of every <a> found inside a <span class="txt-blue">.
links = webpage.select("span.txt-blue a")
actual_links = [anchor["href"] for anchor in links]
print(actual_links)

['https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats', 'https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2015-2016?tab=stats', 'https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2016-2017?tab=stats', 'https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2018-2019?tab=stats']


In [None]:
### Getting a table from the page into pandas (general pattern)
# Collect the cell strings of every body row into a list of rows, then build
# the DataFrame once from that list. Everything the loop needs is defined in
# this cell so that it also runs on a fresh kernel.
import pandas as pd

table_rows = webpage.select('table.hockey-stats')[0].find('tbody').find_all('tr')
l = []
for tr in table_rows:
    td = tr.find_all('td')
    # NOTE: .string is None for a cell that holds more than a single piece of
    # text; use get_text() when the cells contain nested tags.
    row = [cell.string for cell in td]
    l.append(row)
# No column labels are passed, so pandas numbers the columns 0, 1, 2, ...
pd.DataFrame(l)
    

In [124]:
import pandas as pd

# The first table carrying the "hockey-stats" class.
table = webpage.select("table.hockey-stats")[0]

# Header cells supply the column labels.
columns = table.find("thead").find_all("th")
column_names = [c.string for c in columns]

# Each body row becomes one list of whitespace-stripped cell texts.
table_rows = table.find("tbody").find_all("tr")
l = []
for tr in table_rows:
    td = tr.find_all("td")
    row = [str(cell.get_text()).strip() for cell in td]
    l.append(row)

df = pd.DataFrame(l, columns=column_names)
df.head()

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


## Grab all fun facts that use the word "is"

In [151]:
# \b anchors the match to word boundaries, so only the standalone word "is"
# counts -- not the letters "is" inside a longer word such as "this".
word_is = re.compile(r'\bis\b')

facts = webpage.select('ul.fun-facts li')
# Keep the full text of each <li> that has at least one text node containing
# the word. Taking get_text() from the <li> itself returns the whole fact even
# when the match sits inside a nested tag.
facts_with_is = [fact.get_text() for fact in facts if fact.find(string=word_is)]
print(facts_with_is)

['Middle name is Ronald', 'Dunkin Donuts coffee is better than Starbucks', "A favorite book series of mine is Ender's Game", 'Current video game of choice is Rocket League', "The band that I've seen the most times live is the Zac Brown Band"]


## Download an image

In [160]:
# Base address of the site; relative paths taken from the page are appended to it.
url = "https://keithgalli.github.io/web-scraping/"
# Every <img> inside a div.column that is itself inside a div.row.
imagelocation = webpage.select("div.row div.column img")
imagelocation

[<img alt="Lake Como" src="images/italy/lake_como.jpg" style="height:100%"/>,
 <img alt="Pontevecchio, Florence" src="images/italy/pontevecchio.jpg" style="height:100%"/>,
 <img alt="Riomaggiore, Cinque de Terre" src="images/italy/riomaggiore.jpg" style="height:100%"/>]

In [161]:
# Relative path of the first image, read from its src attribute.
image = imagelocation[0]["src"]
image

'images/italy/lake_como.jpg'

In [162]:
# Join the site base address and the image's relative path into one URL.
full_url = f"{url}{image}"

In [163]:
# Fetch the image and save it to disk.
response = requests.get(full_url)
response.raise_for_status()  # never write an HTTP error page into the .jpg
img_data = response.content
# 'wb': the image is binary data, so the file is opened in binary mode.
with open('image_lake_como.jpg', 'wb') as handler:
    handler.write(img_data)

### Solve the mystery challenge

In [165]:
# Collect the relative href of every link inside a div.block element.
files = webpage.select("div.block a")
file = [anchor["href"] for anchor in files]
file

['challenge/file_1.html',
 'challenge/file_2.html',
 'challenge/file_3.html',
 'challenge/file_4.html',
 'challenge/file_5.html',
 'challenge/file_6.html',
 'challenge/file_7.html',
 'challenge/file_8.html',
 'challenge/file_9.html',
 'challenge/file_10.html']

In [172]:
# Visit each challenge page and print the word stored in <p id="secret-word">.
for f in file:
    full_url = url + f
    page = requests.get(full_url)  # load page
    page.raise_for_status()  # stop on an HTTP error instead of parsing an error page
    # Name the parser explicitly so every machine parses the page the same way.
    bs_page = bs(page.content, 'html.parser')
    secret = bs_page.find('p', attrs={"id": "secret-word"})
    # find() returns None when the tag is missing; report that for the page
    # instead of failing on `.string`.
    print(secret.string if secret is not None else f"(no secret word found in {f})")

Make
sure
to
smash
that
like
button
and
subscribe
!!!


In [None]:
# Download the Lake Como image again. The URL is rebuilt from `url` and `image`
# because the challenge loop reuses `full_url` for the challenge pages, so at
# this point it no longer refers to the picture.
full_url = url + image
response = requests.get(full_url)
response.raise_for_status()  # never write an HTTP error page into the .jpg
img_data = response.content
with open('image_lake_como.jpg', 'wb') as handler:
    handler.write(img_data)