<a href="https://colab.research.google.com/github/QuothTheRaven42/learning_files/blob/master/webscraping_with_keithgalli.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web scraping with Beautiful Soup - notes from Keith Galli's YouTube tutorial
# Beautiful Soup docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [1]:
from bs4 import BeautifulSoup as bs # translating html
import requests # loading web pages

In [2]:
# loading the page
# NOTE: despite its name, `request` holds the response returned by requests.get()
url = 'https://keithgalli.github.io/web-scraping/example.html'
# a timeout keeps the cell from hanging indefinitely on a stalled connection
request = requests.get(url, timeout=10)
# stop here with an HTTPError instead of silently parsing an error page later
request.raise_for_status()

In [3]:
# convert to Beautiful Soup object
# the parser is named explicitly: without it bs4 picks whichever parser happens to be
# installed and emits GuessedAtParserWarning; 'html.parser' ships with Python
soup = bs(request.content, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [4]:
# find() stops at the first tag that matches
first_header = soup.find(name='h2')
print(first_header)

<h2>A Header</h2>


In [5]:
# find_all() collects every match into a list, so this holds all <h2> tags
second_header = soup.find_all(name='h2')
print(second_header)

[<h2>A Header</h2>, <h2>Another header</h2>]


In [6]:
# a list of tag names matches any of them; find() still returns only the first hit
first_header = soup.find(name=['h1', 'h2'])
print(first_header)

<h1>HTML Webpage</h1>


In [7]:
# same list-of-names filter, but every matching header is returned
headers = soup.find_all(name=['h1', 'h2'])
print(headers)

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [8]:
# attrs takes an {attribute: value} dict that narrows find/find_all matches
paragraph = soup.find_all(name='p', attrs={'id': 'paragraph-id'})
print(paragraph)

[<p id="paragraph-id"><b>Some bold text</b></p>]


In [9]:
# find/find_all can be called on any tag to search only inside that tag
body = soup.find(name='body')
div = body.find(name='div')
header = div.find(name='h1')
print(header)

<h1>HTML Webpage</h1>


In [10]:
# string= filters on a tag's text; a plain str has to equal the whole text
para = soup.find_all(name='p', string='Some bold text')
print(para)

[<p id="paragraph-id"><b>Some bold text</b></p>]


In [11]:
# regex is useful because a plain string passed into find/find_all must match the entire target
import re

# the keyword must be spelled `string`: an unrecognised keyword is treated as an
# HTML-attribute filter, so a misspelling quietly returns an empty list
para = soup.find_all('p', string=re.compile('Some'))
# print(para)

# [Hh]eader matches "Header" or "header"; the alternation 'H|header' would also
# match any text that merely contains a capital H
headers = soup.find_all('h2', string=re.compile('[Hh]eader'))
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]


In [12]:
# attribute-style access: returns the <body> tag of the document
soup.body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

# selecting (css selector) 
# https://www.w3schools.com/cssref/css_selectors.asp

In [13]:
# select() takes a CSS selector and returns a list of the matching tags
content = soup.select('p')
print(content)

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>, <p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [14]:
# descendant combinator (space): <p> tags located inside a <div>
content = soup.select('div p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [15]:
# general sibling combinator (~): every paragraph(p) that comes after a header2(h2)
# under the same parent
para = soup.select('h2 ~ p')
print(para)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [16]:
# bold(b) text inside the paragraph(p) tag whose id is paragraph-id (# selects by id)
bold_text = soup.select('p#paragraph-id b')
print(bold_text)

[<b>Some bold text</b>]


In [17]:
# <p> tags that are direct children of <body> (child combinator `>`)
para = soup.select('body > p')

# select() also works on a single tag: look for <i> inside each paragraph in turn
for p_tag in para:
    italics = p_tag.select('i')
    print(italics)

[<i>Some italicized text</i>]
[]


In [18]:
# grabbing by element with specific properties
# attribute selector: any element whose align attribute equals "middle"
soup.select('[align=middle]')

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

In [19]:
# getting different properties of the html
header = soup.find(name='h2')
# first the tag itself, then only the text it wraps (.string), one per line
print(header, header.string, sep='\n')

<h2>A Header</h2>
A Header


In [20]:
# prettify() can be called on a single tag, not only on the whole soup
div = soup.find(name='div')
print(div.prettify())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>



In [21]:
# get_text() for larger objects: returns the text of the tag and everything nested in it
# can also be used where .string is None, to get a string back instead
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [22]:
# getting specific property from an element
# a tag's attributes are read like dictionary keys
link = soup.find(name='a')
print(link['href'])

https://keithgalli.github.io/web-scraping/webpage.html


In [23]:
# select() returns a list even when the selector targets a single id
para = soup.select('p#paragraph-id')
para

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [24]:
# pulling specific id by index: take the first tag in the list, then read its id attribute
para[0]['id']

'paragraph-id'

# path syntax

In [25]:
# tags can be reached as attributes of the soup, written like a path
soup.body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [26]:
# one level deeper: the <div> inside <body>
soup.body.div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [27]:
# and deeper again: the <h1> inside that <div>
soup.body.div.h1

<h1>HTML Webpage</h1>

In [28]:
print(soup.body.prettify())
# body is the parent of div
# div, h2 and the p tags directly under body are siblings (same level); h1 is a child of div

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [29]:
# know the terms parent, sibling, children
# find / find_all / select are used more commonly than this kind of navigation

print(soup.body.find('div').find_next_siblings())

[<h2>A Header</h2>, <p><i>Some italicized text</i></p>, <h2>Another header</h2>, <p id="paragraph-id"><b>Some bold text</b></p>]


# https://keithgalli.github.io/web-scraping/webpage.html

In [30]:
# loading the page
# NOTE: despite its name, `request` holds the response returned by requests.get()
url = 'https://keithgalli.github.io/web-scraping/webpage.html'
# a timeout keeps the cell from hanging; raise_for_status() surfaces HTTP errors here
request = requests.get(url, timeout=10)
request.raise_for_status()

# convert to Beautiful Soup object
# explicit parser: avoids GuessedAtParserWarning and machine-dependent parsing
soup = bs(request.content, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

# grabbing social links from page

In [31]:
# keep the anchors whose text starts with "http" and collect their href values
url_text = re.compile('^http')
links = []
for anchor in soup.find_all('a', string=url_text):
    links.append(anchor['href'])
print(links)

['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']


# grabbing table from page, converting to dataframe

In [32]:
import pandas as pd

# the table has trailing empty columns; only the first N_COLS are kept
N_COLS = 8

# first table carrying the hockey-stats class
hockey = soup.select('table.hockey-stats')[0]
cols = hockey.find_all('th')
col_names = [c.string for c in cols][:N_COLS]
rows = hockey.find('tbody').find_all('tr')

# one list of stripped cell texts per table row
table_rows = []
for tr in rows:
    cells = tr.find_all('td')[:N_COLS]
    table_rows.append([cell.text.strip() for cell in cells])

# all values are scraped text, so every column holds strings
df = pd.DataFrame(table_rows, columns=col_names)
df.head()

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0
3,2017-18,Did not play,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0


In [33]:
# keep only the rows whose Team is something other than 'Did not play'
df.loc[df['Team'].ne('Did not play')]

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17,3,9,12,20
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9,1,1,2,2
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12,5,5,10,8
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8,5,10,15,8


# grabbing fun facts

In [34]:
# every <li> inside the <ul> that has the fun-facts class
facts = soup.select('ul.fun-facts li')

fun_facts = []
for fact in facts:
    # NOTE(review): `str` is passed as the string filter; presumably this picks the
    # first text node inside the item - confirm against the bs4 docs
    first_text = fact.find(string=str)
    # text of the tag that directly contains that text node
    fun_facts.append(first_text.find_parent().get_text())

fun_facts

['Owned my dream car in high school 1',
 'Middle name is Ronald',
 'Never had been on a plane until college',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

In [35]:
import os
import urllib.request  # `import urllib` alone does not guarantee the request submodule is loaded
from urllib.parse import urljoin, urlparse

# re-fetch the page; the context manager closes the HTTP response once it is parsed
with urllib.request.urlopen(url) as html:
    soup = bs(html, 'html.parser')
imgs = soup.select('img')

base = 'https://keithgalli.github.io/web-scraping/'

# urljoin resolves relative paths (including ones starting with './') against the
# base, so no src needs special-casing
image_urls = [urljoin(base, img['src']) for img in imgs]

# write image to file/downloading image
for image_url in image_urls:
    # save under the image's own file name; a fixed name would make every download
    # overwrite the previous one
    filename = os.path.basename(urlparse(image_url).path)
    with requests.get(image_url, stream=True, timeout=10) as response:
        # check the status before creating the file so a failed request leaves nothing behind
        response.raise_for_status()
        with open(filename, 'wb') as handler:
            for block in response.iter_content(1024):
                handler.write(block)

In [36]:
# secret message, words hidden in tags of multiple other pages
links = soup.select('div div li a')

secret_words = []
for link in links:
    # each href is relative to the site base defined in the image-download cell
    secret_link = base + link['href']
    # separate name so the main page's `request` is not overwritten
    response = requests.get(secret_link, timeout=10)
    response.raise_for_status()
    # explicit parser: avoids one GuessedAtParserWarning per fetched page
    new_soup = bs(response.content, 'html.parser')
    element = new_soup.find('p', attrs={'id': 'secret-word'})
    secret_words.append(element.string)

' '.join(secret_words)

'Make sure to smash that like button and subscribe !!!'

In [37]:
# 