<a href="https://colab.research.google.com/github/isaurabhpandey/PortfolioProjects/blob/main/web_scraping%20Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load in the necessary libraries




In [None]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

## Load our first page

In [None]:

# Load the web page content. The timeout keeps the cell from hanging forever,
# and raise_for_status() stops us from parsing an HTTP error page by mistake.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Convert to a Beautiful Soup object. Naming the parser explicitly avoids the
# GuessedAtParserWarning and does not depend on which parsers are installed.
soup = bs(r.content, "html.parser")

# Print out our html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Let's start using Beautiful Soup to scrape

In [None]:
# A bare expression on the last line of a cell is displayed; here it shows the un-prettified HTML
soup

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>

Use `find` and `find_all` to locate the `h2` tags

In [None]:
first_header = soup.find("h2")   # finds the first 'h2' tag (stored only; this cell does not print it)

headers = soup.find_all("h2")    # collects every 'h2' tag; the collection is printed below
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]


In [None]:
# find_all also accepts a list of tag names and returns every tag matching any of them
header_tags = ["h1", "h2"]
all_headers = soup.find_all(header_tags)
all_headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [None]:
# Narrow the search by attribute: a keyword argument such as id=... filters on that HTML attribute
paragraph = soup.find_all("p", id="paragraph-id")
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [None]:
# Every tag can be searched like the soup itself, so lookups can be nested.
# tag.<name> is shorthand for tag.find("<name>").
body = soup.body
div = body.div
header = div.h1
header


<h1>HTML Webpage</h1>

In [None]:
# We can search for specific strings in our find/find_all calls.
# `re` comes from the import cell at the top of the notebook; a compiled
# pattern matches anywhere inside the tag's text.
some_pattern = re.compile("Some")
paragraphs = soup.find_all("p", string=some_pattern)
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [None]:
# The alternation (H|h) accepts both capitalisations of "header"
header_pattern = re.compile("(H|h)eader")
headers = soup.find_all("h2", string=header_pattern)
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

# Select elements with CSS selectors


In [None]:
# Show the <body> subtree again as a reference for the selectors used below
print(soup.find("body").prettify())

In [None]:
# Descendant combinator ("div p"): every <p> nested anywhere inside a <div>
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [None]:
# General sibling combinator ("h2 ~ p"): every <p> that comes after an <h2> with the same parent
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [None]:
# "p#paragraph-id b": <b> tags inside the <p> whose id is "paragraph-id"
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [None]:
# Child combinator ("body > p"): only <p> tags that are direct children of <body>
paragraphs = soup.select("body > p")
print(paragraphs)

# select() also works on a single tag and then searches only inside that tag
for paragraph in paragraphs:
    italics = paragraph.select("i")
    print(italics)



[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


# Get different properties of the HTML

In [None]:
# Use .string when the tag holds a single piece of text.
# It is printed explicitly because a bare expression in the middle of a cell is never displayed.
header = soup.find('h2')
print(header.string)

# If the tag holds multiple texts/strings, use get_text() to get all of them
div = soup.find('div')
print(div.prettify())
print(div.get_text())

# Exercises!

Challenge 1: Grab all of the social links from the webpage (in at least 3 different ways)


In [None]:
# Ignore SSL certificate errors
# NOTE(review): `ctx` is never passed to any call in this notebook (the
# requests.get() calls are not given it), so this cell appears to have no
# effect on the downloads. Confirm whether it can be removed.
import ssl
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

In [None]:
# Load the webpage content on which we perform the exercises.
# `url` is the site's base URL and is reused by later cells to build absolute links.
url = "https://keithgalli.github.io/web-scraping/"
w = requests.get(url + "webpage.html", timeout=10)
w.raise_for_status()  # fail here rather than scrape an HTTP error page

# Convert to a Beautiful Soup object; the explicit parser avoids the
# GuessedAtParserWarning and does not depend on which parsers are installed.
webpage = bs(w.content, "html.parser")

# Print out our html website
print(webpage)

In [None]:
# Method 1: a single CSS selector for every <a> inside the <ul class="socials"> list

links = webpage.select('ul.socials a')
actual_link = [anchor['href'] for anchor in links]
actual_link

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [None]:
# Method 2: locate the <ul class="socials"> element first, then collect the <a> tags inside it

ulist = webpage.find('ul', class_='socials')
links = ulist.find_all('a')
actual_link = [anchor['href'] for anchor in links]
actual_link


['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [None]:
# Method 3: select the <a> tags through their <li class="social"> list items

links = webpage.select('li.social a')
actual_link = [anchor['href'] for anchor in links]
actual_link

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

# Challenge 2: Scrape the table on https://keithgalli.github.io/web-scraping/webpage.html

In [None]:
# pandas is imported in the import cell at the top of the notebook.
# select() returns a list of matches; take the first table with the hockey-stats class.
table = webpage.select('table.hockey-stats')[0]
table


In [None]:
# First pass: collect the text of every <td> in each body row of the table.
# table_rows is defined here so the cell runs on a fresh kernel
# (it used to rely on a variable created only in a later cell).
table_rows = table.find('tbody').find_all('tr')
l = []
for tr in table_rows:
    cells = tr.find_all('td')
    l.append([cell.text for cell in cells])

# No column names yet, so pandas uses default integer labels;
# the real header row is scraped from <thead> in the next cell.
pd.DataFrame(l)

In [None]:
# Scrape the header row and the body rows of the table, then build a DataFrame

table = webpage.select("table.hockey-stats")[0]

# Column names come from the <th> cells of the table head
columns = table.find('thead').find_all('th')
column_names = [c.string for c in columns]

# One list of stripped cell texts per <tr> in the table body
table_rows = table.find('tbody').find_all('tr')
l = []
for tr in table_rows:
    cells = tr.find_all('td')
    l.append([cell.get_text().strip() for cell in cells])

print(l[0])

df = pd.DataFrame(l, columns=column_names)

# Show only the seasons that were actually played
df.loc[df['Team'] != 'Did not play']

['2014-15', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '17', '3', '9', '12', '20', '', '|', '', '', '', '', '', '', '']


Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17,3,9,12,20,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9,1,1,2,2,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12,5,5,10,8,0.0,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8,5,10,15,8,,|,,,,,,,


*Challenge 3:*
 # Grab all the fun facts that use the word 'is'


In [None]:
# \b word boundaries match "is" only as a whole word, not the letters "is" inside other words
word_is = re.compile(r'\bis\b')

fun = webpage.select('ul.fun-facts li')

# Keep the full text of every <li> that has a matching string somewhere inside it.
# Reading the text from the <li> itself returns the whole fact even when the
# match sits inside a nested tag.
facts_with_is = [fact.get_text() for fact in fun if fact.find(string=word_is)]
facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

### Challenge 4:

Download one image from the website:

In [None]:
# Collect the <img> tags that sit inside a div.column within a div.row
image = webpage.select('div.row div.column img')
image_url = image[0]['src']      # src attribute of the first image
full_url = url + image_url       # `url` is the base URL set when webpage.html was loaded

# Display the absolute URL so the cell produces the output recorded below it
full_url

'https://keithgalli.github.io/web-scraping/images/italy/lake_como.jpg'

In [None]:
# Download the image bytes (requests is already imported in the first cell).
# raise_for_status() prevents an HTTP error page from being saved as a .jpg.
response = requests.get(full_url, timeout=10)
response.raise_for_status()
img_data = response.content

# 'wb' because the image is binary data
with open('lake_como.jpg', 'wb') as handler:
    handler.write(img_data)

Solve the mystery challenge!

In [None]:
# Collect the relative hrefs of every link inside div.block
files = webpage.select('div.block a')
relative_files = [f['href'] for f in files]

# `url` is the base URL set when webpage.html was loaded
for f in relative_files:
    # A dedicated name, so the image URL stored in `full_url` is not overwritten
    file_url = url + f
    page = requests.get(file_url, timeout=10)
    page.raise_for_status()  # fail here rather than parse an HTTP error page
    bs_page = bs(page.content, "html.parser")

    # Each page is searched for a <p id="secret-word">, whose text is printed
    secret_word_element = bs_page.find('p', attrs={'id': 'secret-word'})
    secret_word = secret_word_element.string
    print(secret_word)

Make
sure
to
smash
that
like
button
and
subscribe
!!!
