In [1]:
from bs4 import BeautifulSoup as bs
import requests
import re,pyperclip
import pandas as pd

source = 'https://www.youtube.com/watch?v=GjKQ6V_ViQE'

In [2]:
url = 'https://keithgalli.github.io/web-scraping/example.html' # page to scrape; this string can be passed directly to requests.get() as its argument.

In [3]:
# Reuse the `url` defined in the previous cell instead of repeating the literal.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops on a 4xx/5xx reply instead of parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()

In [4]:
# Name the parser explicitly: without it bs4 picks whichever parser happens to be
# installed (and warns about it), so the parsed tree can differ between machines.
soup = bs(response.content, 'html.parser') # gets the html of the webpage.

In [5]:
soup

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>

In [6]:
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Start Using Beautiful Soup Library to Scrape

### find and find_all

In [7]:
first_header = soup.find('h2') # return FIRST h2 Tag.

In [8]:
first_header

<h2>A Header</h2>

In [9]:
headers = soup.find_all('h2') # returns list of ALL THE h2 tags present in the web Page.

In [10]:
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [11]:
# Passing A List Of Elements to Look For.

In [12]:
first_header = soup.find(['h1','h2']) # returns the first occurrence of any of the tags passed in the list.

In [13]:
first_header

<h1>HTML Webpage</h1>

In [14]:
headers = soup.find_all(['h1','h2']) # returns a list of all the occurrences of the tags passed in the list.

In [15]:
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [16]:
# Passing Attributes to find and find_all functions

In [17]:
paragraph = soup.find_all('p') # returns a list of all the 'p' tags present in the web page.

In [18]:
paragraph

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [19]:
paragraph = soup.find_all('p',attrs={'id':'paragraph-id'}) # returns a list of the 'p' tags whose id is 'paragraph-id'

In [20]:
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [21]:
# One can NEST find and find_all calls.

In [22]:
body = soup.find('body') 

In [23]:
body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [24]:
# we only want div from the body.

In [25]:
div = body.find('div')

In [26]:
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [27]:
# getting header from the div 

In [28]:
head = div.find_all('h1')

In [29]:
head

[<h1>HTML Webpage</h1>]

In [30]:
# searching string from the web page

In [31]:
string_search = soup.find_all("p", string="Some bold text")

In [32]:
string_search

[<p id="paragraph-id"><b>Some bold text</b></p>]

## select (CSS selector)

In [33]:
content = soup.select('p') # Little bit similar to find_all method

In [34]:
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [35]:
content = soup.select("div p") # descendant selector: returns every p nested inside a div (only one on this page)

In [36]:
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [37]:
"""
Please refer to 
https://www.w3schools.com/cssref/css_selectors.asp
to know more selectors and about them.

"""

'\nPlease refer to \nhttps://www.w3schools.com/cssref/css_selectors.asp\nto know more selectors and about them.\n\n'

## Getting different properties of the HTML

In [38]:
# getting text from a h2 tag.

In [39]:
header = soup.find('h2')

In [40]:
header

<h2>A Header</h2>

In [41]:
header = soup.find('h2').text

In [42]:
header

'A Header'

In [43]:
# or you can do
header = soup.find('h2')

In [44]:
header

<h2>A Header</h2>

In [45]:
header = soup.find('h2')

In [46]:
header.string

'A Header'

In [47]:
div  = soup.find('div')

In [48]:
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [49]:
div = soup.find('div')
div.string # .string gives no output here: the div tag has more than one child, so there is no single string to return.

In [50]:
print(div.get_text()) 


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [51]:
# If a tag has multiple children use get_text(); otherwise .string is enough.

## Getting a Specific Property From an Element

In [52]:
link = soup.find('a')

In [53]:
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

### Code Navigation

In [54]:
# Know The Terms
# 1: Parent
# 2: Child
# 3: Sibling

# Sibling -: When elements are on the same level they are known as siblings.
# Child and Parent -: If an element is nested one level below another element, the lower-level element is known as the child of the
# higher-level element, and the higher-level element is known as the parent of that child.


In [55]:
sib = soup.find('div').find_next_siblings() # returns a list of the siblings that come after the first 'div' tag.

In [56]:
sib

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

# Exercise

In [57]:
# Load The Webpage.
# Task 1 -: Get all the social media links from the webpage (using 3 different ways).

In [58]:
# Fetch the exercise page. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops on a 4xx/5xx reply instead of parsing an error page.
web = requests.get('https://keithgalli.github.io/web-scraping/webpage.html', timeout=10)
web.raise_for_status()

In [59]:
# Rebind `soup` to the exercise page. The parser is named explicitly so the parsed
# tree does not depend on which optional parsers are installed.
soup = bs(web.content, 'html.parser')

In [60]:
print(soup.prettify())

<head>
 <title>
  Keith Galli's Page
 </title>
 <style>
  table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
 </style>
</head>
<body>
 <h1>
  Welcome to my page!
 </h1>
 <img src="./images/selfie1.jpg" width="300px"/>
 <h2>
  About me
 </h2>
 <p>
  Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
 </p>
 <p>
  Here is a link to my channel:
  <a href="https://www.youtube.com/kgmit">
   youtube.com/kgmit
  </

In [61]:
# method 1: walk ul.socials -> li -> a with nested find_all()/find() calls.
# Indentation uses four spaces throughout (the original mixed tabs and spaces).
for ul in soup.find_all('ul', class_="socials"):
    for li in ul.find_all('li'):
        a = li.find('a')
        # find() returns None when an <li> holds no link; skip it rather than
        # failing with a TypeError on None['href'].
        if a is not None:
            print(a['href'])
            

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [62]:
# method 2

In [63]:
# One CSS selector does the whole lookup: anchors inside ul.socials whose href starts with "https".
social_anchors = soup.select('ul.socials a[href^="https"]')
for social_anchor in social_anchors:
    print(social_anchor['href'])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [64]:
# Method 3

In [65]:
# Look the item up with a CSS class selector: it matches both classes whatever their
# order in the attribute, while class_="social instagram" needs that exact string.
li = soup.select_one('li.social.instagram')
anchor = li.find('a')

In [66]:
insta  = anchor["href"]

In [67]:
# CSS class selector matches both classes in any order (class_="social twitter" needs that exact string).
li = soup.select_one('li.social.twitter')

In [68]:
anchor = li.find('a')

In [69]:
anchor['href'] #TWITTER 

'https://twitter.com/keithgalli'

In [70]:
# LinkedIn
# CSS class selector matches both classes in any order (class_="social linkedin" needs that exact string).
li = soup.select_one('li.social.linkedin')

In [71]:
anchor = li.find('a')

In [72]:
anchor['href']

'https://www.linkedin.com/in/keithgalli/'

In [73]:
# tik tok

In [74]:
# CSS class selector matches both classes in any order (class_='social tiktok' needs that exact string).
li = soup.select_one('li.social.tiktok')

In [75]:
anchor = li.find('a')

In [76]:
anchor['href']

'https://www.tiktok.com/@keithgalli'

In [77]:
# TASK 2 Scrape the table that is present on the webpage.

In [78]:
# select() filters only by its CSS selector string; the class_="hockey-stats" keyword
# used before was not part of the selector, so the class was never checked and the
# first <table> on the page was taken. Put the class into the selector instead.
table = soup.select_one('table.hockey-stats')
columns = table.find('thead').find_all('th')
# Header text of every <th>, in document order.
column_name = [th.text for th in columns]

In [79]:
table_rows = table.find("tbody").find_all('tr')

In [80]:
# One whitespace-trimmed text blob per <tr>; the individual cell boundaries are not kept.
row = [str(tr.get_text().strip()) for tr in table_rows]

In [81]:
row

['2014-15\n              \n\n\n\n MIT (Mass. Inst. of Tech.) \n\n\n  ACHA II  \n17\n3\n9\n12\n20\n\n |',
 '2015-16\n              \n\n\n\n MIT (Mass. Inst. of Tech.) \n\n\n  ACHA II  \n9\n1\n1\n2\n2\n\n |',
 '2016-17\n              \n\n\n\n MIT (Mass. Inst. of Tech.) \n\n\n  ACHA II  \n12\n5\n5\n10\n8\n0\n |',
 '2017-18\n              \n\n                  Did not play\n              \n   \n\n\n\n\n\n\n |',
 '2018-19\n              \n\n\n\n MIT (Mass. Inst. of Tech.) \n\n\n  ACHA III  \n8\n5\n10\n15\n8\n\n |']

In [82]:
# `row` holds a single string per <tr>, so passing it here gives pandas a (5, 1)
# shape against 17 column names and raises ValueError. Build one list of cell
# texts per row so that every <td> lines up with a column header.
row_cells = [[td.get_text().strip() for td in tr.find_all('td')] for tr in table_rows]
df = pd.DataFrame(row_cells, columns=column_name)

In [None]:
# 1: Getting table from the webpage.
table  =  soup.find('table',class_='hockey-stats')

In [None]:
# 2: Column names are the text of each <th> found in the table header.
col = table.find('thead').find_all('th')
columns_name = [header_cell.text for header_cell in col]

In [None]:
# 3: Getting rows - one list of stripped cell strings per <tr> of the table body.
raw_rows = table.find('tbody').find_all('tr')
l = [
    [str(cell.get_text()).strip() for cell in tr.find_all('td')]
    for tr in raw_rows
]


In [None]:
df = pd.DataFrame(l,columns=columns_name)

In [None]:
df

In [None]:
# Alternative view of the table body: one whitespace-split token list per non-blank <td>.
# NOTE: this rebinds `l`, the list the DataFrame cell above was built from.
l = []
raw_rows = table.find('tbody').find_all('tr')
for i in raw_rows:
    td = i.find_all('td')
    for r in td:
        tokens = r.text.split()
        # r.text is truthy for whitespace-only cells, which split() turns into [];
        # test the tokens themselves so blank cells are really skipped.
        if tokens:
            l.append(tokens)

In [None]:
# Task 3 -: Grab all fun facts that use the word 'is'

In [None]:
fun_facts = soup.find('ul',class_='fun-facts')

In [None]:
fun_facts

In [None]:
# Keep the fun facts that contain "is" as a whole word. A plain substring test
# ('is' in t) would also accept words such as "this" or "list"; \b anchors the
# match to word boundaries.
text = []

for f in fun_facts.find_all('li'):
    t = f.get_text()
    if re.search(r'\bis\b', t):
        text.append(t)
    
    

In [None]:
text

In [None]:
# extracting Image from the WebPage

In [None]:
im =  soup.select('div.column img ')

In [None]:
im

In [None]:
# `im` holds <img> tags, which keep their URL in `src` and contain no <a> child,
# so the previous `i.a['href']` indexed None and its value was never shown.
# Print each image source instead.
for i in im:
    print(i['src'])

In [None]:
# find_all is the current name; findAll is a deprecated camelCase alias of the same method.
imgs = soup.find_all("div", {"class":"column"})

In [None]:
# Show the column <div>s collected in the previous cell. `images` is only defined
# in a later cell, so displaying it here fails with NameError on a fresh kernel.
imgs

In [None]:
images = soup.find_all('img')
for i in images:
    print(i['src'])
    

In [None]:
print(soup.select('div.row div.column img'))

In [None]:
#task 4-: getting secret message

In [None]:
file_names = soup.select('div.block li')
file_names

In [None]:
file_names = soup.find_all('div',class_='block')

In [None]:
# Collect the href of every link found inside the <li> items of each block div.
l = [
    link['href']
    for block in file_names
    for item in block.find_all('li')
    for link in item.find_all('a')
]

In [None]:
l