### BeautifulSoup - 
Beautiful Soup is a Python library for pulling data out of HTML and XML files. It works with your favorite parser to provide idiomatic ways of navigating, searching, and modifying the parse tree. It commonly saves programmers hours or days of work.

#### !pip install beautifulsoup4


The Beautiful Soup authors also suggest using the lxml parser

parse - resolve (a sentence) into its component parts and describe their syntactic roles.
#### !pip install lxml

In [1]:
from bs4 import BeautifulSoup

In [2]:
# Parse the local sample page. The file handle is only needed while
# BeautifulSoup reads it, so the soup is built inside the `with` block.
with open('index.html', 'r') as html_file:
    doc = BeautifulSoup(html_file, "html.parser")

# prettify() renders the parse tree as an indented string.
print(doc.prettify())

<html>
 <head>
  <title>
   Your Title Here
  </title>
 </head>
 <body bgcolor="FFFFFF">
  <center>
   <img align="BOTTOM" src="clouds.jpg"/>
  </center>
  <hr/>
  <a href="http://somegreatsite.com">
   Link Name
  </a>
  is a link to another nifty site
  <h1>
   This is a Header
  </h1>
  <h2>
   This is a Medium Header
  </h2>
  Send me mail at
  <a href="mailto:support@yourcompany.com">
   support@yourcompany.com
  </a>
  .
  <p>
   This is a new paragraph!
   <p>
    <b color="red">
     This is a new paragraph!
    </b>
    <br/>
    <b>
     <i>
      This is a new sentence without a paragraph break, in bold italics.
     </i>
    </b>
    <hr/>
   </p>
  </p>
 </body>
</html>


### Get tag...

In [3]:
# Attribute-style access by tag name: fetch the document's <title> tag.
tag = doc.title
print(tag)

<title>Your Title Here</title>


In [4]:
# .string is the text held inside the tag, without the surrounding markup.
print(tag.string)

Your Title Here


### Modify the tag in place

In [5]:
# Assigning to .string replaces the tag's text. `tag` is part of `doc`,
# so the change also shows up when the whole document is printed.
tag.string = "Modified Title"

print(tag)

<title>Modified Title</title>


In [6]:
# Re-print the whole document to confirm the title edit was applied to it.
print(doc.prettify())

<html>
 <head>
  <title>
   Modified Title
  </title>
 </head>
 <body bgcolor="FFFFFF">
  <center>
   <img align="BOTTOM" src="clouds.jpg"/>
  </center>
  <hr/>
  <a href="http://somegreatsite.com">
   Link Name
  </a>
  is a link to another nifty site
  <h1>
   This is a Header
  </h1>
  <h2>
   This is a Medium Header
  </h2>
  Send me mail at
  <a href="mailto:support@yourcompany.com">
   support@yourcompany.com
  </a>
  .
  <p>
   This is a new paragraph!
   <p>
    <b color="red">
     This is a new paragraph!
    </b>
    <br/>
    <b>
     <i>
      This is a new sentence without a paragraph break, in bold italics.
     </i>
    </b>
    <hr/>
   </p>
  </p>
 </body>
</html>


### Get all p tags

In [7]:
# find_all() returns a list of every matching tag; a nested <p> appears
# both inside its parent's entry and as an entry of its own.
tags = doc.find_all('p')

print(tags)

[<p> This is a new paragraph!

<p> <b color="red">This is a new paragraph!</b>
<br/> <b><i>This is a new sentence without a paragraph break, in bold italics.</i></b>
<hr/>
</p></p>, <p> <b color="red">This is a new paragraph!</b>
<br/> <b><i>This is a new sentence without a paragraph break, in bold italics.</i></b>
<hr/>
</p>]


### Get b tag of first p tag...

In [8]:
# Despite the plural name, `tags` is a single tag here: the first <p>.
tags = doc.find_all('p')[0]

# find_all() can also be called on a tag, which searches only inside it.
print(tags.find_all('b'))

[<b color="red">This is a new paragraph!</b>, <b><i>This is a new sentence without a paragraph break, in bold italics.</i></b>]


## Fetch a live page with requests (install it first with `!pip install requests`)

In [None]:
import requests

# Product page whose price is scraped in the following cells.
url = 'https://www.newegg.ca/gigabyte-geforce-rtx-3080-ti-gv-n308tgaming-oc-12gd/p/N82E16814932436?Description=3080&cm_re=3080-_-14-932-436-_-Product'

# Download the page and keep only the response body as text.
result = requests.get(url).text

print(result)

In [None]:
# BeautifulSoup is given the downloaded HTML string here, rather than an
# open file as in the earlier cell.
doc = BeautifulSoup(result, 'html.parser')

print(doc.prettify())

#### Context -
We are trying to scrape the GPU price

## Find specific text...

In [11]:
# Find the text nodes whose content is exactly "$".
# `string=` is the current name of this filter; `text=` is the older,
# deprecated spelling of the same argument.
prices = doc.find_all(string="$")
print(prices)

['$', '$', '$']


## Find the parent tag of the specific text...

In [12]:
# .parent moves one level up: from the "$" text node to the tag that
# directly contains it.
parent = prices[0].parent

print(parent)

<li class="price-current"><span class="price-current-label"></span>$<strong>2,898</strong><sup>.00</sup></li>


### Notice that the price is inside the strong tag...

In [13]:
# find() returns a single tag; here, the <strong> holding the dollar amount.
strong = parent.find("strong")

# For this tag .text and .string print the same value (see the output).
print(strong)
print(strong.text)
print(strong.string)

<strong>2,898</strong>
2,898
2,898


----

# Searching and Filtering

In [None]:
from bs4 import BeautifulSoup

# Parse the second sample page; `doc` now refers to this document.
with open('index2.html', 'r') as html_file:
    doc = BeautifulSoup(html_file, 'html.parser')

# Uncomment to inspect the parsed document:
# print(doc.prettify())

### Access attributes...

In [15]:
# find() returns a single tag rather than a list.
tag = doc.find('option')

print(tag)

<option selected="" value="course-type">Course type*</option>


### How to change the value of the value attribute...

In [16]:
# Tag attributes are read and written with dict-style indexing.
tag['value'] = 'NewVal'

print(tag)

<option selected="" value="NewVal">Course type*</option>


### Add Attributes...

In [17]:
# Assigning to a key the tag does not have yet adds a new attribute.
tag['color'] = 'blue'

print(tag)

<option color="blue" selected="" value="NewVal">Course type*</option>


### Show all attributes

In [18]:
# .attrs exposes the tag's attributes as a dictionary.
print(tag.attrs)

{'value': 'NewVal', 'selected': '', 'color': 'blue'}


### Find Multiple tags

In [None]:
# Passing a list of names matches tags with any of those names.
tags = doc.find_all(['p', 'div', 'li'])

print(tags)

### Find tags with specific text and attributes

In [20]:
# Keep only the <option> tags whose text is exactly "Undergraduate".
# `string=` is the current name of this filter; `text=` is deprecated.
tags = doc.find_all(['option'], string="Undergraduate")

print(tags)

[<option value="undergraduate">Undergraduate</option>]


In [21]:
# Combine a text filter with an attribute filter: the extra keyword
# argument `value=` is matched against the tag's `value` attribute.
# `string=` is the current name of the text filter; `text=` is deprecated.
tags = doc.find_all(['option'], string="Undergraduate", value='undergraduate')

print(tags)

[<option value="undergraduate">Undergraduate</option>]


In [22]:
# A plain-string attribute filter has to equal the whole attribute value,
# so value='under' does not match value="undergraduate" and the result
# is an empty list.
# `string=` is the current name of the text filter; `text=` is deprecated.
tags = doc.find_all(['option'], string="Undergraduate", value='under')

print(tags)

[]


### Find tags based on class

In [23]:
# `class_` (with a trailing underscore) filters on the HTML class
# attribute; plain `class` is a reserved word in Python.
tags = doc.find_all(class_ = 'btn-item')

print(tags)

[<a class="btn-item" href="https://www.w3docs.com/learn-html.html">Learn HTML</a>, <a class="btn-item" href="https://www.w3docs.com/quiz/#">Select Quiz</a>]


### Using Regular Expressions

In [24]:
import re

# Raw string: "\$" is a regex escape for a literal dollar sign, not a
# Python string escape. Without the r-prefix Python warns about an
# invalid escape sequence.
# `string=` is the current name of the text filter; `text=` is deprecated.
tags = doc.find_all(string=re.compile(r'\$.*'))

print(tags, '\n')

# The matched text nodes keep their surrounding whitespace; strip it
# for display.
for tag in tags:
    print(tag.strip())

['\n        $2345\n      ', '\n        $123\n        '] 

$2345
$123


### Limiting responses

In [25]:
# `limit=1` stops the search after the first match.
# Raw string so "\$" reaches the regex engine as an escaped dollar sign
# instead of triggering Python's invalid-escape warning; `string=` is the
# current name of the deprecated `text=` filter.
tags = doc.find_all(string=re.compile(r'\$.*'), limit=1)

print(tags, '\n')

# Strip the whitespace that surrounds the matched text node.
for tag in tags:
    print(tag.strip())

['\n        $2345\n      '] 

$2345


---

# Navigating the HTML tree

In [26]:
# Download the CoinMarketCap front page and parse it; `doc` now refers
# to this page.
url = 'https://coinmarketcap.com/'

result = requests.get(url).text

doc = BeautifulSoup(result, 'html.parser')

In [None]:
# Attribute-style access by tag name again: grab the page's <tbody>.
tbody = doc.tbody

print(tbody)

### Iterate over sibling tags

In [None]:
# .contents is the list of the direct children of <tbody>, not every tag
# nested inside it. Presumably each child is one <tr> row; the output is
# not captured here, so verify when running.
trs = tbody.contents

print(trs)

### .next_sibling

In [None]:
# .next_sibling is the node that follows this one under the same parent.
print(trs[0].next_sibling) # table row after the first...

### .previous_sibling

In [None]:
# .previous_sibling is the node that precedes this one under the same
# parent; for the second row that is the first row.
print(trs[1].previous_sibling)

### next_siblings

In [None]:
# .next_siblings returns a generator object that you can iterate through,
# so printing it shows the generator itself rather than the rows.
print(trs[0].next_siblings, '\n\n')

# Wrapping the generator in list() collects every following sibling at once.
list(trs[0].next_siblings)

### .parent

In [None]:
# .parent returns the enclosing tag; for a row that is the whole <tbody>.
print(trs[0].parent, '\n------------\n----------\n------------\n')

# .name is the name of that parent tag.
print(trs[0].parent.name)

## Context - 
Getting all the Crypto names and prices

In [33]:
# Re-establish the table body and its direct children for the scraping
# loop in the next cell.
tbody = doc.tbody

trs = tbody.contents

In [34]:
# Map coin name -> displayed price for the first ten children of <tbody>.
prices = {}

for row in trs[:10]:
    # Children 2 and 3 of each row are the name cell and the price cell.
    name_cell, price_cell = row.contents[2:4]

    # The name text sits in a <p> inside its cell, the price in an <a>.
    coin_name = name_cell.p.text
    coin_price = price_cell.a.text

    prices[coin_name] = coin_price

print(prices)

{'Bitcoin': '$42,781.95', 'Ethereum': '$3,064.18', 'Tether': '$1.00', 'BNB': '$427.20', 'USD Coin': '$1.00', 'Cardano': '$1.17', 'Solana': '$118.64', 'XRP': '$0.7237', 'Terra': '$56.80', 'Polkadot': '$22.20'}


---

# Finding the best GPU prices...

In [67]:
from bs4 import BeautifulSoup
import requests
import re

In [77]:
from urllib.parse import quote_plus

search_term = input("Product...? ")

# Percent-encode the term before placing it in the query string so that
# spaces, "&", "#" and similar characters cannot corrupt the URL.
# Example result: https://www.newegg.ca/p/pl?d=3080
url = f"https://www.newegg.ca/p/pl?d={quote_plus(search_term)}"

# Download the first results page and parse it.
page = requests.get(url).text

doc = BeautifulSoup(page, 'html.parser')

Product...? 3080


### Get number of pages

In [78]:
# The pagination label is located by its CSS class.
page_text = doc.find(class_="list-tool-pagination-text")

# Drop the label's first four characters, then read what follows the "/"
# as the total number of result pages.
label_parts = page_text.text[4:].split('/')
pages = int(label_parts[1])

print(pages)

6


### Now you want to loop through all the pages...

In [79]:
from urllib.parse import quote_plus

# Collected results: {product title: {'price': int, 'link': str}}.
items_found = {}

# Escape the search term so regex metacharacters typed by the user (for
# example "+" or "(") are matched literally instead of raising re.error
# or matching something else. Compiled once, not once per page.
term_pattern = re.compile(re.escape(search_term))

# The term is percent-encoded once for use in every page URL.
encoded_term = quote_plus(search_term)

# `page_number` is kept separate from the downloaded HTML (`page`) so the
# loop variable is not overwritten inside the loop body.
for page_number in range(1, pages + 1):

    url = f"https://www.newegg.ca/p/pl?d={encoded_term}&page={page_number}"
    page = requests.get(url).text
    doc = BeautifulSoup(page, 'html.parser')
    div = doc.find(class_='item-cells-wrap border-cells items-grid-view four-cells expulsion-one-cell')

    # A page without the results grid has nothing to offer; skip it
    # instead of failing with AttributeError on None.
    if div is None:
        continue

    # Regex search, so titles that merely contain the term also match.
    # `string=` is the current name of the deprecated `text=` filter.
    for item in div.find_all(string=term_pattern):
        parent = item.parent

        # Only text sitting directly inside a link is treated as a
        # product title.
        if parent.name != 'a':
            continue

        link = parent['href']

        # Walk up to the product card, then down to its price element.
        # Each step may be missing, so None is checked explicitly.
        container = item.find_parent(class_='item-container')
        price_box = container.find(class_='price-current') if container is not None else None
        price = price_box.strong if price_box is not None else None

        # Listings without a visible price are skipped.
        if price is None:
            continue

        # Store the title as a plain str: a NavigableString keeps a
        # reference to its whole parse tree, which would keep every
        # downloaded page in memory.
        items_found[str(item)] = {
            'price': int(price.text.replace(',', '')),
            'link': link,
        }

# sorted() returns a list of (title, info) tuples, cheapest first.
sorted_items = sorted(items_found.items(), key=lambda x: x[1]['price'])

for item in sorted_items:
    print(item[0])
    print(f"${item[1]['price']}")
    print(item[1]['link'])
    print('\n'*5)

GIGABYTE Eagle GeForce RTX 3080 10GB GDDR6X PCI Express 4.0 ATX Video Card GV-N3080EAGLE-10GD (rev. 2.0) (LHR)
$1299
https://www.newegg.ca/gigabyte-geforce-rtx-3080-gv-n3080eagle-10gd/p/N82E16814932461?Description=3080&cm_re=3080-_-14-932-461-_-Product






ASUS TUF Gaming NVIDIA GeForce RTX 3080 V2 OC Edition Graphics Card (PCIe 4.0, 10GB GDDR6X, LHR, HDMI 2.1, DisplayPort 1.4a, Dual Ball Fan Bearings, Military-grade Certification, GPU Tweak II)
$1399
https://www.newegg.ca/asus-geforce-rtx-3080-tuf-rtx3080-o10g-v2-gaming/p/N82E16814126525?Description=3080&cm_re=3080-_-14-126-525-_-Product






ASUS ROG Strix GeForce RTX 3080 V2 OC Edition 10GB GDDR6X PCI Express 4.0 x16 Video Card ROG-STRIX-RTX3080-O10G-V2-GAMING (LHR)
$1399
https://www.newegg.ca/asus-geforce-rtx-3080-strix-rtx3080-o10g-v2-gaming/p/N82E16814126534?Description=3080&cm_re=3080-_-14-126-534-_-Product






ASUS ROG STRIX GeForce RTX 3080 10GB GDDR6X PCI Express 4.0 x16 ATX Video Card ROG-STRIX-RTX3080-O10G-WHITE-V2 (LH

---