In [2]:
 import requests 
from bs4 import BeautifulSoup as bs 


In [3]:
# Download the simple example page and parse it into a BeautifulSoup tree (`soup`)
# that the following cells search.
r = requests.get('https://keithgalli.github.io/web-scraping/example.html')
# Stop here on an HTTP error (4xx/5xx) rather than parsing an error page by mistake.
r.raise_for_status()
# Name the parser explicitly: with no argument bs4 picks whichever parser happens to be
# installed and warns about it, so the tree could differ from one machine to another.
soup = bs(r.content, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [4]:
# Start scraping: find() gives the first matching tag, find_all() gives a list of all of them.
all_headers = soup.find_all('h2')
find_header = soup.find('h2')
# Only the last expression of a cell is displayed, so end with the full list.
all_headers



[<h2>A Header</h2>, <h2>Another header</h2>]

In [5]:
# A list of tag names matches any of them; the results come back in document order,
# not in the order the names are listed.
header_tags = ["h2", "h1"]
find_head = soup.find_all(header_tags)
find_head

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [6]:
# Filter on an attribute: keep only the <p> tags whose id is "paragraph-id".
parag = soup.find_all('p', id='paragraph-id')
parag

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [7]:
# Nesting: a search can be run on a tag returned by an earlier search,
# so each lookup below is scoped to the previous result.
body = soup.find('body')
div = body.find('div')  # first <div> inside <body>
header = div.find('h1')  # first <h1> inside that <div>
# Display the <div>; end the cell with `header` instead to see the <h1>.
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [8]:
# Search by text: find a paragraph whose string is 'Some bold text'.
target_text = 'Some bold text'
parag = soup.find('p', string=target_text)
parag


<p id="paragraph-id"><b>Some bold text</b></p>

In [9]:
# When only part of the text is known (even a single word), match with a regular expression.
import re

some_pattern = re.compile('Some')
header_pattern = re.compile('(H|h)eader')

parag = soup.find_all('p', string=some_pattern)
head = soup.find_all('h2', string=header_pattern)
# Only the last expression is displayed: the <h2> tags containing "Header" or "header".
head

[<h2>A Header</h2>, <h2>Another header</h2>]

In [10]:
# CSS selectors: select() takes a selector string and returns a list of matching tags,
# much like find_all() does for a tag name (here: every <p>).
content = soup.select('p')
content


[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [11]:
# Several routes to the paragraph inside the <div>, which is also the first <p> on the page.

# 1) CSS descendant selector: a list of every <p> found under a <div>.
parag = soup.select('div p')

# 2) Walk down step by step: <body> -> first <div> -> first <p>.
#    NOTE: `header` holds a <p> here, not a heading; the name is kept as in the original cell.
body = soup.find('body')
div = body.find('div')
header = div.find('p')

# 3) Collect every <p>; the wanted paragraph is element 0 of this list.
parag = soup.find_all('p')

# 4) Ask for the first <p> directly. This is the value the cell displays.
parag = soup.find('p')
parag

  
 

<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>

In [12]:
# '~' is the CSS general-sibling combinator: every <p> that comes after an <h2> under the same
# parent, not only the one directly following it ('+' is the adjacent-sibling form).
parag = soup.select('h2 ~ p')
parag[0]

<p><i>Some italicized text</i></p>

In [13]:
# Descendant selector scoped by id: the <b> tag(s) inside <p id="paragraph-id">.
bold_text = soup.select('p#paragraph-id b')
bold_text

[<b>Some bold text</b>]

In [14]:
# Child combinator: '>' keeps only the <p> tags that are direct children of <body>,
# so the paragraph nested inside the <div> is left out.
parag = soup.select('body > p')

# Take the first match by index, so an empty selection fails right here with an IndexError
# instead of leaving `par` unbound (or still holding a value from an earlier run).
par = parag[0]

# A selection can be refined further: look for <i> tags inside that one paragraph.
i = par.select('i')
i

[<i>Some italicized text</i>]

In [15]:
# Grab two tags to experiment with text extraction; this cell displays nothing by itself.
header = soup.find('h2') 
# To inspect the header's text, try: header.string  or  header.get_text()

div = soup.find('div')
# To inspect the div, try: print(div.prettify())  or  print(div.get_text())


In [16]:
# Attributes are read with dictionary-style indexing: the href of the first <a> tag.
link = soup.find("a")
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

In [17]:
# select() returns a list, so index into it before reading the tag's id attribute.
paragraph = soup.select('p#paragraph-id')
paragraph[0]['id']

'paragraph-id'

In [18]:
# Path-style navigation: chain tag names as attributes to walk body -> div -> h1, then read its text.
soup.body.div.h1.string

'HTML Webpage'

In [19]:
# Pretty-print only the <body> subtree, to see which tags sit next to each other.
print(soup.body.prettify()) 

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [20]:
# Siblings: list the tags that come after the first <div> at the same level of <body>.
first_div = soup.body.find('div')
first_div.find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [21]:
# Load the larger practice page into `webpage`, which the following cells scrape.
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html')
# Stop here on an HTTP error (4xx/5xx) rather than parsing an error page by mistake.
r.raise_for_status()
# Name the parser explicitly so the tree does not depend on which optional parsers are installed.
webpage = bs(r.content, 'html.parser')
print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

In [22]:
# Grab all of the social links from the page (way 1 of 3):
# a single CSS selector for every <a> inside <ul class="socials">.
links = webpage.select('ul.socials a')
actual_links = [anchor['href'] for anchor in links]
actual_links


['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [23]:

# Social links (way 2 of 3): locate <ul class="socials"> first, then collect the <a> tags inside it.
ulist = webpage.find('ul', class_='socials')
links = ulist.find_all('a')
actual_links = [anchor['href'] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [24]:
# Social links (way 3 of 3): select the <a> inside each <li class="social"> item.
links = webpage.select('li.social a')
actual_links = [anchor['href'] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [25]:
# Scrape the hockey statistics table.
import pandas as pd  

# select() returns a list of matches; [0] takes the first one so `table` is a single tag.
table = webpage.select('table.hockey-stats')[0]  # first match, not the whole list
table

<table class="hockey-stats">
<thead>
<tr>
<th class="season" data-sort="">S</th>
<th class="team" data-sort="team">Team</th>
<th class="league" data-sort="league">League</th>
<th class="regular gp" data-sort="gp">GP</th>
<th class="regular g" data-sort="g">G</th>
<th class="regular a" data-sort="a">A</th>
<th class="regular tp" data-sort="tp">TP</th>
<th class="regular pim" data-sort="pim">PIM</th>
<th class="regular pm" data-sort="pm">+/-</th>
<th class="separator"> </th>
<th class="postseason">POST</th>
<th class="postseason gp" data-sort="playoffs-gp">GP</th>
<th class="postseason g" data-sort="playoffs-g">G</th>
<th class="postseason a" data-sort="playoffs-a">A</th>
<th class="postseason tp" data-sort="playoffs-tp">TP</th>
<th class="postseason pim" data-sort="playoffs-pim">PIM</th>
<th class="postseason pm" data-sort="playoffs-pm">+/-</th>
</tr>
</thead>
<tbody>
<tr class="team-continent-NA">
<td class="season sorted">
                  2014-15
              </td>
<td class="team"

In [26]:
# Turn the scraped table into a pandas DataFrame.
import pandas as pd

# Column labels are the strings of the header cells in <thead>. Several labels
# (GP, G, A, TP, PIM, +/-) occur twice, once for the regular season and once for
# the post-season, so the resulting frame has duplicate column names.
columns = table.select('thead th')
col_name = [th.string for th in columns]

# One list of whitespace-stripped cell texts per <tr> of <tbody>.
rows = table.find('tbody').find_all('tr')
l = []
for tr in rows:
    td = tr.find_all('td')
    row = [str(cell.get_text()).strip() for cell in td]
    l.append(row)

df = pd.DataFrame(l, columns=col_name)
df.head()

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [27]:
# The label "A" appears twice in the table header (regular season and post-season),
# so this selection returns both "A" columns rather than a single one.
df['A']

Unnamed: 0,A,A.1
0,9.0,
1,1.0,
2,5.0,
3,,
4,10.0,


In [28]:
# Get one specific fun fact: the list item whose text contains 'choice is'.
import re

texting = webpage.select('ul.fun-facts li')
# find(string=...) yields the matching piece of text, or None for items without a match.
matches = [item.find(string=re.compile('choice is')) for item in texting]
# Skip the misses, then go up to each hit's parent tag to recover its complete text.
text_spec = [hit.find_parent().get_text() for hit in matches if hit]
text_spec

['Current video game of choice is Rocket League']

In [29]:
# Downloading an image: keep the site's base address in `url` so relative paths can be appended.
# The base ends with '/' so that `url + <relative path>` is a valid address on its own,
# without relying on another cell redefining `url` first.
url = "https://keithgalli.github.io/web-scraping/"
r = requests.get(url + 'webpage.html')
# Stop here on an HTTP error (4xx/5xx) rather than parsing an error page by mistake.
r.raise_for_status()
# Name the parser explicitly so the tree does not depend on which optional parsers are installed.
webpage = bs(r.content, 'html.parser')




In [30]:
 # Every <img> inside the first <div> of <body>.
 webpage.body.div.find_all('img') 

[<img alt="Lake Como" src="images/italy/lake_como.jpg" style="height:100%"/>,
 <img alt="Pontevecchio, Florence" src="images/italy/pontevecchio.jpg" style="height:100%"/>,
 <img alt="Riomaggiore, Cinque de Terre" src="images/italy/riomaggiore.jpg" style="height:100%"/>]

In [38]:
# NOTE(review): `listt` is not referenced anywhere else in the visible notebook; candidate for removal.
listt = [0,1,2]

# Every <img> inside a column of the photo row
# (webpage.body.div.find_all('img') is another way to reach them).
images = webpage.select('div.row div.column img')

# Single image: the relative src of the first match.
image_url = images[0]['src']

# Multiple images: print each relative src followed by the absolute URL built from it.
for image in images:
    image_urll = image['src']
    print(image_urll)

    # Join with exactly one '/' so the address is correct whether or not `url` ends with a
    # slash; plain `url + image_urll` glues the two path segments together when it does not.
    full_url = url.rstrip('/') + '/' + image_urll
    print(full_url)

images/italy/lake_como.jpg
https://keithgalli.github.io/web-scraping/images/italy/lake_como.jpg
images/italy/pontevecchio.jpg
https://keithgalli.github.io/web-scraping/images/italy/pontevecchio.jpg
images/italy/riomaggiore.jpg
https://keithgalli.github.io/web-scraping/images/italy/riomaggiore.jpg


In [39]:
# Download one image and save it to disk. `full_url` is whatever the loop in the previous
# cell assigned last, i.e. the address of the final image in `images`.
img_response = requests.get(full_url)
# Stop on an HTTP error so an error page is never written to disk as if it were an image.
img_response.raise_for_status()
img_data = img_response.content
# NOTE(review): the image addresses printed above end in .jpg while the file is saved
# with a .png name; confirm which extension is intended.
with open('My_image.png', 'wb') as handler:
    handler.write(img_data)

In [33]:
# Secret-word challenge, step 1: collect the relative link of every <a> inside a "block" div.
files = webpage.select('div.block a')
rel_path = [anchor['href'] for anchor in files]
rel_path

['challenge/file_1.html',
 'challenge/file_2.html',
 'challenge/file_3.html',
 'challenge/file_4.html',
 'challenge/file_5.html',
 'challenge/file_6.html',
 'challenge/file_7.html',
 'challenge/file_8.html',
 'challenge/file_9.html',
 'challenge/file_10.html']

In [35]:
# Secret-word challenge, step 2: open every linked file and print the text of its
# <p id="secret-word"> tag.
url = "https://keithgalli.github.io/web-scraping/"

for f in rel_path:
    full_url = url + f
    page = requests.get(full_url)
    # Stop on an HTTP error instead of searching an error page for the secret word.
    page.raise_for_status()
    # Name the parser explicitly so the tree does not depend on which optional parsers are installed.
    bs_page = bs(page.content, 'html.parser')
    sec_word = bs_page.find('p', attrs={'id': 'secret-word'})
    print(sec_word.string)


Make
sure
to
smash
that
like
button
and
subscribe
!!!


In [180]:
#ul_links = webpage.find_all('div', attrs={'class': 'block'})
#find('h3', {'class': 'listing-results-attr'}).find('span', {'class': 'num-beds'})
#trends['Price'] = trends['Price'].apply(lambda x:  x.split()[0].replace('$', ''))
#trends['Price'] = trends['Price'].str.replace(',', '.') 
#trends['Price'].astype(str).str.replace(",", ".").astype(float)
#trends['Price'] = trends['Price'].astype(float)
 

In [None]:
.

In [None]:
.