# Web scrapping intro

In [1]:
# import necessary libraries
import requests # pip install if not already
from bs4 import BeautifulSoup as bs # pip install BeautifulSoup if not already installed


In [2]:
# Load our first page

url = 'https://keithgalli.github.io/web-scraping/example.html'
r = requests.get(url)

#what does r look like?
r, type(r)

(<Response [200]>, requests.models.Response)

In [3]:
str(r.content)

'b\'<html>\\n<head>\\n<title>HTML Example</title>\\n</head>\\n<body>\\n\\n<div align="middle">\\n<h1>HTML Webpage</h1>\\n<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>\\n</div>\\n\\n<h2>A Header</h2>\\n<p><i>Some italicized text</i></p>\\n\\n<h2>Another header</h2>\\n<p id="paragraph-id"><b>Some bold text</b></p>\\n\\n</body>\\n</html>\\n\''

In [4]:
# convert r to a beautiful soup object
soup = bs(r.content)
soup

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>

In [5]:
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [6]:
# start scraping!
# start with find and find_all

soup.find('h2'), soup.find_all('h2')

(<h2>A Header</h2>, [<h2>A Header</h2>, <h2>Another header</h2>])

In [7]:
# storing as variables
first_header=soup.find('h2')
all_headers=soup.find_all('h2')
first_header, all_headers

(<h2>A Header</h2>, [<h2>A Header</h2>, <h2>Another header</h2>])

.find will only find the first occurence of a tag \
.find_all will find all occurences of the taga and place each item in a list

In [8]:
# pass in a list of elements to look for
first_header=soup.find(['h1','h2'])
first_header

<h1>HTML Webpage</h1>

order does not matter when you pass a list into .find it will still return the first tag in the list it finds.

In [9]:
headers=soup.find_all(['h1','h2'])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [10]:
# you can pass in attributes to the find/find_all functions.
paragraph=soup.find_all("p")
paragraph

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [11]:
# say we want to find the paragraph that has the attribute id='paragraph-id', use the parameter attr={}
# to find this attribute with the paragraph tag.

soup.find_all("p", attrs={"id": "paragraph-id"} )

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [12]:
# you can nest find and find all calls.
body = soup.find('body')
body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

now lets say you want to find a 'div' tag, you can nest this with the body object just created

In [13]:
div =body.find('div')
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

now lets do the same process for the header

In [14]:
header=div.find('h1')
header

<h1>HTML Webpage</h1>

In [15]:
# we can search for specific strings in find_all calls.
# say we wanted to find paragraphs that contained the word 'some'

print(body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [16]:
some_paragraphs=body.find_all('p', string='Some')
some_paragraphs

[]

oh no! oh no! Oh no no no no!

In [17]:
# it doesn't quite work...what if we put in 'Some bold text'??
some_paragraphs=body.find_all('p', string='Some bold text')
some_paragraphs

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [18]:
# this situation is not ideal.
# let's leverage our friend regex :)

import re

some_paragraphs=body.find_all('p', string=re.compile('Some'))
some_paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [19]:
# another example find all headers that have the word "header" in them

headers=body.find_all('h2', string=re.compile('header'))
headers

[<h2>Another header</h2>]

only finds one result because regex is looking for 'header'. we can change it by giving it a different pattern to capture the capital "H", string=re.compile('(H|h)eader')

In [20]:
headers=body.find_all('h2', string=re.compile('(H|h)eader'))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

### select (CSS selector)

In [21]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [22]:
soup.select('p')

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

looks the same as find_all. Let's try finding all paragraph tags inside of div.

In [23]:
soup.select('div p')

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [24]:
# select all the paragraphs that are also preceeded by a header.
soup.select('h2 ~ p')


[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [25]:
# let's do some more of this. It's useful to grab elements with specific id's

In [26]:
soup.select('p#paragraph-id b')

[<b>Some bold text</b>]

In [27]:
paragraphs=soup.select('body > p')
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [28]:
# we can loop through and make select calls with these objects that we just made. Let's take paragraphs for example
#first look at the type(paragraphs)

type(paragraphs)

list

In [29]:
# it's a list so we can loop. Let's inspect the elements in the list for their types
type(paragraphs[0])

bs4.element.Tag

They are beautiful soup objects so we can use the select and find_all calls. Let's do an example of finding the elements with the id tag.

In [32]:
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [30]:
paragraphs_i=[]
[paragraphs_i.append(x.select('i')) for x in paragraphs]
paragraphs_i

[[<i>Some italicized text</i>], []]

In [33]:
# Grab an element with a specific property
soup.select("[align='middle']")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

# Get different properties of the HTML

In [34]:
# want to grab only the string from an element, use the .string property
header = soup.find('h2')
header, header.string

(<h2>A Header</h2>, 'A Header')

In [35]:
# let's try with div.
div = soup.find('div')
print(div.prettify())
print(div.string)

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>

None


We got an answer of None. This is because there are children elements inside of the div, so the .string property is not sure which text to grab.

In [36]:
# Let's try with .get_text()
print(div.get_text())
div.get_text()


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



'\nHTML Webpage\nLink to more interesting example: keithgalli.github.io/web-scraping/webpage.html\n'

In [37]:
# Get a specific property from an element.
# Let's get the href link tag
link = soup.find('a')
link

<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>

In [38]:
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

In [39]:
# try to get the paragraph id tag.
paragraphs = soup.select('p#paragraph-id')
paragraphs

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [40]:
type(paragraphs)

list

In [41]:
paragraphs[0], type(paragraphs[0])

(<p id="paragraph-id"><b>Some bold text</b></p>, bs4.element.Tag)

In [42]:
paragraphs[0]['id']

'paragraph-id'

# Code Navigation

In [44]:
# the beautiful soup object
soup

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>

In [45]:
# the <body> portion and all it contains of beautiful soup object
soup.body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [47]:
soup.div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [48]:
soup.h1

<h1>HTML Webpage</h1>

In [49]:
soup.h1.string

'HTML Webpage'

In [50]:
# know the terms parent, sibling, child
# start with a pretty print of the body.

print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [51]:
soup.p

<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>

The body has a nested structure. The body is the parent. Everything nested inside the body is the children. If elements are on the same level they are considered siblings like `<div>` and `<h2>`.

In [53]:
# start with looking at div
div=soup.find('div')
# looks like there are 4 elements that are siblings of the div
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [54]:
div.find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [55]:
len(div.find_next_siblings())

4

# Exercises

In [56]:
# Grab all the social link from the webpage.
# webpage: https://keithgalli.github.io/web-scraping/webpage.html

url = 'https://keithgalli.github.io/web-scraping/webpage.html'
request=requests.get(url)
request

<Response [200]>

In [57]:
# look at the content
request.content

b'<head>\n  <title>Keith Galli\'s Page</title>\n  <style>\n  table {\n    border-collapse: collapse;\n  }\n  th {\n    padding:5px;\n  }\n  td {\n    border: 1px solid #ddd;\n    padding: 5px;\n  }\n  tr:nth-child(even) {\n    background-color: #f2f2f2;\n  }\n  th {\n    padding-top: 12px;\n    padding-bottom: 12px;\n    text-align: left;\n    background-color: #add8e6;\n    color: black;\n  }\n  .block {\n  width: 100px;\n  /*float: left;*/\n    display: inline-block;\n    zoom: 1;\n  }\n  .column {\n  float: left;\n  height: 200px;\n  /*width: 33.33%;*/\n  padding: 5px;\n  }\n\n  .row::after {\n    content: "";\n    clear: both;\n    display: table;\n  }\n</style>\n</head>\n<body>\n  <h1>Welcome to my page!</h1>\n  <img src="./images/selfie1.jpg" width="300px">\n  <h2>About me</h2>\n  <p>Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!</p>\n  <p>Here is a link to my channel: <a href="https://www.youtube.com/kg

In [58]:
# create a variable 'webpage' that takes request.content and makes it a beautiful soup object.
webpage=bs(request.content)


In [59]:
print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

# starting on my own.

In [61]:
# all of the social links look like they are inside of <ul> (unordered list) elements with attribute class='socials'
socials = webpage.find('ul', attrs={'class': 'socials'})
socials

<ul class="socials">
<li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
<li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
<li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
<li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>
</ul>

In [62]:
type(socials)

bs4.element.Tag

In [63]:
social_str = str(socials)
social_str

'<ul class="socials">\n<li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>\n<li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>\n<li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>\n<li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>\n</ul>'

In [64]:
social_str.split('\n')

['<ul class="socials">',
 '<li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>',
 '<li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>',
 '<li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>',
 '<li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>',
 '</ul>']

In [65]:
# another way - find_all
socials = webpage.find_all('ul', attrs={'class': 'socials'})
socials

[<ul class="socials">
 <li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
 <li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
 <li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
 <li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>
 </ul>]

In [66]:
# using select
links = webpage.select('ul > li > a[href*="keith"]')

In [67]:
links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [68]:
links[0]['href']

'https://www.instagram.com/keithgalli/'

In [69]:
links[1]['href']

'https://twitter.com/keithgalli'

In [70]:
actual_links = [x['href'] for x in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [None]:
# [requests.get(x) for x in actual_links]

# following video solutions

In [72]:
# what happens when I select all of the <a> elements on the page?
webpage.select('a')

[<a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a>,
 <a href="#footer"><sup>1</sup></a>,
 <a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>,
 <a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats"> MIT (Mass. Inst. of Tech.) </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> ACHA II </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> </a>,
 <a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2015-2016?tab=stats"> MIT (Mass. Inst. of Tech.) </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2015-2016"> ACHA II </a>,
 <a href="https://www.elite

This gave us too much... \
On the page all of the socials are stored in an unordered list (ul) with class="socials"

In [73]:
# NOTE in select '#' goes with id names, '.' goes with class names
links=webpage.select('ul.socials')
links

[<ul class="socials">
 <li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
 <li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
 <li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
 <li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>
 </ul>]

In [74]:
# now we just want the a elements with the href in them.
links=webpage.select('ul.socials a')
links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [75]:
# we hant to get only the links - https://..... 
#since the variable 'links' is a list we can do a list comprehension to grab these.

[link['href'] for link in links]


['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [76]:
# another way
# this time lets use find
# a starting point. let's see the first '<a>' tag

links=webpage.find('a')
links

<a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a>

this is not what we are looking for :(

In [77]:
# let's try doing something similar to our first approach.
links=webpage.find('ul', attrs={'class': 'socials'})
links

<ul class="socials">
<li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
<li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
<li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
<li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>
</ul>

This is the point that I got to when I worked on my own! \
Let's do a step further and grab the actual links.

In [78]:
# from the 'links' object grab all the 'a' tags
a_tags = links.find_all('a')
a_tags

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [79]:
# similar to befor we can now do a list comprehension to grab all the href=''
[link['href'] for link in a_tags]

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [80]:
# one more way
links = webpage.select('li.social a')
links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [81]:
# and lets loop through one more time
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [89]:
# scrape the table from the keithgalli html page.
table_headers = webpage.find('table').find_all('th')
table_headers

[<th class="season" data-sort="">S</th>,
 <th class="team" data-sort="team">Team</th>,
 <th class="league" data-sort="league">League</th>,
 <th class="regular gp" data-sort="gp">GP</th>,
 <th class="regular g" data-sort="g">G</th>,
 <th class="regular a" data-sort="a">A</th>,
 <th class="regular tp" data-sort="tp">TP</th>,
 <th class="regular pim" data-sort="pim">PIM</th>,
 <th class="regular pm" data-sort="pm">+/-</th>,
 <th class="separator"> </th>,
 <th class="postseason">POST</th>,
 <th class="postseason gp" data-sort="playoffs-gp">GP</th>,
 <th class="postseason g" data-sort="playoffs-g">G</th>,
 <th class="postseason a" data-sort="playoffs-a">A</th>,
 <th class="postseason tp" data-sort="playoffs-tp">TP</th>,
 <th class="postseason pim" data-sort="playoffs-pim">PIM</th>,
 <th class="postseason pm" data-sort="playoffs-pm">+/-</th>]

In [92]:
column_names=[th.string for th in table_headers]
column_names

['S',
 'Team',
 'League',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-',
 '\xa0',
 'POST',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-']

In [116]:
# all of the table rows in a list
table_rows_list =webpage.find('tbody').find_all('tr')
table_rows_list

[<tr class="team-continent-NA">
 <td class="season sorted">
                   2014-15
               </td>
 <td class="team">
 <i><img src="images/flag.png"/></i>
 <span class="txt-blue">
 <a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats"> MIT (Mass. Inst. of Tech.) </a>
 </span>
 </td>
 <td class="league"> <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> ACHA II </a> </td>
 <td class="regular gp">17</td>
 <td class="regular g">3</td>
 <td class="regular a">9</td>
 <td class="regular tp">12</td>
 <td class="regular pim">20</td>
 <td class="regular pm"></td>
 <td class="separator"> | </td>
 <td class="postseason">
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> </a>
 </td>
 <td class="postseason gp">
 </td>
 <td class="postseason g">
 </td>
 <td class="postseason a">
 </td>
 <td class="postseason tp">
 </td>
 <td class="postseason pim">
 </td>
 <td class="postseason pm">
 </td>
 </tr>, <tr 

In [118]:
# looking at the first item in the list
# create an empty dictionary to store details
print(table_rows_list[0].prettify())
table_row_dict={}

<tr class="team-continent-NA">
 <td class="season sorted">
  2014-15
 </td>
 <td class="team">
  <i>
   <img src="images/flag.png"/>
  </i>
  <span class="txt-blue">
   <a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats">
    MIT (Mass. Inst. of Tech.)
   </a>
  </span>
 </td>
 <td class="league">
  <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015">
   ACHA II
  </a>
 </td>
 <td class="regular gp">
  17
 </td>
 <td class="regular g">
  3
 </td>
 <td class="regular a">
  9
 </td>
 <td class="regular tp">
  12
 </td>
 <td class="regular pim">
  20
 </td>
 <td class="regular pm">
 </td>
 <td class="separator">
  |
 </td>
 <td class="postseason">
  <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015">
  </a>
 </td>
 <td class="postseason gp">
 </td>
 <td class="postseason g">
 </td>
 <td class="postseason a">
 </td>
 <td class="postseason tp">
 </td>
 <td class="postseason pim">
 </td>
 <td class="postsea

In [127]:
# to get the season information go down to where the season is contained in the bs_object.
first_row = table_rows_list[0]
first_row.find('td', attrs={'class':'season sorted'}).string

'\n                  2014-15\n              '

In [129]:
#How does it look with .get_text()?
first_row.find('td', attrs={'class':'season sorted'}).get_text()

'\n                  2014-15\n              '

In [131]:
# since these look the same let's clean up the string.
# give the string a variable name 'season_string'
season_string = first_row.find('td', attrs={'class':'season sorted'}).get_text()
season_string

'\n                  2014-15\n              '

In [135]:
# get rid of the white space and new new lines with .strip()
season_string.strip()

'2014-15'

In [139]:
# putting this example into a for a loop to make a list of all of the seasons.
seasons_list=[]
for r in table_rows_list:
    season_string=r.find('td', attrs={'class':'season sorted'}).get_text()
    seasons_list.append(season_string.strip())
seasons_list

['2014-15', '2015-16', '2016-17', '2017-18', '2018-19']

### function: create_seasons_list()

In [141]:
# let's make the for loop into a function to call the seasons_list for the pandas dataframe.
def create_seasons_list():
    seasons_list = []
    for r in table_rows_list:
        season_string=r.find('td', attrs={'class':'season sorted'}).get_text()
        seasons_list.append(season_string.strip())
    return(seasons_list)

create_seasons_list()

['2014-15', '2015-16', '2016-17', '2017-18', '2018-19']

In [156]:
# grab the team information:
team_info = table_rows_list[0].find('td', attrs={'class':'team'})
team_info

<td class="team">
<i><img src="images/flag.png"/></i>
<span class="txt-blue">
<a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats"> MIT (Mass. Inst. of Tech.) </a>
</span>
</td>

In [161]:
team_name = team_info.find('a').get_text().strip()
team_name

'MIT (Mass. Inst. of Tech.)'

In [166]:
team_season_link = team_info.find('a')
team_season_link['href']

'https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats'

In [201]:
#putting this example into a for loop to make lists of team names & season links
team_name_list=[]
team_links_list=[]
for r in table_rows_list:
    team_info=r.find('td', attrs={'class':'team'})
    if team_info.find('a'):
        team_name=team_info.find('a').get_text().strip()
        team_season_link=team_info.find('a')
        team_name_list.append(team_name)
        team_season_link = team_info.find('a')
        team_links_list.append(team_season_link['href'])
    else:
        team_name=team_info.get_text().strip()
        team_season_link= ""
        team_name_list.append(team_name)
        team_links_list.append(team_season_link)

team_name_list, team_links_list

(['MIT (Mass. Inst. of Tech.)',
  'MIT (Mass. Inst. of Tech.)',
  'MIT (Mass. Inst. of Tech.)',
  'Did not play',
  'MIT (Mass. Inst. of Tech.)'],
 ['https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats',
  'https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2015-2016?tab=stats',
  'https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2016-2017?tab=stats',
  '',
  'https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2018-2019?tab=stats'])

### function: create_team_name_links_list()

In [230]:
# Let's make a function!

def create_team_name_links_lists():
    team_name_list=[]
    team_links_list=[]
    for r in table_rows_list:
        team_info=r.find('td', attrs={'class':'team'})
        if team_info.find('a'):
            team_name=team_info.find('a').get_text().strip()
            team_season_link=team_info.find('a')
            team_name_list.append(team_name)
            team_season_link = team_info.find('a')
            team_links_list.append(team_season_link['href'])
        else:
            team_name=team_info.get_text().strip()
            team_season_link= ""
            team_name_list.append(team_name)
            team_links_list.append(team_season_link)

    return(team_name_list, team_links_list)

(team_name_list, team_links_list) = create_team_name_links_lists()

In [205]:
# sanity check: looking at the lists created from the function.
team_name_list, team_links_list

(['MIT (Mass. Inst. of Tech.)',
  'MIT (Mass. Inst. of Tech.)',
  'MIT (Mass. Inst. of Tech.)',
  'Did not play',
  'MIT (Mass. Inst. of Tech.)'],
 ['https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats',
  'https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2015-2016?tab=stats',
  'https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2016-2017?tab=stats',
  '',
  'https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2018-2019?tab=stats'])

In [221]:
# get the league info for the 'league' column
# test with the first item in 'table_rows_list'
league_info=table_rows_list[0].find('td',attrs={'class':'league'})
league_info

<td class="league"> <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> ACHA II </a> </td>

In [217]:
# get the league name with .get_text()
league = league_info.get_text().strip()
league

'ACHA II'

In [220]:
# get the league link with find('a') and ['href']
league_link=league_info.find('a')
league_link['href']

'https://www.eliteprospects.com/league/acha-ii/stats/2014-2015'

In [222]:
# test this with the 'Did not play' table_rows_list[3] info:
table_rows_list[3]

<tr class="team-continent-EU">
<td class="season sorted">
                  2017-18
              </td>
<td class="team">
                  Did not play
              </td>
<td class="league"> <a href="https://www.eliteprospects.com/stats"> </a> </td>
<td class="regular gp"></td>
<td class="regular g"></td>
<td class="regular a"></td>
<td class="regular tp"></td>
<td class="regular pim"></td>
<td class="regular pm"></td>
<td class="separator"> | </td>
<td class="postseason">
<a href="https://www.eliteprospects.com/stats"> </a>
</td>
<td class="postseason gp">
</td>
<td class="postseason g">
</td>
<td class="postseason a">
</td>
<td class="postseason tp">
</td>
<td class="postseason pim">
</td>
<td class="postseason pm">
</td>
</tr>

In [226]:
league_info=table_rows_list[3].find('td',attrs={'class':'league'})
league_info.get_text().strip()

''

In [227]:
league = league_info.get_text().strip()
league

''

In [229]:
# you get empty information for the year that is not played table_rows_list[3]
# I think this will be okay to add into the list

# go forward with creating the loop.

league_list=[]
league_link_list=[]
for r in table_rows_list:
    league_info=r.find('td', attrs={'class':'league'})
    league_list.append(league_info.get_text().strip())
    league_link=league_info.find('a')
    league_link_list.append(league_link['href'])

league_list, league_link_list

(['ACHA II', 'ACHA II', 'ACHA II', '', 'ACHA III'],
 ['https://www.eliteprospects.com/league/acha-ii/stats/2014-2015',
  'https://www.eliteprospects.com/league/acha-ii/stats/2015-2016',
  'https://www.eliteprospects.com/league/acha-ii/stats/2016-2017',
  'https://www.eliteprospects.com/stats',
  'https://www.eliteprospects.com/league/acha-iii/stats/2018-2019'])

In [231]:
def create_league_name_link_lists():
    league_list=[]
    league_link_list=[]
    for r in table_rows_list:
        league_info=r.find('td', attrs={'class':'league'})
        league_list.append(league_info.get_text().strip())
        league_link=league_info.find('a')
        league_link_list.append(league_link['href'])

    return(league_list, league_link_list)

league_list, league_link_list=create_league_name_link_lists()

In [232]:
league_list

['ACHA II', 'ACHA II', 'ACHA II', '', 'ACHA III']

In [233]:
league_link_list

['https://www.eliteprospects.com/league/acha-ii/stats/2014-2015',
 'https://www.eliteprospects.com/league/acha-ii/stats/2015-2016',
 'https://www.eliteprospects.com/league/acha-ii/stats/2016-2017',
 'https://www.eliteprospects.com/stats',
 'https://www.eliteprospects.com/league/acha-iii/stats/2018-2019']

In [237]:
# look at table_rows_list again to see whats next...
table_rows_list

[<tr class="team-continent-NA">
 <td class="season sorted">
                   2014-15
               </td>
 <td class="team">
 <i><img src="images/flag.png"/></i>
 <span class="txt-blue">
 <a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats"> MIT (Mass. Inst. of Tech.) </a>
 </span>
 </td>
 <td class="league"> <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> ACHA II </a> </td>
 <td class="regular gp">17</td>
 <td class="regular g">3</td>
 <td class="regular a">9</td>
 <td class="regular tp">12</td>
 <td class="regular pim">20</td>
 <td class="regular pm"></td>
 <td class="separator"> | </td>
 <td class="postseason">
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> </a>
 </td>
 <td class="postseason gp">
 </td>
 <td class="postseason g">
 </td>
 <td class="postseason a">
 </td>
 <td class="postseason tp">
 </td>
 <td class="postseason pim">
 </td>
 <td class="postseason pm">
 </td>
 </tr>, <tr 

In [236]:
# what were the column headers again?
print(column_names)

['S', 'Team', 'League', 'GP', 'G', 'A', 'TP', 'PIM', '+/-', '\xa0', 'POST', 'GP', 'G', 'A', 'TP', 'PIM', '+/-']


In [249]:
# try to see if there is the same for loop being repeated so we could use parameters to put into a function.
gp_list=[]
for r in table_rows_list:
    gp_info=r.find('td', attrs={'class':'regular gp'})
    if gp_info.get_text()=='':
        gp_list.append(0)
        
    else:
        gp_list.append(gp_info.get_text())

gp_list

['17', '9', '12', 0, '8']

In [250]:
[int(x) for x in gp_list]

[17, 9, 12, 0, 8]

In [255]:
g_list=[]
for r in table_rows_list:
    g_info=r.find('td', attrs={'class': 'regular g'})
    if g_info.get_text()=='':
        g_list.append(0)
    else:
        g_list.append(g_info.get_text())
g_list

['3', '1', '5', 0, '5']

In [256]:
[int(x) for x in g_list]

[3, 1, 5, 0, 5]

In [351]:
# work on creating a function that will make lists for each numerical stats.
stats=['GP','G','A','TP','PIM', '+/-']
stats

['GP', 'G', 'A', 'TP', 'PIM', '+/-']

In [352]:
# lower the stats names to match the text in html.
stats = [str.lower(x) for x in stats]

In [353]:
stats_dictionary = {}
for x in stats:
    stats_dictionary[x]=''
stats_dictionary


{'gp': '', 'g': '', 'a': '', 'tp': '', 'pim': '', '+/-': ''}

In [354]:
# stats_dictionary['gp'].append(0)
# stats_dictionary

In [355]:
# for i in stats_dictionary:
#     info=table_rows_list[0].find('td', attrs={'class': f'regular {i}'})
#     if info:
#         stats_dictionary[i]=(int(info.get_text()))
#     else:
#         stats_dictionary[i]='Null'
    
# stats_dictionary

In [356]:
# Let's see whats going on with the 'Did Not Play Row'
table_rows_list[3]

<tr class="team-continent-EU">
<td class="season sorted">
                  2017-18
              </td>
<td class="team">
                  Did not play
              </td>
<td class="league"> <a href="https://www.eliteprospects.com/stats"> </a> </td>
<td class="regular gp"></td>
<td class="regular g"></td>
<td class="regular a"></td>
<td class="regular tp"></td>
<td class="regular pim"></td>
<td class="regular pm"></td>
<td class="separator"> | </td>
<td class="postseason">
<a href="https://www.eliteprospects.com/stats"> </a>
</td>
<td class="postseason gp">
</td>
<td class="postseason g">
</td>
<td class="postseason a">
</td>
<td class="postseason tp">
</td>
<td class="postseason pim">
</td>
<td class="postseason pm">
</td>
</tr>

In [366]:
for i in stats_dictionary:
    if table_rows_list[3].find('td', attrs={'class': f'regular {i}'}):
        info=table_rows_list[3].find('td', attrs={'class': f'regular {i}'})
        if info.get_text()=='':
            print(i, 'No Text')
        else:
            print(i)
    else:
        print(i, 'Null')
    

gp No Text
g No Text
a No Text
tp No Text
pim No Text
+/- Null


In [375]:
numeric_stats_list=[]
for r in table_rows_list:
    stats=['GP','G','A','TP','PIM', '+/-']
    stats=[str.lower(x) for x in stats]
    stats_dictionary={}
    for i in stats:
        if r.find('td', attrs={'class': f'regular {i}'}):
            info=r.find('td', attrs={'class': f'regular {i}'})
            if info.get_text()!='':
                stats_dictionary[i]=(int(info.get_text()))
            else:
                stats_dictionary[i]=''
        else:
            stats_dictionary[i]=''
    numeric_stats_list.append(stats_dictionary)

In [376]:
numeric_stats_list

[{'gp': 17, 'g': 3, 'a': 9, 'tp': 12, 'pim': 20, '+/-': ''},
 {'gp': 9, 'g': 1, 'a': 1, 'tp': 2, 'pim': 2, '+/-': ''},
 {'gp': 12, 'g': 5, 'a': 5, 'tp': 10, 'pim': 8, '+/-': ''},
 {'gp': '', 'g': '', 'a': '', 'tp': '', 'pim': '', '+/-': ''},
 {'gp': 8, 'g': 5, 'a': 10, 'tp': 15, 'pim': 8, '+/-': ''}]

In [377]:
for k,v in enumerate(numeric_stats_list):
    print(k,v)

0 {'gp': 17, 'g': 3, 'a': 9, 'tp': 12, 'pim': 20, '+/-': ''}
1 {'gp': 9, 'g': 1, 'a': 1, 'tp': 2, 'pim': 2, '+/-': ''}
2 {'gp': 12, 'g': 5, 'a': 5, 'tp': 10, 'pim': 8, '+/-': ''}
3 {'gp': '', 'g': '', 'a': '', 'tp': '', 'pim': '', '+/-': ''}
4 {'gp': 8, 'g': 5, 'a': 10, 'tp': 15, 'pim': 8, '+/-': ''}
