### Hockey Teams Extraction

In [1]:
# Import libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Address of page 1 of the paginated hockey-teams listing (the scrape starts here)
url = (
    "https://www.scrapethissite.com"
    "/pages/forms/"
    "?page_num=1"
)

In [3]:
# Fetch the first page.
# - timeout: requests waits indefinitely by default, so a stalled connection
#   would hang the kernel; 30 s bounds the wait.
# - raise_for_status(): turns an HTTP 4xx/5xx reply into an exception here,
#   instead of letting an error page be parsed as if it were the results table.
response = requests.get(url, timeout=30)
response.raise_for_status()
print(response)

<Response [200]>


In [4]:
# Build a BeautifulSoup tree from the page's HTML with the "html.parser" backend
page_html = response.text
soup = BeautifulSoup(page_html, features="html.parser")
print(soup)

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping</title>
<link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components." name="description"/>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
<link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
<meta content="noindex" name="robo

In [5]:
# Look up the first <table> element whose CSS class is "table"
soup.find(name="table", attrs={"class": "table"})

<table class="table">
<tr>
<th>
                            Team Name
                        </th>
<th>
                            Year
                        </th>
<th>
                            Wins
                        </th>
<th>
                            Losses
                        </th>
<th>
                            OT Losses
                        </th>
<th>
                            Win %
                        </th>
<th>
                            Goals For (GF)
                        </th>
<th>
                            Goals Against (GA)
                        </th>
<th>
                            + / -
                        </th>
</tr>
<tr class="team">
<td class="name">
                            Boston Bruins
                        </td>
<td class="year">
                            1990
                        </td>
<td class="wins">
                            44
                        </td>
<td class="losses">
                            2

In [6]:
# Keep a handle on the statistics table for the cells below.
# The lookup is restricted to class="table" (the same selector used when
# previewing the table and when looping over the pages) so an unrelated
# <table> elsewhere on the page cannot be picked up by mistake.
table = soup.find("table", class_="table")
if table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError in a later cell.
    raise ValueError("No <table class='table'> element found in the parsed page")
print(table)

<table class="table">
<tr>
<th>
                            Team Name
                        </th>
<th>
                            Year
                        </th>
<th>
                            Wins
                        </th>
<th>
                            Losses
                        </th>
<th>
                            OT Losses
                        </th>
<th>
                            Win %
                        </th>
<th>
                            Goals For (GF)
                        </th>
<th>
                            Goals Against (GA)
                        </th>
<th>
                            + / -
                        </th>
</tr>
<tr class="team">
<td class="name">
                            Boston Bruins
                        </td>
<td class="year">
                            1990
                        </td>
<td class="wins">
                            44
                        </td>
<td class="losses">
                            2

In [7]:
# Collect every header cell (<th>) contained in the table
titles = table.find_all(name="th")
print(titles)

[<th>
                            Team Name
                        </th>, <th>
                            Year
                        </th>, <th>
                            Wins
                        </th>, <th>
                            Losses
                        </th>, <th>
                            OT Losses
                        </th>, <th>
                            Win %
                        </th>, <th>
                            Goals For (GF)
                        </th>, <th>
                            Goals Against (GA)
                        </th>, <th>
                            + / -
                        </th>]


In [8]:
# Turn each <th> element into its text with surrounding whitespace removed
title_headers = []
for header_cell in titles:
    title_headers.append(header_cell.text.strip())
print(title_headers)

['Team Name', 'Year', 'Wins', 'Losses', 'OT Losses', 'Win %', 'Goals For (GF)', 'Goals Against (GA)', '+ / -']


In [9]:
# Start from an empty frame whose columns are the scraped header names
df = pd.DataFrame(data=None, columns=title_headers)
df

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -


In [12]:
# Scrape every results page and build the DataFrame in a single step.
#
# Rows are collected in a plain list and handed to pd.DataFrame once at the end.
# Appending with df.loc[len(df)] = ... grows the frame one insert at a time and,
# because it extends whatever df already holds, re-running the cell (or running
# it again after an interrupted attempt) leaves duplicate rows behind.
# Rebuilding df from scratch makes this cell safe to re-run.

PAGE_COUNT = 25  # number of result pages to request (page_num = 1..25)

scraped_rows = []
for i in range(1, PAGE_COUNT + 1):
    url = f"https://www.scrapethissite.com/pages/forms/?page_num={i}"
    # timeout: do not hang on a stalled connection;
    # raise_for_status: stop on an HTTP error instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table", class_="table")
    if table is None:
        raise ValueError(f"No <table class='table'> element found on {url}")

    # The first <tr> holds the <th> headers, so data rows start at index 1.
    for row in table.find_all("tr")[1:]:
        each_row = [data.text.strip() for data in row.find_all("td")]
        # A row whose cell count differs from the header count cannot be placed
        # in the frame; report it with its page instead of misaligning columns.
        if len(each_row) != len(title_headers):
            raise ValueError(
                f"Expected {len(title_headers)} cells but found {len(each_row)} "
                f"in a row on {url}"
            )
        scraped_rows.append(each_row)

# Cell values are kept as the stripped strings read from the page.
df = pd.DataFrame(scraped_rows, columns=title_headers)


In [13]:
# Display the populated DataFrame as the cell's output
# (for a long frame the notebook shows only the first and last rows)
df

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -
0,Boston Bruins,1990,44,24,,0.55,299,264,35
1,Buffalo Sabres,1990,31,30,,0.388,292,278,14
2,Calgary Flames,1990,46,26,,0.575,344,263,81
3,Chicago Blackhawks,1990,49,23,,0.613,284,211,73
4,Detroit Red Wings,1990,34,38,,0.425,273,298,-25
...,...,...,...,...,...,...,...,...,...
802,Tampa Bay Lightning,2011,38,36,8,0.463,235,281,-46
803,Toronto Maple Leafs,2011,35,37,10,0.427,231,264,-33
804,Vancouver Canucks,2011,51,22,9,0.622,249,198,51
805,Washington Capitals,2011,42,32,8,0.512,222,230,-8


In [14]:
# Save the scraped table as an Excel workbook (path is relative to the working directory)
output_file = "Hockey Data Extraction.xlsx"
df.to_excel(excel_writer=output_file)