# Web scraping intro

In [1]:
# import necessary libraries
import requests # third-party: `pip install requests` if not already installed
from bs4 import BeautifulSoup as bs # third-party: `pip install beautifulsoup4` (that is the PyPI name of the bs4 package)


In [None]:
# Load our first page

url = 'https://keithgalli.github.io/web-scraping/example.html'
# Always pass a timeout: without one, requests can wait forever on a stalled server.
r = requests.get(url, timeout=10)

# What does r look like? (the Response object and its type)
r, type(r)

In [None]:
str(r.content)

In [None]:
# convert r to a beautiful soup object
# Name the parser explicitly: otherwise bs4 picks whichever parser happens to be
# installed and warns about it, so the parsed tree can differ between machines.
soup = bs(r.content, 'html.parser')
soup

In [None]:
print(soup.prettify())

In [None]:
# start scraping!
# start with find and find_all

soup.find('h2'), soup.find_all('h2')

In [None]:
# storing as variables
first_header=soup.find('h2')
all_headers=soup.find_all('h2')
first_header, all_headers

.find will only find the first occurrence of a tag \
.find_all will find all occurrences of the tag and place each item in a list

In [None]:
# pass in a list of elements to look for
first_header=soup.find(['h1','h2'])
first_header

The order of the names in the list does not matter: `.find` returns the first tag in the document that matches any name in the list.

In [None]:
headers=soup.find_all(['h1','h2'])
headers

In [None]:
# you can pass in attributes to the find/find_all functions.
paragraph=soup.find_all("p")
paragraph

In [None]:
# say we want to find the paragraph that has the attribute id='paragraph-id', use the parameter attrs={}
# to find this attribute with the paragraph tag.

soup.find_all("p", attrs={"id": "paragraph-id"} )

In [None]:
# you can nest find and find all calls.
body = soup.find('body')
body

Now let's say you want to find a 'div' tag: you can call `.find` on the `body` object just created.

In [None]:
div =body.find('div')
div

Now let's do the same process for the header.

In [None]:
header=div.find('h1')
header

In [None]:
# we can search for specific strings in find_all calls.
# say we wanted to find paragraphs that contained the word 'some'

print(body.prettify())

In [None]:
some_paragraphs=body.find_all('p', string='Some')
some_paragraphs

oh no! oh no! Oh no no no no!

In [None]:
# it doesn't quite work...what if we put in 'Some bold text'??
some_paragraphs=body.find_all('p', string='Some bold text')
some_paragraphs

In [None]:
# this situation is not ideal.
# let's leverage our friend regex :)

import re

some_paragraphs=body.find_all('p', string=re.compile('Some'))
some_paragraphs

In [None]:
# another example find all headers that have the word "header" in them

headers=body.find_all('h2', string=re.compile('header'))
headers

This only finds one result because the regex 'header' is case-sensitive. We can also capture the capital "H" by giving it a different pattern: string=re.compile('(H|h)eader')

In [None]:
headers=body.find_all('h2', string=re.compile('(H|h)eader'))
headers

### select (CSS selector)

In [None]:
print(soup.body.prettify())

In [None]:
soup.select('p')

looks the same as find_all. Let's try finding all paragraph tags inside of div.

In [None]:
soup.select('div p')

In [None]:
# select all the paragraphs that come after an h2 with the same parent (`~` is the sibling combinator).
soup.select('h2 ~ p')


In [None]:
# let's do some more of this. It's useful to grab elements with specific id's

In [None]:
soup.select('p#paragraph-id b')

In [None]:
paragraphs=soup.select('body > p')
paragraphs

In [None]:
# we can loop through and make select calls with these objects that we just made. Let's take paragraphs for example
#first look at the type(paragraphs)

type(paragraphs)

In [None]:
# it's a list so we can loop. Let's inspect the elements in the list for their types
type(paragraphs[0])

They are beautiful soup objects, so we can use the select and find_all calls on them. Let's do an example of finding the `<i>` tags inside each paragraph.

In [None]:
paragraphs

In [None]:
# Collect the <i> tags found inside each paragraph (one result list per paragraph).
paragraphs_i = [x.select('i') for x in paragraphs]
paragraphs_i

In [None]:
# Grab an element with a specific property
soup.select("[align='middle']")

# Get different properties of the HTML

In [None]:
# want to grab only the string from an element, use the .string property
header = soup.find('h2')
header, header.string

In [None]:
# let's try with div.
div = soup.find('div')
print(div.prettify())
print(div.string)

We got an answer of None. This is because there are children elements inside of the div, so the .string property is not sure which text to grab.

In [None]:
# Let's try with .get_text()
print(div.get_text())
div.get_text()

In [None]:
# Get a specific property from an element.
# Let's get the href link tag
link = soup.find('a')
link

In [None]:
link['href']

In [None]:
# try to get the paragraph id tag.
paragraphs = soup.select('p#paragraph-id')
paragraphs

In [None]:
type(paragraphs)

In [None]:
paragraphs[0], type(paragraphs[0])

In [None]:
paragraphs[0]['id']

# Code Navigation

In [None]:
# the beautiful soup object
soup

In [None]:
# the <body> portion and all it contains of beautiful soup object
soup.body

In [None]:
soup.div

In [None]:
soup.h1

In [None]:
soup.h1.string

In [None]:
# know the terms parent, sibling, child
# start with a pretty print of the body.

print(soup.body.prettify())

In [None]:
soup.p

The body has a nested structure. The body is the parent, and everything nested directly inside it is one of its children. Elements on the same level are considered siblings, like `<div>` and `<h2>`.

In [None]:
# start with looking at div
div=soup.find('div')
# looks like there are 4 elements that are siblings of the div
div

In [None]:
div.find_next_siblings()

In [None]:
len(div.find_next_siblings())

# Exercises

In [2]:
# Grab all the social link from the webpage.
# webpage: https://keithgalli.github.io/web-scraping/webpage.html

url = 'https://keithgalli.github.io/web-scraping/webpage.html'
# Always pass a timeout: without one, requests can wait forever on a stalled server.
request = requests.get(url, timeout=10)
request

<Response [200]>

In [None]:
# look at the content
request.content

In [3]:
# create a variable 'webpage' that takes request.content and makes it a beautiful soup object.
# The parser is named explicitly so the parsed tree does not depend on which
# optional parsers happen to be installed.
webpage = bs(request.content, 'html.parser')


In [None]:
print(webpage.prettify())

# starting on my own.

In [None]:
# all of the social links look like they are inside of <ul> (unordered list) elements with attribute class='socials'
socials = webpage.find('ul', attrs={'class': 'socials'})
socials

In [None]:
type(socials)

In [None]:
social_str = str(socials)
social_str

In [None]:
social_str.split('\n')

In [None]:
# another way - find_all
socials = webpage.find_all('ul', attrs={'class': 'socials'})
socials

In [None]:
# using select
links = webpage.select('ul > li > a[href*="keith"]')

In [None]:
links

In [None]:
links[0]['href']

In [None]:
links[1]['href']

In [None]:
actual_links = [x['href'] for x in links]
actual_links

In [None]:
# [requests.get(x) for x in actual_links]

# following video solutions

In [None]:
# what happens when I select all of the <a> elements on the page?
webpage.select('a')

This gave us too much... \
On the page all of the socials are stored in an unordered list (ul) with class="socials"

In [None]:
# NOTE in select '#' goes with id names, '.' goes with class names
links=webpage.select('ul.socials')
links

In [None]:
# now we just want the a elements with the href in them.
links=webpage.select('ul.socials a')
links

In [None]:
# we want to get only the links - https://.....
# since the variable 'links' is a list we can do a list comprehension to grab these.

[link['href'] for link in links]


In [None]:
# another way
# this time lets use find
# a starting point. let's see the first '<a>' tag

links=webpage.find('a')
links

this is not what we are looking for :(

In [None]:
# let's try doing something similar to our first approach.
links=webpage.find('ul', attrs={'class': 'socials'})
links

This is the point that I got to when I worked on my own! \
Let's do a step further and grab the actual links.

In [None]:
# from the 'links' object grab all the 'a' tags
a_tags = links.find_all('a')
a_tags

In [None]:
# similar to before, we can now do a list comprehension to grab all the href=''
[link['href'] for link in a_tags]

In [None]:
# one more way
links = webpage.select('li.social a')
links

In [None]:
# and lets loop through one more time
actual_links = [link['href'] for link in links]
actual_links

# Scrape the table from the keithgalli html page.

In [None]:
# get the table_headers from the season stats table
table_headers = webpage.find('table').find_all('th')
table_headers

In [None]:
''.join(table_headers[0]['class'])

In [None]:
column_names=[' '.join(th['class']) for th in table_headers]
column_names

# function: create_column_names_list()

In [4]:
# for the data frame these table headers will be column names.
# we will name the function that creates the list of column names 'create_column_names_list'
def create_column_names_list(webpage):
    """Build the DataFrame column names from the first table's header cells.

    Each <th> of the first <table> in ``webpage`` contributes one name: the
    values of its ``class`` attribute joined by a single space
    (e.g. ['regular', 'gp'] -> 'regular gp').
    """
    header_cells = webpage.find('table').find_all('th')
    names = []
    for cell in header_cells:
        names.append(' '.join(cell['class']))
    return names

column_names_list=create_column_names_list(webpage)
column_names_list


['season',
 'team',
 'league',
 'regular gp',
 'regular g',
 'regular a',
 'regular tp',
 'regular pim',
 'regular pm',
 'separator',
 'postseason',
 'postseason gp',
 'postseason g',
 'postseason a',
 'postseason tp',
 'postseason pim',
 'postseason pm']

### function: create_table_rows_list

In [5]:
# all of the table rows in a list
def create_table_rows_list(webpage):
    """Return every <tr> found inside the first <tbody> of ``webpage``."""
    table_body = webpage.find('tbody')
    return table_body.find_all('tr')

table_rows_list=create_table_rows_list(webpage)

In [None]:
# looking at the first item in the list
# create an empty dictionary to store details
# print(table_rows_list[0].prettify())
# table_row_dict={}
# table_row_dict

In [None]:
# to get the season information go down to where the season is contained in the bs_object.
# first_row = table_rows_list[0]
# first_row.find('td', attrs={'class':'season sorted'}).string

In [None]:
#How does it look with .get_text()?
# first_row.find('td', attrs={'class':'season sorted'}).get_text()

In [None]:
# since these look the same let's clean up the string.
# give the string a variable name 'season_string'
# season_string = first_row.find('td', attrs={'class':'season sorted'}).get_text()
# season_string

In [None]:
# get rid of the white space and new new lines with .strip()
# season_string.strip()

In [None]:
# putting this example into a for a loop to make a list of all of the seasons.
# seasons_list=[]
# for r in table_rows_list:
#     season_string=r.find('td', attrs={'class':'season sorted'}).get_text()
#     seasons_list.append(season_string.strip())
# seasons_list

### function: create_seasons_list()

In [6]:
# let's make the for loop into a function to call the seasons_list for the pandas dataframe.
def create_seasons_list(rows=None):
    """Return the season label (e.g. '2014-15') of every stats-table row.

    Args:
        rows: iterable of <tr> elements to read. Defaults to the notebook-level
            ``table_rows_list`` so existing zero-argument calls keep working.

    Returns:
        list[str]: one whitespace-stripped season string per row; '' for a row
        that has no matching season cell.
    """
    if rows is None:
        rows = table_rows_list  # notebook global built by create_table_rows_list()
    seasons = []
    for row in rows:
        cell = row.find('td', attrs={'class': 'season sorted'})
        # A row without the season cell yields '' instead of failing on None.get_text().
        seasons.append(cell.get_text().strip() if cell is not None else '')
    return seasons

seasons_list = create_seasons_list()
seasons_list

['2014-15', '2015-16', '2016-17', '2017-18', '2018-19']

In [None]:
# grab the team information:
# team_info = table_rows_list[0].find('td', attrs={'class':'team'})
# team_info

In [None]:
# team_name = team_info.find('a').get_text().strip()
# team_name

In [None]:
# team_season_link = team_info.find('a')
# team_season_link['href']

In [None]:
#putting this example into a for loop to make lists of team names & season links
# team_name_list=[]
# team_links_list=[]
# for r in table_rows_list:
#     team_info=r.find('td', attrs={'class':'team'})
#     if team_info.find('a'):
#         team_name=team_info.find('a').get_text().strip()
#         team_season_link=team_info.find('a')
#         team_name_list.append(team_name)
#         team_season_link = team_info.find('a')
#         team_links_list.append(team_season_link['href'])
#     else:
#         team_name=team_info.get_text().strip()
#         team_season_link= ""
#         team_name_list.append(team_name)
#         team_links_list.append(team_season_link)

# team_name_list, team_links_list

### function: create_team_name_links_lists()

In [7]:
# Let's make a function!

def create_team_name_links_lists(rows=None):
    """Return parallel lists of team names and team-season links, one per row.

    Args:
        rows: iterable of <tr> elements to read. Defaults to the notebook-level
            ``table_rows_list`` so existing zero-argument calls keep working.

    Returns:
        tuple[list[str], list[str]]: ``(team_name_list, team_links_list)``.
        When the team cell contains an <a>, the name is the anchor's stripped
        text and the link is its ``href`` ('' if the anchor has no href).
        When there is no <a> (e.g. the 'Did not play' row), the name is the
        cell's own stripped text and the link is ''.
    """
    if rows is None:
        rows = table_rows_list  # notebook global built by create_table_rows_list()
    team_name_list = []
    team_links_list = []
    for row in rows:
        team_info = row.find('td', attrs={'class': 'team'})
        # Single lookup, reused for both the name and the link.
        anchor = team_info.find('a')
        if anchor is not None:
            team_name_list.append(anchor.get_text().strip())
            team_links_list.append(anchor.get('href', ''))
        else:
            team_name_list.append(team_info.get_text().strip())
            team_links_list.append('')
    return team_name_list, team_links_list

(team_name_list, team_links_list) = create_team_name_links_lists()
team_name_list, team_links_list

(['MIT (Mass. Inst. of Tech.)',
  'MIT (Mass. Inst. of Tech.)',
  'MIT (Mass. Inst. of Tech.)',
  'Did not play',
  'MIT (Mass. Inst. of Tech.)'],
 ['https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats',
  'https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2015-2016?tab=stats',
  'https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2016-2017?tab=stats',
  '',
  'https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2018-2019?tab=stats'])

In [None]:
# sanity check: looking at the lists created from the function.
# team_name_list, team_links_list

In [None]:
# get the league info for the 'league' column
# test with the first item in 'table_rows_list'
# league_info=table_rows_list[0].find('td',attrs={'class':'league'})
# league_info

In [None]:
# get the league name with .get_text()
# league = league_info.get_text().strip()
# league

In [None]:
# get the league link with find('a') and ['href']
# league_link=league_info.find('a')
# league_link['href']

In [None]:
# test this with the 'Did not play' table_rows_list[3] info:
# table_rows_list[3]

In [None]:
# league_info=table_rows_list[3].find('td',attrs={'class':'league'})
# league_info.get_text().strip()

In [None]:
# league = league_info.get_text().strip()
# league

In [None]:
# You get empty information for the year that is 'Did not play', table_rows_list[3].
# I think this will be okay to add into the list

# Go forward with creating the loop.

# league_list=[]
# league_link_list=[]
# for r in table_rows_list:
#     league_info=r.find('td', attrs={'class':'league'})
#     league_list.append(league_info.get_text().strip())
#     league_link=league_info.find('a')
#     league_link_list.append(league_link['href'])

# league_list, league_link_list

### function: create_league_name_link_lists()

In [8]:
def create_league_name_link_lists(rows=None):
    """Return parallel lists of league names and league links, one per row.

    Args:
        rows: iterable of <tr> elements to read. Defaults to the notebook-level
            ``table_rows_list`` so existing zero-argument calls keep working.

    Returns:
        tuple[list[str], list[str]]: ``(league_list, league_link_list)``.
        The name is the stripped text of the row's league cell; the link is the
        ``href`` of the first <a> inside it, or '' when there is none.
    """
    if rows is None:
        rows = table_rows_list  # notebook global built by create_table_rows_list()
    league_list = []
    league_link_list = []
    for row in rows:
        league_info = row.find('td', attrs={'class': 'league'})
        league_list.append(league_info.get_text().strip())
        anchor = league_info.find('a')
        # A cell without an <a> gets '' (the same placeholder
        # create_team_name_links_lists uses) instead of a TypeError on None['href'].
        league_link_list.append(anchor.get('href', '') if anchor is not None else '')
    return league_list, league_link_list

league_list, league_link_list=create_league_name_link_lists()

In [None]:
league_list

In [None]:
league_link_list

In [None]:
# look at table_rows_list again to see whats next...
# table_rows_list

In [None]:
# what were the column headers again?
# print(column_names_list)

In [None]:
# try to see if there is the same 'for' loop being repeated so we could use parameters to put into a function.
# gp_list=[]
# for r in table_rows_list:
#     gp_info=r.find('td', attrs={'class':'regular gp'})
#     if gp_info.get_text()=='':
#         gp_list.append(0)
        
#     else:
#         gp_list.append(gp_info.get_text())

# gp_list

In [None]:
# [int(x) for x in gp_list]

In [None]:
# g_list=[]
# for r in table_rows_list:
#     g_info=r.find('td', attrs={'class': 'regular g'})
#     if g_info.get_text()=='':
#         g_list.append(0)
#     else:
#         g_list.append(g_info.get_text())
# g_list

In [None]:
# [int(x) for x in g_list]

In [None]:
# ***Do not use this piece**
# :later created a function to get these names as they look in the 'column_names_list' variable.

# work on creating a function that will make lists for each numerical stats.
# stats=['GP','G','A','TP','PIM', '+/-']
# stats

In [None]:
# lower the stats names to match the text in html.
# stats = [str.lower(x) for x in stats]

In [None]:
# stats_dictionary = {}
# for x in stats:
#     stats_dictionary[x]=''
# stats_dictionary


In [None]:
# stats_dictionary['gp'].append(0)
# stats_dictionary

In [None]:
# for i in stats_dictionary:
#     info=table_rows_list[0].find('td', attrs={'class': f'regular {i}'})
#     if info:
#         stats_dictionary[i]=(int(info.get_text()))
#     else:
#         stats_dictionary[i]='Null'
    
# stats_dictionary

In [None]:
# Let's see whats going on with the 'Did Not Play Row'
# table_rows_list[3]

In [None]:
# for i in stats_dictionary:
#     if table_rows_list[3].find('td', attrs={'class': f'regular {i}'}):
#         info=table_rows_list[3].find('td', attrs={'class': f'regular {i}'})
#         if info.get_text()=='':
#             print(i, 'No Text')
#         else:
#             print(i)
#     else:
#         print(i, 'Null')
    

In [None]:
# create a for loop to put all of the stats together into a list of dictionaries

# numeric_stats_list=[]
# for r in table_rows_list:
#     stats=['GP','G','A','TP','PIM', '+/-']
#     stats=[str.lower(x) for x in stats]
#     stats_dictionary={}
#     for i in stats:
#         if r.find('td', attrs={'class': f'regular {i}'}):
#             info=r.find('td', attrs={'class': f'regular {i}'})
#             if info.get_text()!='':
#                 stats_dictionary[i]=(int(info.get_text()))
#             else:
#                 stats_dictionary[i]=''
#         else:
#             stats_dictionary[i]=''
#     numeric_stats_list.append(stats_dictionary)

In [None]:
# numeric_stats_list

In [None]:
# Thinking of an idea
# Did not use this, not important

# for k,v in enumerate(numeric_stats_list):
#     print(k,v)

### Function create_regular_stats_list()

In [9]:
def create_regular_stats_list(rows=None):
    """Return one dictionary of regular-season stats per table row.

    Args:
        rows: iterable of <tr> elements to read. Defaults to the notebook-level
            ``table_rows_list`` so existing zero-argument calls keep working.

    Returns:
        list[dict]: for each row, a dict keyed by 'regular gp', 'regular g',
        'regular a', 'regular tp', 'regular pim' and 'regular pm'. A value is
        the cell text converted with int(), or '' when the cell is missing or
        holds only whitespace.

    Raises:
        ValueError: if a cell holds non-blank text that int() cannot parse.
    """
    if rows is None:
        rows = table_rows_list  # notebook global built by create_table_rows_list()
    # Built once: the stat names do not change from row to row.
    stat_names = ['regular gp', 'regular g', 'regular a',
                  'regular tp', 'regular pim', 'regular pm']
    regular_stats_list = []
    for row in rows:
        stats_dictionary = {}
        for name in stat_names:
            cell = row.find('td', attrs={'class': name})
            # Strip first so a whitespace-only cell counts as empty rather than
            # being passed to int().
            text = cell.get_text().strip() if cell is not None else ''
            stats_dictionary[name] = int(text) if text else ''
        regular_stats_list.append(stats_dictionary)
    return regular_stats_list
    


regular_stats_list=create_regular_stats_list()
regular_stats_list

[{'regular gp': 17,
  'regular g': 3,
  'regular a': 9,
  'regular tp': 12,
  'regular pim': 20,
  'regular pm': ''},
 {'regular gp': 9,
  'regular g': 1,
  'regular a': 1,
  'regular tp': 2,
  'regular pim': 2,
  'regular pm': ''},
 {'regular gp': 12,
  'regular g': 5,
  'regular a': 5,
  'regular tp': 10,
  'regular pim': 8,
  'regular pm': 0},
 {'regular gp': '',
  'regular g': '',
  'regular a': '',
  'regular tp': '',
  'regular pim': '',
  'regular pm': ''},
 {'regular gp': 8,
  'regular g': 5,
  'regular a': 10,
  'regular tp': 15,
  'regular pim': 8,
  'regular pm': ''}]

In [None]:
# exploring the separator row
# separator_info=table_rows_list[0].find('td', attrs={'class':'separator'})

In [None]:
# separator_info.get_text().strip()

In [None]:
# create a for loop to contain all of the separators

# separators=[]
# for r in table_rows_list:
#     sep = r.find('td',attrs={'class':'separator'})
#     separators.append(sep.get_text().strip())

# separators

In [None]:
#list comprehension
# [x.get_text().strip()for x in [r.find('td', attrs={'class':'separator'}) for r in table_rows_list]]

### function: create_postseason_stat_list() (used for the separator & postseason columns)

In [10]:
def create_postseason_stat_list(column_name, rows=None):
    """Return the stripped text of one column's cell for every table row.

    Args:
        column_name: value of the ``class`` attribute identifying the <td>,
            e.g. 'separator' or 'postseason gp'.
        rows: iterable of <tr> elements to read. Defaults to the notebook-level
            ``table_rows_list`` so existing one-argument calls keep working.

    Returns:
        list[str]: one entry per row; '' when the row has no such cell.
    """
    if rows is None:
        rows = table_rows_list  # notebook global built by create_table_rows_list()
    column_list = []
    for row in rows:
        cell = row.find('td', attrs={'class': column_name})
        # A row without the cell yields '' instead of failing on None.get_text().
        column_list.append(cell.get_text().strip() if cell is not None else '')
    return column_list

separtors_list=create_postseason_stat_list('separator')
separtors_list

['|', '|', '|', '|', '|']

In [None]:
#get post season text
# post_info=table_rows_list[0].find('td', attrs={'class':'postseason'})
# post_info.get_text().strip()

In [None]:
# get post_season_links
# post_info=table_rows_list[0].find('td', attrs={'class':'postseason'})
# post_a=post_info.find('a')
# post_a['href']

In [None]:
# ['no link' if link=='no link' else link['href'] for link in [a_tag.find('a') if a_tag.find('a') else 'no link' for a_tag in [x.find('td', attrs={'class':'postseason'}) for x in table_rows_list]]]


### function: create_postseason_links()

In [11]:
def create_postseason_links(rows=None):
    """Return the postseason link of every table row.

    Args:
        rows: iterable of <tr> elements to read. Defaults to the notebook-level
            ``table_rows_list`` so existing zero-argument calls keep working.

    Returns:
        list[str]: for each row, the ``href`` of the first <a> inside the
        'postseason' cell, or the placeholder 'no link' when the cell, the
        anchor, or the href is missing.
    """
    if rows is None:
        rows = table_rows_list  # notebook global built by create_table_rows_list()
    postseason_links = []
    for row in rows:
        cell = row.find('td', attrs={'class': 'postseason'})
        anchor = cell.find('a') if cell is not None else None
        if anchor is None:
            postseason_links.append('no link')
        else:
            postseason_links.append(anchor.get('href', 'no link'))
    return postseason_links

postseason_links=create_postseason_links()
postseason_links

['https://www.eliteprospects.com/league/acha-ii/stats/2014-2015',
 'https://www.eliteprospects.com/league/acha-ii/stats/2015-2016',
 'no link',
 'https://www.eliteprospects.com/stats',
 'https://www.eliteprospects.com/league/acha-iii/stats/2018-2019']

In [None]:
# wanted to look at the table_rows_list again
# table_rows_list

### function: create_postseason_stats

In [12]:
regular_stats_values_list = create_regular_stats_list()
regular_stats_values_list

[{'regular gp': 17,
  'regular g': 3,
  'regular a': 9,
  'regular tp': 12,
  'regular pim': 20,
  'regular pm': ''},
 {'regular gp': 9,
  'regular g': 1,
  'regular a': 1,
  'regular tp': 2,
  'regular pim': 2,
  'regular pm': ''},
 {'regular gp': 12,
  'regular g': 5,
  'regular a': 5,
  'regular tp': 10,
  'regular pim': 8,
  'regular pm': 0},
 {'regular gp': '',
  'regular g': '',
  'regular a': '',
  'regular tp': '',
  'regular pim': '',
  'regular pm': ''},
 {'regular gp': 8,
  'regular g': 5,
  'regular a': 10,
  'regular tp': 15,
  'regular pim': 8,
  'regular pm': ''}]

In [13]:
# Postseason column names: the regular-season stat names with 'regular' swapped
# for 'postseason', followed by the 'postseason' and 'separator' columns.
post_stats_names_list = [x.replace('regular', 'postseason') for x in regular_stats_values_list[0]]
post_stats_names_list += ['postseason', 'separator']
post_stats_names_list

['postseason gp',
 'postseason g',
 'postseason a',
 'postseason tp',
 'postseason pim',
 'postseason pm',
 'postseason',
 'separator']

### function: create_postseason_stats_dictionary()

In [15]:
def create_postseason_stats_dictionary(post_stats_names_list):
    """Map each column name to the list of its per-row text values.

    Every name in ``post_stats_names_list`` is passed to
    create_postseason_stat_list(), whose result becomes that name's value.
    """
    return {
        name: create_postseason_stat_list(name)
        for name in post_stats_names_list
    }

postseason_stats_dictionary = create_postseason_stats_dictionary(post_stats_names_list)
postseason_stats_dictionary

{'postseason gp': ['', '', '', '', ''],
 'postseason g': ['', '', '', '', ''],
 'postseason a': ['', '', '', '', ''],
 'postseason tp': ['', '', '', '', ''],
 'postseason pim': ['', '', '', '', ''],
 'postseason pm': ['', '', '', '', ''],
 'postseason': ['', '', '', '', ''],
 'separator': ['|', '|', '|', '|', '|']}

# creating the whole table/enchilada

In [None]:
# we start with an empty dictionary 'table_row_dict'
# (it must be created here: the only earlier definition is commented out)
table_row_dict = {}
table_row_dict

In [None]:
# use the column_names_list to fill in the keys of the dictionary.
for x in column_names_list:
    table_row_dict[x]=''

table_row_dict

In [None]:
# use the season_list to fill in the values of the 'season' key.
table_row_dict['season'] = seasons_list
table_row_dict

In [None]:
# use the team_names_list to fill in the values of the 'team' key.
table_row_dict['team']=team_name_list
table_row_dict

In [None]:
# use the league_link_list to fill in the values of the 'league_link_list' key.
table_row_dict['league_link_list']=league_link_list
table_row_dict

In [None]:
# use the 'league_list' to fill in the values of the 'league' key.
table_row_dict['league'] = league_list
table_row_dict

In [None]:
# 'regular_stats_values_list' is a list of dictionaries that contains each regular season's (row's) stats.   
regular_stats_values_list

In [None]:
# create regular_stats_names_list: the keys of the first row's stats dictionary, in order
regular_stats_names_list = list(regular_stats_values_list[0])

regular_stats_names_list

In [None]:
# create post_season_names_list: the regular-season stat names with 'regular'
# swapped for 'postseason', followed by the 'postseason' and 'separator' columns
post_season_names_list = [x.replace('regular', 'postseason') for x in regular_stats_names_list]
post_season_names_list += ['postseason', 'separator']
post_season_names_list

In [None]:
# One empty list per regular-season stat name, to be filled with per-row values.
regular_stats_dictionary = {name: [] for name in regular_stats_names_list}
regular_stats_dictionary

In [None]:
# # learning/trying out somthing
# for k,v in enumerate(create_regular_stats_list()):
#     print(k,v)

In [None]:
# # learning how to navigate in each dictionary to create for loop.
# create_regular_stats_list()[0]['regular gp']

In [None]:
# Turn the list of per-row dictionaries into one list per stat (row order preserved).
# The lists are rebuilt rather than appended to, so re-running this cell does not
# duplicate the values.
regular_stats_dictionary = {
    k: [row[k] for row in regular_stats_values_list]
    for k in regular_stats_dictionary
}

regular_stats_dictionary

In [None]:
# add the regular season stats to table_row_dict
# (the dictionary is named regular_stats_dictionary; 'regular_stats_dict' is never defined)
table_row_dict.update(regular_stats_dictionary)

table_row_dict

In [None]:
# copy every column of postseason_stats_dictionary (postseason stats, 'postseason'
# and 'separator') into table_row_dict
table_row_dict.update(postseason_stats_dictionary)

table_row_dict

In [None]:
# use the post_season_links_list to fill in the 'post_season_links' key
table_row_dict['post_season_links'] = postseason_links
table_row_dict

In [None]:
# Now that we have the table information web scraped and stored in the table_row dictionary, we can import pandas and create the DataFrame.
# NOTE: We could have imported pandas at the beginning in our imports section at the top. (this is usually preferred)

import pandas as pd

In [None]:
df = pd.DataFrame(table_row_dict)
df

In [None]:
# clean up a little bit. join the column names with '_' i.e. 'regular_gp'.

# get the column names:
df.columns

In [None]:
# Spaces become underscores, then every 'pm' becomes '+/-'.
new_col_names_list = [
    name.replace(' ', '_').replace('pm', '+/-')
    for name in df.columns
]

new_col_names_list

In [None]:
df.columns = new_col_names_list

In [None]:
df