# Data and How To Source It

# What is an API?

API stands for Application Programming Interface

# How do we make HTTPS requests in Python

In [None]:
import requests

In [None]:
# Issues a get request to the API Endpoint
r = requests.get('https://api.github.com/events')

In [None]:
#Attribute that gets me the raw text as a string
r.text

In [None]:
r.json()[0]

In [None]:
# It's as simple as that. Sometimes, the encoding isn't right. Requests tries to infer
# What encoding to use, so you can change it manually with
# r.encoding = 'encoding_to_use'
# We can see that the returned object is what we are operating on.
#UTF-8

# Let's work through an example

In [None]:
r = requests.get('https://data.cityofnewyork.us/resource/f9bf-2cp4.json')

In [None]:
r.text

In [None]:
r.json()

In [None]:
import pandas as pd

In [None]:
df = pd.read_json(r.text)

In [None]:
df


In [None]:
payload = {'$limit':10,"$$app_token":"tq19sOJfOJFRot5OrORkOEbsS"}
r = requests.get('https://data.cityofnewyork.us/resource/f9bf-2cp4.json', params=payload)

In [None]:
r.text

In [None]:
r.json()

In [None]:
test = pd.read_json(r.text)
test

In [None]:
test.boxplot(column='sat_critical_reading_avg_score')

In [None]:
# form a get to grab the websites HTML
r = requests.get('https://basketball-reference.com/teams/PHI/2022.html')

In [None]:
from bs4 import BeautifulSoup

In [100]:
# turn the raw HTML into a soup object
soup = BeautifulSoup(r.text,'html.parser')

In [None]:
#One way of finding the table that contains the data
earnest_attempt = soup.findAll('div',attrs={'id':'div_roster'})

In [None]:
#Another way of finding the table that contains the data.
found_table = soup.findAll('table')

In [None]:
#I know the roster table is the first table that appears, so I'm grabbing it
roster_table = found_table[0]

In [None]:
#Grab all the table rows from the table.
player_rows = roster_table('tr')

In [None]:
player_rows

In [None]:
#I know that I don't need the first row, so I'm deleting it
del player_rows[0]

In [None]:
player_rows

In [None]:
#I am selecting all the Table Data from the first row
player_rows[0].find_all('td')

In [None]:
#List comprehension to extract all the text data from each TD element.
extracted_text = [ele.text for ele in player_rows[0].find_all('td') ]

In [None]:
extracted_text 

In [None]:
#This is going to serve as the keys in my k,v pairings
keys = ['Player_Name','Position','Height','Weight','DOB','COB','Experience','Alma_Matter']

In [None]:
#initialize a dict to hold the result of my for loop
player_dict = []


#Iterate over the list of rows in my table
#apply the same logic as before.
#Locate all the td elements
#extract all the text from the td elements
#zip them with my keys
for player in player_rows:
    found_rows = player.find_all('td')
    stripped_text = [ele.text for ele in found_rows]
    player_dict.append(dict(zip(keys,stripped_text))) 

In [96]:
player_dict

[{'Player_Name': 'Georges Niang',
  'Position': 'PF',
  'Height': '6-7',
  'Weight': '230',
  'DOB': 'June 17, 1993',
  'COB': 'us',
  'Experience': '5',
  'Alma_Matter': 'Iowa State'},
 {'Player_Name': 'Tyrese Maxey',
  'Position': 'PG',
  'Height': '6-2',
  'Weight': '200',
  'DOB': 'November 4, 2000',
  'COB': 'us',
  'Experience': '1',
  'Alma_Matter': 'Kentucky'},
 {'Player_Name': 'Tobias Harris',
  'Position': 'PF',
  'Height': '6-8',
  'Weight': '226',
  'DOB': 'July 15, 1992',
  'COB': 'us',
  'Experience': '10',
  'Alma_Matter': 'Tennessee'},
 {'Player_Name': 'Joel Embiid',
  'Position': 'C',
  'Height': '7-0',
  'Weight': '280',
  'DOB': 'March 16, 1994',
  'COB': 'cm',
  'Experience': '5',
  'Alma_Matter': 'Kansas'},
 {'Player_Name': 'Furkan Korkmaz',
  'Position': 'SG',
  'Height': '6-7',
  'Weight': '202',
  'DOB': 'July 24, 1997',
  'COB': 'tr',
  'Experience': '4',
  'Alma_Matter': ''},
 {'Player_Name': 'Matisse Thybulle',
  'Position': 'SG',
  'Height': '6-5',
  'Weight

In [None]:
#create a new df from my list of Dicts
df = pd.DataFrame(player_dict)

In [97]:
df

Unnamed: 0,Player_Name,Position,Height,Weight,DOB,COB,Experience,Alma_Matter
0,Georges Niang,PF,6-7,230,"June 17, 1993",us,5,Iowa State
1,Tyrese Maxey,PG,6-2,200,"November 4, 2000",us,1,Kentucky
2,Tobias Harris,PF,6-8,226,"July 15, 1992",us,10,Tennessee
3,Joel Embiid,C,7-0,280,"March 16, 1994",cm,5,Kansas
4,Furkan Korkmaz,SG,6-7,202,"July 24, 1997",tr,4,
5,Matisse Thybulle,SG,6-5,201,"March 4, 1997",us,2,Washington
6,Danny Green,SF,6-6,215,"June 22, 1987",us,12,UNC
7,Isaiah Joe,SG,6-4,165,"July 2, 1999",us,1,Arkansas
8,Shake Milton,PG,6-5,205,"September 26, 1996",us,3,SMU
9,Paul Reed,C,6-9,210,"June 14, 1999",us,1,DePaul


In [None]:
#Method number 2 using lists
player_dict = []
for player in player_rows:
    found_rows = player.find_all('td')
    stripped_text = [ele.text for ele in found_rows]
    player_dict.append(stripped_text)

In [None]:
#Initialize my DF using list of lists
df = pd.DataFrame(player_dict,columns=['Player_Name','Position','Height','Weight','DOB','COB','Experience','Alma_Matter'])


In [98]:
df

Unnamed: 0,Player_Name,Position,Height,Weight,DOB,COB,Experience,Alma_Matter
0,Georges Niang,PF,6-7,230,"June 17, 1993",us,5,Iowa State
1,Tyrese Maxey,PG,6-2,200,"November 4, 2000",us,1,Kentucky
2,Tobias Harris,PF,6-8,226,"July 15, 1992",us,10,Tennessee
3,Joel Embiid,C,7-0,280,"March 16, 1994",cm,5,Kansas
4,Furkan Korkmaz,SG,6-7,202,"July 24, 1997",tr,4,
5,Matisse Thybulle,SG,6-5,201,"March 4, 1997",us,2,Washington
6,Danny Green,SF,6-6,215,"June 22, 1987",us,12,UNC
7,Isaiah Joe,SG,6-4,165,"July 2, 1999",us,1,Arkansas
8,Shake Milton,PG,6-5,205,"September 26, 1996",us,3,SMU
9,Paul Reed,C,6-9,210,"June 14, 1999",us,1,DePaul


In [103]:
soup.findAll('table')

[<table class="sortable stats_table" data-cols-to-freeze=",2" id="roster">
 <caption>Roster Table</caption>
 <colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup>
 <thead>
 <tr>
 <th aria-label="No." class="poptip sort_default_asc center" data-stat="number" data-tip="Uniform Number" scope="col">No.</th>
 <th aria-label="Player" class="poptip sort_default_asc center" data-stat="player" scope="col">Player</th>
 <th aria-label="Pos" class="poptip sort_default_asc center" data-stat="pos" data-tip="Position" scope="col">Pos</th>
 <th aria-label="Ht" class="poptip sort_default_asc center" data-stat="height" data-tip="Height" scope="col">Ht</th>
 <th aria-label="Wt" class="poptip sort_default_asc center" data-stat="weight" data-tip="Weight" scope="col">Wt</th>
 <th aria-label="Birth Date" class="poptip sort_default_asc center" data-stat="birth_date" scope="col">Birth Date</th>
 <th aria-label=" " class="poptip center" data-stat="birth_country" scope="col"> </th>
 <th ari

In [104]:
import selenium

In [110]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


In [None]:
#driver = webdriver.Chrome(executable_path='C:/path/to/chromedriver.exe')

In [114]:
#Code to initialize the chrome driver
driver = webdriver.Chrome()

#have the chrome browser issue a get request to googl
driver.get("http://www.google.com")

In [115]:
search_box = driver.find_element(By.NAME, "q")

In [116]:
search_button = driver.find_element(By.NAME, "btnK")

In [117]:
search_box.send_keys("Selenium")

In [118]:
search_button.click()

In [119]:
driver.find_element(By.NAME, "q").get_attribute("value")

'Selenium'

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys



#driver = webdriver.Chrome(executable_path='C:/path/to/chromedriver.exe')
#This is for you want the driver in a static place

driver = webdriver.Chrome()
#Initialize the chrome instance

driver.get("http://www.google.com")
#Issue a get request to the website


search_box = driver.find_element(By.NAME, "q")
#Search the webpage for an element called a, set it equal to search_box


driver.implicitly_wait(2)
#Tell the driver to not do anything for 2 seconds ( Why? It's because we need the search button to load in after some text is being put into the search box)

search_button = driver.find_element(By.NAME, "btnK")
#Find the element on the webpage called btnL


search_box.send_keys("Selenium")
#Mimic the keystrokes S E L E N I U M to the search box


search_button.click()
#Mimic a click of the button on the search_button element



driver.find_element(By.NAME, "q").get_attribute("value") # => "Selenium"
#Find the element on the webpage called q, and get the attribute value.


In [142]:
driver = webdriver.Chrome()
driver.implicitly_wait(2)



driver.get("https://www.basketball-reference.com/teams/PHI/2022.html")
roster_table = driver.find_element(By.ID, "roster")
raw_html = roster_table.get_attribute('innerHTML')
print(roster_table.get_attribute('innerHTML'))
driver.close()



    <caption>Roster Table</caption>
    

   <colgroup><col><col><col><col><col><col><col><col><col></colgroup>
   <thead>      
      <tr>
         <th aria-label="No." data-stat="number" scope="col" class=" poptip sort_default_asc center" data-tip="Uniform Number">No.</th>
         <th aria-label="Player" data-stat="player" scope="col" class=" poptip sort_default_asc center">Player</th>
         <th aria-label="Pos" data-stat="pos" scope="col" class=" poptip sort_default_asc center" data-tip="Position">Pos</th>
         <th aria-label="Ht" data-stat="height" scope="col" class=" poptip sort_default_asc center" data-tip="Height">Ht</th>
         <th aria-label="Wt" data-stat="weight" scope="col" class=" poptip sort_default_asc center" data-tip="Weight">Wt</th>
         <th aria-label="Birth Date" data-stat="birth_date" scope="col" class=" poptip sort_default_asc center">Birth Date</th>
         <th aria-label="&nbsp;" data-stat="birth_country" scope="col" class=" poptip center">&nbsp;

In [143]:
soup = BeautifulSoup(raw_html,'html.parser')

In [123]:
all_rows = soup.findAll('tr')

In [125]:
del all_rows[0]

In [129]:
player_dict = []
player_list = []
keys = ['Player_Name','Position','Height','Weight','DOB','COB','Number','Alma_Matter']
for player in all_rows:
    found_rows= player.find_all('td')
    stripped_text = [ele.text for ele in found_rows]
    player_dict.append(dict(zip(keys,stripped_text)))


player_dict

[{'Player_Name': 'Georges Niang',
  'Position': 'PF',
  'Height': '6-7',
  'Weight': '230',
  'DOB': 'June 17, 1993',
  'COB': 'us',
  'Number': '5',
  'Alma_Matter': 'Iowa State'},
 {'Player_Name': 'Tyrese Maxey',
  'Position': 'PG',
  'Height': '6-2',
  'Weight': '200',
  'DOB': 'November 4, 2000',
  'COB': 'us',
  'Number': '1',
  'Alma_Matter': 'Kentucky'},
 {'Player_Name': 'Tobias Harris',
  'Position': 'PF',
  'Height': '6-8',
  'Weight': '226',
  'DOB': 'July 15, 1992',
  'COB': 'us',
  'Number': '10',
  'Alma_Matter': 'Tennessee'},
 {'Player_Name': 'Joel Embiid',
  'Position': 'C',
  'Height': '7-0',
  'Weight': '280',
  'DOB': 'March 16, 1994',
  'COB': 'cm',
  'Number': '5',
  'Alma_Matter': 'Kansas'},
 {'Player_Name': 'Furkan Korkmaz',
  'Position': 'SG',
  'Height': '6-7',
  'Weight': '202',
  'DOB': 'July 24, 1997',
  'COB': 'tr',
  'Number': '4',
  'Alma_Matter': ''},
 {'Player_Name': 'Matisse Thybulle',
  'Position': 'SG',
  'Height': '6-5',
  'Weight': '201',
  'DOB': '

In [130]:
df = pd.DataFrame(player_dict)

In [131]:
df

Unnamed: 0,Player_Name,Position,Height,Weight,DOB,COB,Number,Alma_Matter
0,Georges Niang,PF,6-7,230,"June 17, 1993",us,5,Iowa State
1,Tyrese Maxey,PG,6-2,200,"November 4, 2000",us,1,Kentucky
2,Tobias Harris,PF,6-8,226,"July 15, 1992",us,10,Tennessee
3,Joel Embiid,C,7-0,280,"March 16, 1994",cm,5,Kansas
4,Furkan Korkmaz,SG,6-7,202,"July 24, 1997",tr,4,
5,Matisse Thybulle,SG,6-5,201,"March 4, 1997",us,2,Washington
6,Danny Green,SF,6-6,215,"June 22, 1987",us,12,UNC
7,Isaiah Joe,SG,6-4,165,"July 2, 1999",us,1,Arkansas
8,Shake Milton,PG,6-5,205,"September 26, 1996",us,3,SMU
9,Paul Reed,C,6-9,210,"June 14, 1999",us,1,DePaul


In [156]:
driver = webdriver.Chrome()
driver.implicitly_wait(2)



driver.get("https://www.basketball-reference.com/teams/PHI/2022.html")
salaries_table = driver.find_element(By.ID, "salaries2")
raw_html = salaries_table.get_attribute('innerHTML')
print(salaries_table.get_attribute('innerHTML'))
driver.close()



    <caption>Salaries Table</caption>
    

   <colgroup><col><col><col></colgroup>
   <thead>      
      <tr>
         <th aria-label="Rk" data-stat="ranker" scope="col" class="ranker poptip sort_default_asc show_partial_when_sorting center" data-tip="Rank">Rk</th>
         <th aria-label="&nbsp;" data-stat="player" scope="col" class=" poptip sort_default_asc center">&nbsp;</th>
         <th aria-label="Salary" data-stat="salary" scope="col" class=" poptip right">Salary</th>
      </tr>
      </thead>
<tbody><tr><th scope="row" class="center " data-stat="ranker" csk="1">1</th><td class="left " data-stat="player"><a href="/players/h/hardeja01.html">James Harden</a></td><td class="right " data-stat="salary" csk="43848000">$43,848,000</td></tr>
<tr><th scope="row" class="center " data-stat="ranker" csk="2">2</th><td class="left " data-stat="player"><a href="/players/h/harrito02.html">Tobias Harris</a></td><td class="right " data-stat="salary" csk="36000000">$36,000,000</td></tr>
<tr><t

In [157]:
soup = BeautifulSoup(raw_html,'html.parser')

In [159]:
player_salaries = soup.findAll('tr')

In [160]:
del player_salaries[0]

In [161]:
player_salaries

[<tr><th class="center" csk="1" data-stat="ranker" scope="row">1</th><td class="left" data-stat="player"><a href="/players/h/hardeja01.html">James Harden</a></td><td class="right" csk="43848000" data-stat="salary">$43,848,000</td></tr>,
 <tr><th class="center" csk="2" data-stat="ranker" scope="row">2</th><td class="left" data-stat="player"><a href="/players/h/harrito02.html">Tobias Harris</a></td><td class="right" csk="36000000" data-stat="salary">$36,000,000</td></tr>,
 <tr><th class="center" csk="3" data-stat="ranker" scope="row">3</th><td class="left" data-stat="player"><a href="/players/e/embiijo01.html">Joel Embiid</a></td><td class="right" csk="31579390" data-stat="salary">$31,579,390</td></tr>,
 <tr><th class="center" csk="4" data-stat="ranker" scope="row">4</th><td class="left" data-stat="player"><a href="/players/g/greenda02.html">Danny Green</a></td><td class="right" csk="10000000" data-stat="salary">$10,000,000</td></tr>,
 <tr><th class="center" csk="5" data-stat="ranker" sc

In [163]:
player_salaries_list = []
for player in player_salaries:
    found_rows= player.find_all('td')
    stripped_text = [ele.text for ele in found_rows]
    player_salaries_list.append(stripped_text)




In [164]:
player_salaries_list

[['James Harden', '$43,848,000'],
 ['Tobias Harris', '$36,000,000'],
 ['Joel Embiid', '$31,579,390'],
 ['Danny Green', '$10,000,000'],
 ['Furkan Korkmaz', '$4,629,630'],
 ['Georges Niang', '$3,300,000'],
 ['Matisse Thybulle', '$2,840,160'],
 ['Paul Millsap', '$2,641,691'],
 ['Tyrese Maxey', '$2,602,920'],
 ['Jaden Springer', '$2,023,800'],
 ['Shake Milton', '$1,846,738'],
 ['Isaiah Joe', '$1,517,981'],
 ['Paul Reed', '$1,517,981'],
 ['George Hill', '$1,275,491'],
 ['Charles Bassey', '$925,258'],
 ['DeAndre Jordan', '$592,103'],
 ['Tyler Johnson', '$128,709'],
 ['Willie Cauley-Stein', '$120,083'],
 ['Charlie Brown Jr.', '$95,930'],
 ['Braxton Key', '$53,176']]