# Reading HTML tables

*Requires the `lxml` package (e.g. `pip install lxml`), which `pandas.read_html` uses as its HTML parser.*

In [1]:
from io import StringIO

import pandas as pd

## Parsing raw HTML strings

- using `pd.read_html()`

In [2]:
# Sample HTML table: the header row sits in <thead> (using <th> cells) and the
# four data rows sit in <tbody>.
html_string = """
<table>
    <thead>
      <tr>
        <th>Order date</th>
        <th>Region</th> 
        <th>Item</th>
        <th>Units</th>
        <th>Unit cost</th>
      </tr>
    </thead>
    <tbody>
      <tr>
        <td>1/6/2018</td>
        <td>East</td> 
        <td>Pencil</td>
        <td>95</td>
        <td>1.99</td>
      </tr>
      <tr>
        <td>1/23/2018</td>
        <td>Central</td> 
        <td>Binder</td>
        <td>50</td>
        <td>19.99</td>
      </tr>
      <tr>
        <td>2/9/2018</td>
        <td>Central</td> 
        <td>Pencil</td>
        <td>36</td>
        <td>4.99</td>
      </tr>
      <tr>
        <td>3/15/2018</td>
        <td>West</td> 
        <td>Pen</td>
        <td>27</td>
        <td>19.99</td>
      </tr>
    </tbody>
</table>
"""

In [3]:
# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated (pandas >= 2.1 emits a FutureWarning). The result is a list with
# one DataFrame per <table> found.
dfs = pd.read_html(StringIO(html_string))

  dfs = pd.read_html(html_string)


In [4]:
from IPython.display import HTML, display

# Render the raw markup in the output area so it can be compared visually
# with the parsed DataFrame.
display(HTML(html_string))

Order date,Region,Item,Units,Unit cost
1/6/2018,East,Pencil,95,1.99
1/23/2018,Central,Binder,50,19.99
2/9/2018,Central,Pencil,36,4.99
3/15/2018,West,Pen,27,19.99


In [5]:
len(dfs)  # read_html returns a list of DataFrames; count how many tables were parsed

1

In [6]:
df = dfs[0]  # Take the first (here: only) parsed table
df

Unnamed: 0,Order date,Region,Item,Units,Unit cost
0,1/6/2018,East,Pencil,95,1.99
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99
3,3/15/2018,West,Pen,27,19.99


In [7]:
df.shape  # (number of rows, number of columns)

(4, 5)

In [8]:
# Boolean-mask selection: keep only the orders from the Central region.
df.loc[df['Region'].eq('Central')]

Unnamed: 0,Order date,Region,Item,Units,Unit cost
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99


In [9]:
# Boolean-mask selection: keep only the orders with more than 35 units.
df.loc[df['Units'].gt(35)]

Unnamed: 0,Order date,Region,Item,Units,Unit cost
0,1/6/2018,East,Pencil,95,1.99
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99


## Defining header

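- pandas will automatically find the header to use thanks to the `<thead>` tag; when the table has no `<thead>`, the header row must be specified explicitly with the `header` argument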

In [10]:
# Same table contents as before, but without <thead>/<th>: the column titles
# are ordinary <td> cells in the first <tr>.
html_string = """
<table>
  <tr>
    <td>Order date</td>
    <td>Region</td> 
    <td>Item</td>
    <td>Units</td>
    <td>Unit cost</td>
  </tr>
  <tr>
    <td>1/6/2018</td>
    <td>East</td> 
    <td>Pencil</td>
    <td>95</td>
    <td>1.99</td>
  </tr>
  <tr>
    <td>1/23/2018</td>
    <td>Central</td> 
    <td>Binder</td>
    <td>50</td>
    <td>19.99</td>
  </tr>
  <tr>
    <td>2/9/2018</td>
    <td>Central</td> 
    <td>Pencil</td>
    <td>36</td>
    <td>4.99</td>
  </tr>
  <tr>
    <td>3/15/2018</td>
    <td>West</td> 
    <td>Pen</td>
    <td>27</td>
    <td>19.99</td>
  </tr>
</table>
"""

In [11]:
# No header argument: without <thead>/<th> the first row is kept as data and
# the columns get default integer labels (0, 1, 2, ...).
# StringIO avoids the deprecated literal-HTML input of read_html.
pd.read_html(StringIO(html_string))[0]

  pd.read_html(html_string)[0]  # Use the first row as the header


Unnamed: 0,0,1,2,3,4
0,Order date,Region,Item,Units,Unit cost
1,1/6/2018,East,Pencil,95,1.99
2,1/23/2018,Central,Binder,50,19.99
3,2/9/2018,Central,Pencil,36,4.99
4,3/15/2018,West,Pen,27,19.99


In [12]:
# header=0 promotes the first table row to the column names.
# StringIO avoids the deprecated literal-HTML input of read_html.
pd.read_html(StringIO(html_string), header=0)[0]

  pd.read_html(html_string, header=0)[0]


Unnamed: 0,Order date,Region,Item,Units,Unit cost
0,1/6/2018,East,Pencil,95,1.99
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99
3,3/15/2018,West,Pen,27,19.99


## Parsing HTML tables from the web 

In [13]:
# Page whose tables will be parsed directly from the web.
html_url = "https://www.basketball-reference.com/leagues/NBA_2019_per_game.html"

In [14]:
# read_html is given the URL itself here (not markup), so pandas retrieves the page.
nba_tables = pd.read_html(html_url)

In [15]:
len(nba_tables)  # Number of tables parsed from the page

2

In [16]:
nba = nba_tables[0]  # Keep the first parsed table

In [17]:
nba.head()  # Preview the first rows instead of rendering the whole table

Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards
0,1.0,James Harden,29.0,HOU,PG,78.0,78.0,36.8,10.8,24.5,...,0.8,5.8,6.6,7.5,2.0,0.7,5.0,3.1,36.1,"MVP-2,AS,NBA1"
1,2.0,Paul George,28.0,OKC,SF,77.0,77.0,36.9,9.2,21.0,...,1.4,6.8,8.2,4.1,2.2,0.4,2.7,2.8,28.0,"MVP-3,DPOY-3,AS,NBA1"
2,3.0,Giannis Antetokounmpo,24.0,MIL,PF,72.0,72.0,32.8,10.0,17.3,...,2.2,10.3,12.5,5.9,1.3,1.5,3.7,3.2,27.7,"MVP-1,DPOY-2,AS,NBA1"
3,4.0,Joel Embiid,24.0,PHI,C,64.0,64.0,33.7,9.1,18.7,...,2.5,11.1,13.6,3.7,0.7,1.9,3.5,3.3,27.5,"MVP-7,DPOY-4,AS,NBA2"
4,5.0,LeBron James,34.0,LAL,SF,55.0,55.0,35.2,10.1,19.9,...,1.0,7.4,8.5,8.3,1.3,0.6,3.6,1.7,27.4,"MVP-11,AS,NBA3"


### Complex Example
- We can also use the Requests module to download the HTML from a URL ourselves and then parse it into DataFrame objects

In [18]:
import requests

# NOTE: this rebinds html_url, which previously held the basketball-reference URL.
html_url = "https://en.wikipedia.org/wiki/The_Simpsons"

In [19]:
# Download the page ourselves. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() surfaces HTTP errors instead of
# silently parsing an error page.
r = requests.get(html_url, timeout=30)
r.raise_for_status()
# StringIO avoids the deprecated literal-HTML input of read_html;
# header=0 uses the first row of each table as its column names.
wiki_tables = pd.read_html(StringIO(r.text), header=0)

  wiki_tables = pd.read_html(r.text, header=0)


In [20]:
len(wiki_tables)  # Number of tables parsed from the article

49

In [21]:
simpsons = wiki_tables[1]  # Get the second DataFrame (index 1)

In [22]:
simpsons.head()  # Preview the first rows of the selected table

Unnamed: 0,Cast members,Cast members.1,Cast members.2,Cast members.3,Cast members.4,Cast members.5,Cast members.6,Cast members.7,Cast members.8
0,,,,,,,,,
1,Dan Castellaneta,Julie Kavner,Nancy Cartwright,Yeardley Smith,Hank Azaria,Harry Shearer,,,
2,"Homer Simpson, Abe Simpson, Krusty the Clown, ...","Marge Simpson, Patty and Selma Bouvier, additi...","Bart Simpson, Maggie Simpson, Nelson Muntz, va...",Lisa Simpson,"Moe Szyslak, Chief Wiggum, Apu Nahasapeemapeti...","Ned Flanders, Mr. Burns, Dr. Hibbert (1990–202...",,,


In [23]:
# Drop the rows labelled 0 and 1 (the first two rows) without mutating in place.
# Rebuilding from wiki_tables[1] leaves the parsed table untouched and makes the
# cell safe to re-run; the in-place version raised KeyError on a second run.
simpsons = wiki_tables[1].drop(index=[0, 1])