# Reading HTML Table

In [6]:
import pandas as pd
from io import StringIO

![purple-divider](https://user-images.githubusercontent.com/7065401/52071927-c1cd7100-2562-11e9-908a-dde91ba14e59.png)

# Parsing raw HTML string

In [3]:
html_string = """
<table>
    <thead>
      <tr>
        <th>Order date</th>
        <th>Region</th> 
        <th>Item</th>
        <th>Units</th>
        <th>Unit cost</th>
      </tr>
    </thead>
    <tbody>
      <tr>
        <td>1/6/2018</td>
        <td>East</td> 
        <td>Pencil</td>
        <td>95</td>
        <td>1.99</td>
      </tr>
      <tr>
        <td>1/23/2018</td>
        <td>Central</td> 
        <td>Binder</td>
        <td>50</td>
        <td>19.99</td>
      </tr>
      <tr>
        <td>2/9/2018</td>
        <td>Central</td> 
        <td>Pencil</td>
        <td>36</td>
        <td>4.99</td>
      </tr>
      <tr>
        <td>3/15/2018</td>
        <td>West</td> 
        <td>Pen</td>
        <td>27</td>
        <td>19.99</td>
      </tr>
    </tbody>
</table>
"""

In [10]:
from IPython.core.display import display, HTML
display(HTML(html_string))

  from IPython.core.display import display, HTML


Order date,Region,Item,Units,Unit cost
1/6/2018,East,Pencil,95,1.99
1/23/2018,Central,Binder,50,19.99
2/9/2018,Central,Pencil,36,4.99
3/15/2018,West,Pen,27,19.99


In [9]:
dfs = pd.read_html(StringIO(html_string))

In [12]:
dfs

[  Order date   Region    Item  Units  Unit cost
 0   1/6/2018     East  Pencil     95       1.99
 1  1/23/2018  Central  Binder     50      19.99
 2   2/9/2018  Central  Pencil     36       4.99
 3  3/15/2018     West     Pen     27      19.99]

In [13]:
len(dfs)

1

In [15]:
df = dfs[0]
df

Unnamed: 0,Order date,Region,Item,Units,Unit cost
0,1/6/2018,East,Pencil,95,1.99
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99
3,3/15/2018,West,Pen,27,19.99


In [16]:
df.shape

(4, 5)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Order date  4 non-null      object 
 1   Region      4 non-null      object 
 2   Item        4 non-null      object 
 3   Units       4 non-null      int64  
 4   Unit cost   4 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 292.0+ bytes


In [20]:
df.loc[df['Region'] == 'Central']

Unnamed: 0,Order date,Region,Item,Units,Unit cost
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99


In [21]:
df.loc[df['Units'] > 35]

Unnamed: 0,Order date,Region,Item,Units,Unit cost
0,1/6/2018,East,Pencil,95,1.99
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99


![purple-divider](https://user-images.githubusercontent.com/7065401/52071927-c1cd7100-2562-11e9-908a-dde91ba14e59.png)

### Defining Headers

In [22]:
html_string = """
<table>
  <tr>
    <td>Order date</td>
    <td>Region</td> 
    <td>Item</td>
    <td>Units</td>
    <td>Unit cost</td>
  </tr>
  <tr>
    <td>1/6/2018</td>
    <td>East</td> 
    <td>Pencil</td>
    <td>95</td>
    <td>1.99</td>
  </tr>
  <tr>
    <td>1/23/2018</td>
    <td>Central</td> 
    <td>Binder</td>
    <td>50</td>
    <td>19.99</td>
  </tr>
  <tr>
    <td>2/9/2018</td>
    <td>Central</td> 
    <td>Pencil</td>
    <td>36</td>
    <td>4.99</td>
  </tr>
  <tr>
    <td>3/15/2018</td>
    <td>West</td> 
    <td>Pen</td>
    <td>27</td>
    <td>19.99</td>
  </tr>
</table>
"""

In [23]:
pd.read_html(StringIO(html_string))[0]

Unnamed: 0,0,1,2,3,4
0,Order date,Region,Item,Units,Unit cost
1,1/6/2018,East,Pencil,95,1.99
2,1/23/2018,Central,Binder,50,19.99
3,2/9/2018,Central,Pencil,36,4.99
4,3/15/2018,West,Pen,27,19.99


In [24]:
pd.read_html(StringIO(html_string), header=0)[0]

Unnamed: 0,Order date,Region,Item,Units,Unit cost
0,1/6/2018,East,Pencil,95,1.99
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99
3,3/15/2018,West,Pen,27,19.99


![purple-divider](https://user-images.githubusercontent.com/7065401/52071927-c1cd7100-2562-11e9-908a-dde91ba14e59.png)

# Parsing HTML tables from web

In [25]:
html_url = 'https://www.basketball-reference.com/leagues/NBA_2024_per_game.html'

In [26]:
nba_tables = pd.read_html(html_url)

In [27]:
len(nba_tables)

1

In [29]:
nba = nba_tables[0]

In [30]:
nba.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Precious Achiuwa,PF-C,24,TOT,74,18,21.9,3.2,6.3,...,0.616,2.6,4.0,6.6,1.3,0.6,0.9,1.1,1.9,7.6
1,1,Precious Achiuwa,C,24,TOR,25,0,17.5,3.1,6.8,...,0.571,2.0,3.4,5.4,1.8,0.6,0.5,1.2,1.6,7.7
2,1,Precious Achiuwa,PF,24,NYK,49,18,24.2,3.2,6.1,...,0.643,2.9,4.3,7.2,1.1,0.6,1.1,1.1,2.1,7.6
3,2,Bam Adebayo,C,26,MIA,71,71,34.0,7.5,14.3,...,0.755,2.2,8.1,10.4,3.9,1.1,0.9,2.3,2.2,19.3
4,3,Ochai Agbaji,SG,23,TOT,78,28,21.0,2.3,5.6,...,0.661,0.9,1.8,2.8,1.1,0.6,0.6,0.8,1.5,5.8


![purple-divider](https://user-images.githubusercontent.com/7065401/52071927-c1cd7100-2562-11e9-908a-dde91ba14e59.png)

### Getting HTML code from an URL to parse it into DataFrame objects.

In [81]:
import requests

In [82]:
html_url = 'https://en.wikipedia.org/wiki/The_Simpsons'

In [83]:
r = requests.get(html_url)

In [85]:
wiki_tables = pd.read_html(StringIO(r.text), header=0)

In [86]:
len(wiki_tables)

48

In [87]:
simpsons = wiki_tables[2]

In [88]:
simpsons.head()

Unnamed: 0,Season,Season.1,No. of episodes,Originally aired,Originally aired.1,Originally aired.2,Viewership,Viewership.1,Viewership.2
0,Season,Season,No. of episodes,Season premiere,Season finale,Time slot (ET),Avg. viewers (in millions),Most watched episode,Most watched episode
1,Season,Season,No. of episodes,Season premiere,Season finale,Time slot (ET),Avg. viewers (in millions),Viewers (millions),Episode title
2,1,1989–90,13,"December 17, 1989","May 13, 1990",Sunday 8:30 pm,27.8,33.5[168],"""Life on the Fast Lane"""
3,2,1990–91,22,"October 11, 1990","July 11, 1991",Thursday 8:00 pm,24.4,33.6[169],"""Bart Gets an 'F'"""
4,3,1991–92,24,"September 19, 1991","August 27, 1992",Thursday 8:00 pm,21.8,25.5[170],"""Colonel Homer"""


In [89]:
simpsons.columns = simpsons.iloc[1]

In [90]:
simpsons.head()

1,Season,Season.1,No. of episodes,Season premiere,Season finale,Time slot (ET),Avg. viewers (in millions),Viewers (millions),Episode title
0,Season,Season,No. of episodes,Season premiere,Season finale,Time slot (ET),Avg. viewers (in millions),Most watched episode,Most watched episode
1,Season,Season,No. of episodes,Season premiere,Season finale,Time slot (ET),Avg. viewers (in millions),Viewers (millions),Episode title
2,1,1989–90,13,"December 17, 1989","May 13, 1990",Sunday 8:30 pm,27.8,33.5[168],"""Life on the Fast Lane"""
3,2,1990–91,22,"October 11, 1990","July 11, 1991",Thursday 8:00 pm,24.4,33.6[169],"""Bart Gets an 'F'"""
4,3,1991–92,24,"September 19, 1991","August 27, 1992",Thursday 8:00 pm,21.8,25.5[170],"""Colonel Homer"""


In [91]:
simpsons.drop([0, 1], inplace=True)

In [92]:
simpsons.head()

1,Season,Season.1,No. of episodes,Season premiere,Season finale,Time slot (ET),Avg. viewers (in millions),Viewers (millions),Episode title
2,1,1989–90,13,"December 17, 1989","May 13, 1990",Sunday 8:30 pm,27.8,33.5[168],"""Life on the Fast Lane"""
3,2,1990–91,22,"October 11, 1990","July 11, 1991",Thursday 8:00 pm,24.4,33.6[169],"""Bart Gets an 'F'"""
4,3,1991–92,24,"September 19, 1991","August 27, 1992",Thursday 8:00 pm,21.8,25.5[170],"""Colonel Homer"""
5,4,1992–93,22,"September 24, 1992","May 13, 1993",Thursday 8:00 pm,22.4,28.6[171],"""Lisa's First Word"""
6,5,1993–94,22,"September 30, 1993","May 19, 1994",Thursday 8:00 pm,18.9,24.0[172],"""Treehouse of Horror IV"""


In [95]:
simpsons.set_index("Season")

1,No. of episodes,Season premiere,Season finale,Time slot (ET),Avg. viewers (in millions),Viewers (millions),Episode title
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"(1, 1989–90)",13,"December 17, 1989","May 13, 1990",Sunday 8:30 pm,27.8,33.5[168],"""Life on the Fast Lane"""
"(2, 1990–91)",22,"October 11, 1990","July 11, 1991",Thursday 8:00 pm,24.4,33.6[169],"""Bart Gets an 'F'"""
"(3, 1991–92)",24,"September 19, 1991","August 27, 1992",Thursday 8:00 pm,21.8,25.5[170],"""Colonel Homer"""
"(4, 1992–93)",22,"September 24, 1992","May 13, 1993",Thursday 8:00 pm,22.4,28.6[171],"""Lisa's First Word"""
"(5, 1993–94)",22,"September 30, 1993","May 19, 1994",Thursday 8:00 pm,18.9,24.0[172],"""Treehouse of Horror IV"""
"(6, 1994–95)",25,"September 4, 1994","May 21, 1995",Sunday 8:00 pm,15.6,22.2[173],"""Treehouse of Horror V"""
"(7, 1995–96)",25,"September 17, 1995","May 19, 1996",Sunday 8:00 pm (Episodes 1–24) Sunday 8:30 pm ...,15.1,22.6[174],"""Who Shot Mr. Burns? – Part II"""
"(8, 1996–97)",25,"October 27, 1996","May 18, 1997",Sunday 8:30 pm (Episodes 1–3) Sunday 8:00 pm (...,14.5,20.41[176],"""The Springfield Files"""
"(9, 1997–98)",25,"September 21, 1997","May 17, 1998",Sunday 8:00 pm,15.3,19.80[177],"""The Two Mrs. Nahasapeemapetilons"""
"(10, 1998–99)",23,"August 23, 1998","May 16, 1999",Sunday 8:00 pm,13.5,19.11[178],"""Sunday, Cruddy Sunday"""


In [79]:
simpsons.head()

1,No. of episodes,Season premiere,Season finale,Time slot (ET),Avg. viewers (in millions),Viewers (millions),Episode title
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"(1, 1989–90)",13,"December 17, 1989","May 13, 1990",Sunday 8:30 pm,27.8,33.5[168],"""Life on the Fast Lane"""
"(2, 1990–91)",22,"October 11, 1990","July 11, 1991",Thursday 8:00 pm,24.4,33.6[169],"""Bart Gets an 'F'"""
"(3, 1991–92)",24,"September 19, 1991","August 27, 1992",Thursday 8:00 pm,21.8,25.5[170],"""Colonel Homer"""
"(4, 1992–93)",22,"September 24, 1992","May 13, 1993",Thursday 8:00 pm,22.4,28.6[171],"""Lisa's First Word"""
"(5, 1993–94)",22,"September 30, 1993","May 19, 1994",Thursday 8:00 pm,18.9,24.0[172],"""Treehouse of Horror IV"""


![purple-divider](https://user-images.githubusercontent.com/7065401/52071927-c1cd7100-2562-11e9-908a-dde91ba14e59.png)

### Saving to the CSV

In [96]:
simpsons.to_csv('data/simpsons.csv')

In [97]:
new_csv = pd.read_csv('data/simpsons.csv')

In [98]:
new_csv.drop(columns=new_csv.columns[0], axis=1)

Unnamed: 0,Season,Season.1,No. of episodes,Season premiere,Season finale,Time slot (ET),Avg. viewers (in millions),Viewers (millions),Episode title
0,1,1989–90,13,"December 17, 1989","May 13, 1990",Sunday 8:30 pm,27.8,33.5[168],"""Life on the Fast Lane"""
1,2,1990–91,22,"October 11, 1990","July 11, 1991",Thursday 8:00 pm,24.4,33.6[169],"""Bart Gets an 'F'"""
2,3,1991–92,24,"September 19, 1991","August 27, 1992",Thursday 8:00 pm,21.8,25.5[170],"""Colonel Homer"""
3,4,1992–93,22,"September 24, 1992","May 13, 1993",Thursday 8:00 pm,22.4,28.6[171],"""Lisa's First Word"""
4,5,1993–94,22,"September 30, 1993","May 19, 1994",Thursday 8:00 pm,18.9,24.0[172],"""Treehouse of Horror IV"""
5,6,1994–95,25,"September 4, 1994","May 21, 1995",Sunday 8:00 pm,15.6,22.2[173],"""Treehouse of Horror V"""
6,7,1995–96,25,"September 17, 1995","May 19, 1996",Sunday 8:00 pm (Episodes 1–24) Sunday 8:30 pm ...,15.1,22.6[174],"""Who Shot Mr. Burns? – Part II"""
7,8,1996–97,25,"October 27, 1996","May 18, 1997",Sunday 8:30 pm (Episodes 1–3) Sunday 8:00 pm (...,14.5,20.41[176],"""The Springfield Files"""
8,9,1997–98,25,"September 21, 1997","May 17, 1998",Sunday 8:00 pm,15.3,19.80[177],"""The Two Mrs. Nahasapeemapetilons"""
9,10,1998–99,23,"August 23, 1998","May 16, 1999",Sunday 8:00 pm,13.5,19.11[178],"""Sunday, Cruddy Sunday"""


In [99]:
new_csv = new_csv.drop(columns=new_csv.columns[0], axis=1)

In [100]:
new_csv.to_csv('data/simpsons.csv')