Problems arise even before you can use Pandas - most of the time, your dataset simply can’t be imported with simple `read_` functions. In this section, we try to tackle a myriad of gotchas with different data sources.

In [3]:
import pandas as pd

# Dealing with messy Excel sheets and misformatted CSV files


In [16]:
# import a simple excel file
df = pd.read_excel("chapter1.xlsx")
df

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,sheet_1,Creavin,acreavin0@ft.com,Male,219.239.109.106
1,2,Farand,Twoohy,ftwoohy1@unblog.fr,Female,67.166.10.128
2,3,Kelsey,Dabbes,kdabbes2@bing.com,Male,92.93.7.233
3,4,Pansie,Itzchaky,pitzchaky3@newyorker.com,Female,32.29.61.166
4,5,Bobbette,Tonnesen,btonnesen4@discovery.com,Female,247.13.30.175
5,6,Gianni,Spurier,gspurier5@feedburner.com,Male,240.87.93.73
6,7,Franklyn,Slaten,fslaten6@techcrunch.com,Male,232.198.29.44
7,8,Manda,Lovering,mlovering7@yolasite.com,Female,194.168.231.187
8,9,Joshia,Castelijn,jcastelijn8@scribd.com,Male,211.202.207.204
9,10,Kerry,Gewer,kgewer9@wordpress.org,Female,4.124.71.3


In [17]:
df = pd.read_excel(open("chapter1.xlsx", "rb"))
df

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,sheet_1,Creavin,acreavin0@ft.com,Male,219.239.109.106
1,2,Farand,Twoohy,ftwoohy1@unblog.fr,Female,67.166.10.128
2,3,Kelsey,Dabbes,kdabbes2@bing.com,Male,92.93.7.233
3,4,Pansie,Itzchaky,pitzchaky3@newyorker.com,Female,32.29.61.166
4,5,Bobbette,Tonnesen,btonnesen4@discovery.com,Female,247.13.30.175
5,6,Gianni,Spurier,gspurier5@feedburner.com,Male,240.87.93.73
6,7,Franklyn,Slaten,fslaten6@techcrunch.com,Male,232.198.29.44
7,8,Manda,Lovering,mlovering7@yolasite.com,Female,194.168.231.187
8,9,Joshia,Castelijn,jcastelijn8@scribd.com,Male,211.202.207.204
9,10,Kerry,Gewer,kgewer9@wordpress.org,Female,4.124.71.3


In [19]:
# import a specific sheet with the name of sheet
df = pd.read_excel("chapter1.xlsx", sheet_name="Sheet2")
df

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,sheet_2,Creavin,acreavin0@ft.com,Male,219.239.109.106
1,2,Farand,Twoohy,ftwoohy1@unblog.fr,Female,67.166.10.128
2,3,Kelsey,Dabbes,kdabbes2@bing.com,Male,92.93.7.233
3,4,Pansie,Itzchaky,pitzchaky3@newyorker.com,Female,32.29.61.166
4,5,Bobbette,Tonnesen,btonnesen4@discovery.com,Female,247.13.30.175
5,6,Gianni,Spurier,gspurier5@feedburner.com,Male,240.87.93.73
6,7,Franklyn,Slaten,fslaten6@techcrunch.com,Male,232.198.29.44
7,8,Manda,Lovering,mlovering7@yolasite.com,Female,194.168.231.187
8,9,Joshia,Castelijn,jcastelijn8@scribd.com,Male,211.202.207.204
9,10,Kerry,Gewer,kgewer9@wordpress.org,Female,4.124.71.3


In [20]:
# import a specific sheet by the sheet ordering
df = pd.read_excel("chapter1.xlsx", sheet_name=1)
df

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,sheet_2,Creavin,acreavin0@ft.com,Male,219.239.109.106
1,2,Farand,Twoohy,ftwoohy1@unblog.fr,Female,67.166.10.128
2,3,Kelsey,Dabbes,kdabbes2@bing.com,Male,92.93.7.233
3,4,Pansie,Itzchaky,pitzchaky3@newyorker.com,Female,32.29.61.166
4,5,Bobbette,Tonnesen,btonnesen4@discovery.com,Female,247.13.30.175
5,6,Gianni,Spurier,gspurier5@feedburner.com,Male,240.87.93.73
6,7,Franklyn,Slaten,fslaten6@techcrunch.com,Male,232.198.29.44
7,8,Manda,Lovering,mlovering7@yolasite.com,Female,194.168.231.187
8,9,Joshia,Castelijn,jcastelijn8@scribd.com,Male,211.202.207.204
9,10,Kerry,Gewer,kgewer9@wordpress.org,Female,4.124.71.3


In [21]:
# import a few sheets with a list
df = pd.read_excel("chapter1.xlsx", sheet_name=[0, 1, "Sheet3"])
df

OrderedDict([(0,
                  id first_name   last_name                          email  gender  \
              0    1    sheet_1     Creavin               acreavin0@ft.com    Male   
              1    2     Farand      Twoohy             ftwoohy1@unblog.fr  Female   
              2    3     Kelsey      Dabbes              kdabbes2@bing.com    Male   
              3    4     Pansie    Itzchaky       pitzchaky3@newyorker.com  Female   
              4    5   Bobbette    Tonnesen       btonnesen4@discovery.com  Female   
              5    6     Gianni     Spurier       gspurier5@feedburner.com    Male   
              6    7   Franklyn      Slaten        fslaten6@techcrunch.com    Male   
              7    8      Manda    Lovering        mlovering7@yolasite.com  Female   
              8    9     Joshia   Castelijn         jcastelijn8@scribd.com    Male   
              9   10      Kerry       Gewer          kgewer9@wordpress.org  Female   
              10  11       Vail      

In [22]:
# import all sheets
df = pd.read_excel("chapter1.xlsx", sheet_name=None)
df

OrderedDict([('Sheet1',
                  id first_name   last_name                          email  gender  \
              0    1    sheet_1     Creavin               acreavin0@ft.com    Male   
              1    2     Farand      Twoohy             ftwoohy1@unblog.fr  Female   
              2    3     Kelsey      Dabbes              kdabbes2@bing.com    Male   
              3    4     Pansie    Itzchaky       pitzchaky3@newyorker.com  Female   
              4    5   Bobbette    Tonnesen       btonnesen4@discovery.com  Female   
              5    6     Gianni     Spurier       gspurier5@feedburner.com    Male   
              6    7   Franklyn      Slaten        fslaten6@techcrunch.com    Male   
              7    8      Manda    Lovering        mlovering7@yolasite.com  Female   
              8    9     Joshia   Castelijn         jcastelijn8@scribd.com    Male   
              9   10      Kerry       Gewer          kgewer9@wordpress.org  Female   
              10  11       Vai

In [24]:
# a csv with a header
df = pd.read_csv("chapter1.csv", header=0)
df

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,sheet_1,Creavin,acreavin0@ft.com,Male,219.239.109.106
1,2,Farand,Twoohy,ftwoohy1@unblog.fr,Female,67.166.10.128
2,3,Kelsey,Dabbes,kdabbes2@bing.com,Male,92.93.7.233
3,4,Pansie,Itzchaky,pitzchaky3@newyorker.com,Female,32.29.61.166
4,5,Bobbette,Tonnesen,btonnesen4@discovery.com,Female,247.13.30.175
5,6,Gianni,Spurier,gspurier5@feedburner.com,Male,240.87.93.73
6,7,Franklyn,Slaten,fslaten6@techcrunch.com,Male,232.198.29.44
7,8,Manda,Lovering,mlovering7@yolasite.com,Female,194.168.231.187
8,9,Joshia,Castelijn,jcastelijn8@scribd.com,Male,211.202.207.204
9,10,Kerry,Gewer,kgewer9@wordpress.org,Female,4.124.71.3


In [25]:
# a csv with no headers, self define headers
df = pd.read_csv("chapter1.csv", header=None, names=["A", "B", "C", "D", "E", "F"])
df

Unnamed: 0,A,B,C,D,E,F
0,id,first_name,last_name,email,gender,ip_address
1,1,sheet_1,Creavin,acreavin0@ft.com,Male,219.239.109.106
2,2,Farand,Twoohy,ftwoohy1@unblog.fr,Female,67.166.10.128
3,3,Kelsey,Dabbes,kdabbes2@bing.com,Male,92.93.7.233
4,4,Pansie,Itzchaky,pitzchaky3@newyorker.com,Female,32.29.61.166
5,5,Bobbette,Tonnesen,btonnesen4@discovery.com,Female,247.13.30.175
6,6,Gianni,Spurier,gspurier5@feedburner.com,Male,240.87.93.73
7,7,Franklyn,Slaten,fslaten6@techcrunch.com,Male,232.198.29.44
8,8,Manda,Lovering,mlovering7@yolasite.com,Female,194.168.231.187
9,9,Joshia,Castelijn,jcastelijn8@scribd.com,Male,211.202.207.204


In [26]:
# a csv with all useful columns
df = pd.read_csv("chapter1.csv")
df

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,sheet_1,Creavin,acreavin0@ft.com,Male,219.239.109.106
1,2,Farand,Twoohy,ftwoohy1@unblog.fr,Female,67.166.10.128
2,3,Kelsey,Dabbes,kdabbes2@bing.com,Male,92.93.7.233
3,4,Pansie,Itzchaky,pitzchaky3@newyorker.com,Female,32.29.61.166
4,5,Bobbette,Tonnesen,btonnesen4@discovery.com,Female,247.13.30.175
5,6,Gianni,Spurier,gspurier5@feedburner.com,Male,240.87.93.73
6,7,Franklyn,Slaten,fslaten6@techcrunch.com,Male,232.198.29.44
7,8,Manda,Lovering,mlovering7@yolasite.com,Female,194.168.231.187
8,9,Joshia,Castelijn,jcastelijn8@scribd.com,Male,211.202.207.204
9,10,Kerry,Gewer,kgewer9@wordpress.org,Female,4.124.71.3


In [27]:
# a csv with 5 useful columns
df = pd.read_csv("chapter1.csv", usecols=[0, 1, 2])
df

Unnamed: 0,id,first_name,last_name
0,1,sheet_1,Creavin
1,2,Farand,Twoohy
2,3,Kelsey,Dabbes
3,4,Pansie,Itzchaky
4,5,Bobbette,Tonnesen
5,6,Gianni,Spurier
6,7,Franklyn,Slaten
7,8,Manda,Lovering
8,9,Joshia,Castelijn
9,10,Kerry,Gewer


In [28]:
df = pd.read_csv("chapter1.csv", usecols=[2, 1, 0])
df

Unnamed: 0,id,first_name,last_name
0,1,sheet_1,Creavin
1,2,Farand,Twoohy
2,3,Kelsey,Dabbes
3,4,Pansie,Itzchaky
4,5,Bobbette,Tonnesen
5,6,Gianni,Spurier
6,7,Franklyn,Slaten
7,8,Manda,Lovering
8,9,Joshia,Castelijn
9,10,Kerry,Gewer


# Coping with unstructured HTML and JSON formats


In [30]:
# reading a local HTML file containing a table
df = pd.read_html("chapter1.html")
df

[     id   first_name     last_name                               email  \
 0     1      Jacklyn      Katzmann                jkatzmann0@exblog.jp   
 1     2    Magdalene     Bernadzki                 mbernadzki1@icio.us   
 2     3       Gloria      Lodemann                glodemann2@topsy.com   
 3     4     Prentice      Maillard            pmaillard3@hostgator.com   
 4     5         Arel       Guirard               aguirard4@weather.com   
 5     6        Shane       Doleman                  sdoleman5@dell.com   
 6     7       Talbot       Lofting                   tlofting6@hhs.gov   
 7     8       Marlow       Blacket           mblacket7@fastcompany.com   
 8     9      Fleming        Sunter                fsunter8@twitter.com   
 9    10     Laughton        Cadman                   lcadman9@uiuc.edu   
 10   11        Belle       Elbourn             belbourna@homestead.com   
 11   12       Durand         Ilyas                  dilyasb@netlog.com   
 12   13      Amerigo    

In [7]:
# reading a URL containing a table
url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
df = pd.read_html(url)
df

[                                             Bank Name                City  \
 0                  Washington Federal Bank for Savings             Chicago   
 1      The Farmers and Merchants State Bank of Argonia             Argonia   
 2                                  Fayette County Bank          Saint Elmo   
 3    Guaranty Bank, (d/b/a BestBank in Georgia & Mi...           Milwaukee   
 4                                       First NBC Bank         New Orleans   
 5                                        Proficio Bank  Cottonwood Heights   
 6                        Seaway Bank and Trust Company             Chicago   
 7                               Harvest Community Bank          Pennsville   
 8                                          Allied Bank            Mulberry   
 9                         The Woodbury Banking Company            Woodbury   
 10                              First CornerStone Bank     King of Prussia   
 11                                  Trust Company B

In [31]:
# specifying what table is parsed using a HTML attribute
df = pd.read_html("chapter1.html", attrs = {'id': 'table1'})
df

[     id   first_name     last_name                               email  \
 0     1      Jacklyn      Katzmann                jkatzmann0@exblog.jp   
 1     2    Magdalene     Bernadzki                 mbernadzki1@icio.us   
 2     3       Gloria      Lodemann                glodemann2@topsy.com   
 3     4     Prentice      Maillard            pmaillard3@hostgator.com   
 4     5         Arel       Guirard               aguirard4@weather.com   
 5     6        Shane       Doleman                  sdoleman5@dell.com   
 6     7       Talbot       Lofting                   tlofting6@hhs.gov   
 7     8       Marlow       Blacket           mblacket7@fastcompany.com   
 8     9      Fleming        Sunter                fsunter8@twitter.com   
 9    10     Laughton        Cadman                   lcadman9@uiuc.edu   
 10   11        Belle       Elbourn             belbourna@homestead.com   
 11   12       Durand         Ilyas                  dilyasb@netlog.com   
 12   13      Amerigo    

In [35]:
# reading a HTML containing a European thousands separator
df = pd.read_html("chapter1.html", attrs = {'id': 'table3'})
df

[     id
 0  9651
 1  2317
 2  8689
 3  9028
 4  3925
 5  5843
 6   349
 7  2341
 8  1005
 9  9113]

In [36]:
df = pd.read_html("chapter1.html", attrs = {'id': 'table3'}, thousands='.')
df

[      id
 0  96,51
 1  23,17
 2  86,89
 3  90,28
 4  39,25
 5  58,43
 6   3,49
 7  23,41
 8  10,05
 9  91,13]

In [37]:
# reading in hidden tables
df = pd.read_html("chapter1.html", displayed_only=False)
df

[     id   first_name     last_name                               email  \
 0     1      Jacklyn      Katzmann                jkatzmann0@exblog.jp   
 1     2    Magdalene     Bernadzki                 mbernadzki1@icio.us   
 2     3       Gloria      Lodemann                glodemann2@topsy.com   
 3     4     Prentice      Maillard            pmaillard3@hostgator.com   
 4     5         Arel       Guirard               aguirard4@weather.com   
 5     6        Shane       Doleman                  sdoleman5@dell.com   
 6     7       Talbot       Lofting                   tlofting6@hhs.gov   
 7     8       Marlow       Blacket           mblacket7@fastcompany.com   
 8     9      Fleming        Sunter                fsunter8@twitter.com   
 9    10     Laughton        Cadman                   lcadman9@uiuc.edu   
 10   11        Belle       Elbourn             belbourna@homestead.com   
 11   12       Durand         Ilyas                  dilyasb@netlog.com   
 12   13      Amerigo    

In [44]:
# “Normalize” semi-structured JSON data into a flat table
json_data = [{
    "country": "UK",
    "key_people": {
        "prime_minister": "theresa may"
    }
}]
pd.io.json.json_normalize(json_data)

Unnamed: 0,country,key_people.prime_minister
0,UK,theresa may


# Handling too much data from HDF5 and SQL sources


In [14]:
# creating HDF5 stores


In [None]:
# inserting DataFrames into HDF5 files


In [None]:
# reading and writing HDF5


In [None]:
# table format HDF5 (append vs put)


In [None]:
# Hierarchical Keys

In [None]:
# querying HDF5 stores

In [38]:
# creating an SQLAlchemy engine
from sqlalchemy import create_engine
engine = create_engine('sqlite:///:memory:')

In [40]:
# writing data from pandas to SQL
df[0].to_sql('data', engine)

In [41]:
# reading data from SQL to pandas
df2 = pd.read_sql_table('data', engine)
df2

Unnamed: 0,index,id,first_name,last_name,email,gender,ip_address
0,0,1,Jacklyn,Katzmann,jkatzmann0@exblog.jp,Female,56.12.241.249
1,1,2,Magdalene,Bernadzki,mbernadzki1@icio.us,Female,12.32.253.13
2,2,3,Gloria,Lodemann,glodemann2@topsy.com,Female,235.51.30.138
3,3,4,Prentice,Maillard,pmaillard3@hostgator.com,Male,137.169.60.41
4,4,5,Arel,Guirard,aguirard4@weather.com,Male,195.224.211.181
5,5,6,Shane,Doleman,sdoleman5@dell.com,Female,174.150.239.74
6,6,7,Talbot,Lofting,tlofting6@hhs.gov,Male,5.113.125.2
7,7,8,Marlow,Blacket,mblacket7@fastcompany.com,Male,189.148.2.7
8,8,9,Fleming,Sunter,fsunter8@twitter.com,Male,80.75.79.52
9,9,10,Laughton,Cadman,lcadman9@uiuc.edu,Male,97.46.151.242
