# Ch 5: Getting Comfortable with Different Kinds of Data Sources

In [5]:
# Optional one-time environment setup (left disabled).
# The JDK is needed because tabula-py's read_pdf (Exercise 68) requires the
# `java` command to be installed and on PATH.
# !apt-get update
# !apt-get install -y default-jdk
# !pip install tabula-py xlrd lxml 

In [47]:
# import requests
import pandas as pd
import numpy as np

## Exercise 60: Reading Data from a CSV File Where Headers Are Missing

1) Read a CSV file:

In [6]:
# Plain read: comma-separated, header inferred from the first line of the file.
df1 = pd.read_csv(
    'CSV_EX_1.csv',
    sep=',',
    header='infer',
)
df1

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


2) Read a CSV file that has no header:

In [7]:
# With the default header inference the first data row of this header-less
# file is promoted to column labels (see the output).
df2 = pd.read_csv(
    'CSV_EX_2.csv',
    header='infer',
)
df2

Unnamed: 0,2,1500,Good,300000
0,3,1300,Fair,240000
1,3,1900,Very good,450000
2,3,1850,Bad,280000
3,2,1640,Good,310000


3) Avoid assigning the top row as header by using **header=None**:  

In [8]:
# header=None: treat every line as data; no row is consumed as the header.
df2 = pd.read_csv(
    'CSV_EX_2.csv',
    header=None,
)
df2

Unnamed: 0,0,1,2,3
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


Notice that, with no header row, the column labels default to integers (0, 1, 2, ...).

4) Use the **names** argument to assign headers:

In [9]:
# The file has no header row, so supply the column labels ourselves.
# The labels are spelled exactly like the header of CSV_EX_1.csv
# ('Sq. foot', with a space) so this frame lines up column-for-column with
# the frames read from the headered files.
df2 = pd.read_csv(
    'CSV_EX_2.csv',
    header=None,
    names=['Bedroom', 'Sq. foot', 'Locality', 'Price ($)'],
)
df2

Unnamed: 0,Bedroom,Sq.ft,Locality,Price($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


## Exercise 61: Reading from a CSV File where Delimiters are not Commas

1) Read a CSV file:

In [10]:
# Read with the default comma separator: each ';'-delimited line ends up in a
# single column (see the output).
df3 = pd.read_csv(
    'CSV_EX_3.csv',
    sep=',',
)
df3

Unnamed: 0,Bedroom; Sq. foot; Locality; Price ($)
0,2; 1500; Good; 300000
1,3; 1300; Fair; 240000
2,3; 1900; Very good; 450000
3,3; 1850; Bad; 280000
4,2; 1640; Good; 310000


Notice that this file uses a semicolon as the separator, so every row was read into a single column.

2) Specify the ';' delimiter explicitly:

In [11]:
# Split on ';'. The raw lines shown above have a space after each ';'
# ("2; 1500; Good; 300000"), so skipinitialspace=True is needed as well;
# without it the column labels and string values keep a leading space
# (' Sq. foot', ' Good') and lookups such as df3['Locality'] fail.
df3 = pd.read_csv(
    'CSV_EX_3.csv',
    sep=';',
    skipinitialspace=True,
)
df3

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


## Exercise 62: Bypassing the Headers of a CSV File

1) Try to specify your own headers on a file with existing headers:

In [12]:
# Passing only `names` on a file that already has a header keeps the original
# header line as the first data row (see the output).
df4 = pd.read_csv(
    'CSV_EX_1.csv',
    names=['A', 'B', 'C', 'D'],
)
df4

Unnamed: 0,A,B,C,D
0,Bedroom,Sq. foot,Locality,Price ($)
1,2,1500,Good,300000
2,3,1300,Fair,240000
3,3,1900,Very good,450000
4,3,1850,Bad,280000
5,2,1640,Good,310000


2) If you want to use your own headers instead of the given headers, you need to set **header=0**:

In [13]:
# header=0 marks line 0 as the header so it is consumed, and `names`
# then replaces its labels.
df4 = pd.read_csv(
    'CSV_EX_1.csv',
    header=0,
    names=['A', 'B', 'C', 'D'],
)
df4

Unnamed: 0,A,B,C,D
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


## Exercise 63: Skipping Initial Rows and Footers when Reading a CSV File

1) Sometimes the first few rows are just metadata that you don't want in your table:

In [14]:
# Naive read: the two metadata lines at the top pollute the header and rows.
df5 = pd.read_csv(
    'CSV_EX_skiprows.csv',
)
df5

Unnamed: 0,Filetype: CSV,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,,Info about some houses,,
1,Bedroom,Sq. foot,Locality,Price ($)
2,2,1500,Good,300000
3,3,1300,Fair,240000
4,3,1900,Very good,450000
5,3,1850,Bad,280000
6,2,1640,Good,310000


2) Skip the first two rows using **skiprows**:

In [15]:
# Drop the two leading metadata lines; the next line becomes the header.
df5 = pd.read_csv(
    'CSV_EX_skiprows.csv',
    skiprows=2,
)
df5

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


3) You may also need to skip the footers:

In [17]:
# Skipping only the leading lines leaves the trailing footer line in the
# frame as an extra, mostly-empty row (see the output).
df6 = pd.read_csv(
    'CSV_EX_skipfooter.csv',
    skiprows=2,
)
df6

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2.0,1500,Good,300000.0
1,3.0,1300,Fair,240000.0
2,3.0,1900,Very good,450000.0
3,3.0,1850,Bad,280000.0
4,2.0,1640,Good,310000.0
5,,This is the end of file,,


4) Use **skipfooter** together with **engine='python'** to skip footers (the default C engine does not support skipfooter):

In [18]:
# Drop two leading lines and one trailing line; skipfooter is used with the
# Python parsing engine.
df6 = pd.read_csv(
    'CSV_EX_skipfooter.csv',
    skiprows=2,
    skipfooter=1,
    engine='python',
)
df6

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


### Reading Only the First N Rows (Especially Useful for Large Files)

Use **nrows** to read the first N rows:

In [19]:
# Stop after the first two data rows (the header line is not counted).
df7 = pd.read_csv(
    'CSV_EX_1.csv',
    nrows=2,
)
df7

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000


## Exercise 64: Combining Skiprows and Nrows to Read Data in Small Chunks

You can combine **skiprows** and **nrows** to read a large file in smaller chunks of a predetermined size.

1) Create a list to store DFs:

In [20]:
# Accumulator for the chunk DataFrames read below.
listOfDFs = list()

2) Number of rows to read per chunk:

In [21]:
# Number of data rows in each chunk (passed as `nrows` in the chunk loop below).
rowsInAChunk = 10

3) Number of chunks to read:

In [22]:
# How many chunks to read; numChunks * rowsInAChunk rows are read in total.
numChunks = 5

4) Dummy DF to get column names:

In [24]:
# Read just two rows so the column labels are available without loading the
# whole file; the labels are reused as `names` in the chunk loop below.
dummyDF = pd.read_csv('Boston_housing.csv', nrows=2)
colNames = dummyDF.columns
colNames

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'PRICE'],
      dtype='object')

5) Loop over the file, reading it one chunk of rows at a time:

In [25]:
# Read `numChunks` consecutive chunks of `rowsInAChunk` data rows each.
#
# How the offsets work: `skiprows=start` drops the first `start` physical
# lines of the file, then `header=0` consumes the next line as the header
# (its text is discarded because `names` is given). For start=0 that line is
# the real header; for start>0 it is the last data row of the previous chunk,
# so consecutive chunks are contiguous with no gaps and no overlaps.
#
# The list is rebuilt from scratch here rather than appended to, so
# re-running this cell cannot accumulate duplicate chunks.
listOfDFs = [
    pd.read_csv('Boston_housing.csv', header=0, skiprows=start,
                nrows=rowsInAChunk, names=colNames)
    for start in range(0, numChunks * rowsInAChunk, rowsInAChunk)
]

### Setting the **skip_blank_lines** Option

In [29]:
# Default behaviour: blank lines in the file are skipped silently.
df9 = pd.read_csv(
    'CSV_EX_blankline.csv',
    skip_blank_lines=True,
)
df9

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


In [30]:
# Keep blank lines: each one shows up as an all-NaN row (see the output).
df9 = pd.read_csv(
    'CSV_EX_blankline.csv',
    skip_blank_lines=False,
)
df9

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2.0,1500.0,Good,300000.0
1,3.0,1300.0,Fair,240000.0
2,,,,
3,3.0,1900.0,Very good,450000.0
4,3.0,1850.0,Bad,280000.0
5,,,,
6,2.0,1640.0,Good,310000.0


### Read CSV from a Zip file
This works directly as long as the CSV is the only file in the zip archive.

In [31]:
# Compression is inferred from the '.zip' extension.
df10 = pd.read_csv(
    'CSV_EX_1.zip',
    compression='infer',
)
df10

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


### Reading from an Excel File Using **sheet_name** and Handling a Distinct sheet_name
Excel files can contain multiple sheets, so use **sheet_name** to specify which one to read (the first sheet is read by default).

In [32]:
# One DataFrame per worksheet, read in the order the tabs are listed.
df11_1, df11_2, df11_3 = (
    pd.read_excel('Housing_data.xlsx', sheet_name=tab)
    for tab in ('Data_Tab_1', 'Data_Tab_2', 'Data_Tab_3')
)

If sheet_name is set to **None**, a dictionary keyed by sheet name is returned; iterate over it to get the DF for each sheet.

In [33]:
# sheet_name=None loads every sheet; the keys of the result are the sheet names.
dict_df = pd.read_excel(
    'Housing_data.xlsx',
    sheet_name=None,
)
dict_df.keys()

dict_keys(['Data_Tab_1', 'Data_Tab_2', 'Data_Tab_3'])

## Exercise 65: Reading a General Delimited Text File

1) `read_table` assumes tab-separated fields by default, so any other delimiter must be passed explicitly with **sep**.

In [39]:
# read_table splits on tabs by default, so this comma-delimited file lands in
# a single column (see the output).
df13 = pd.read_table(
    'Table_EX_1.txt',
    sep='\t',
)
df13

Unnamed: 0,"Bedroom, Sq. foot, Locality, Price ($)"
0,"2, 1500, Good, 300000"
1,"3, 1300, Fair, 240000"
2,"3, 1900, Very good, 450000"
3,"3, 1850, Bad, 280000"
4,"2, 1640, Good, 310000"


2) Explicitly set the separator ',':

In [40]:
# Split on ','. The raw lines shown above have a space after each comma
# ("2, 1500, Good, 300000"), so skipinitialspace=True is needed as well;
# without it the column labels and string values keep a leading space
# (' Sq. foot', ' Good').
df13 = pd.read_table(
    'Table_EX_1.txt',
    sep=',',
    skipinitialspace=True,
)
df13

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


### Reading HTML Tables Directly from a URL

In [49]:
# Page to scrape for HTML tables (presumably the FDIC failed-bank list, going
# by the path -- confirm).
url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
# Disabled: the call did not yield the table when this notebook was written.
# NOTE(review): check that the URL is still current before re-enabling.
# listOfDFs = pd.read_html(url)

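For some reason `pd.read_html` does not find the table on this page, so the call above is commented out. As a result, `listOfDFs` in the next cell still holds the Boston housing chunks from Exercise 64, not the bank list.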

In [44]:
# NOTE(review): the pd.read_html(url) call in the previous cell is commented
# out, so `listOfDFs` here is whatever an earlier cell left in it. The output
# shows Boston-housing columns (CRIM, ZN, ...), i.e. the first chunk from
# Exercise 64, not a table from the FDIC page.
df14 = listOfDFs[0]
df14.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


## Exercise 66: Further Wrangling to Get the Desired Data

1) Get the table of the 2016 summer Olympics medal tally (by nation) from Wikipedia.

In [50]:
# Parse every <table> on the page; header=0 uses each table's first row as
# its column labels.
listOfDFs = pd.read_html(
    'https://en.wikipedia.org/wiki/2016_Summer_Olympics_medal_table',
    header=0,
)

2) Check the length of the list:

In [53]:
len(listOfDFs)  # 6 tables were found when this was run; the live page may change

6

3) Loop to find table (according to table sizes):

In [54]:
# Print (rows, columns) for each parsed table to spot the medal table by size.
for table in listOfDFs:
    table_shape = table.shape
    print(table_shape)

(1, 1)
(87, 6)
(10, 9)
(4, 2)
(1, 2)
(1, 2)


4) Extract the second table:

In [55]:
# Index 1 is the (87, 6) table in the shapes printed above: the medal tally.
# NOTE(review): a positional index depends on the page layout at fetch time.
df15 = listOfDFs[1]
df15

Unnamed: 0,Rank,NOC,Gold,Silver,Bronze,Total
0,1,United States (USA),46,37,38,121
1,2,Great Britain (GBR),27,23,17,67
2,3,China (CHN),26,18,26,70
3,4,Russia (RUS),19,17,20,56
4,5,Germany (GER),17,10,15,42
...,...,...,...,...,...,...
82,78,Nigeria (NGR),0,0,1,1
83,78,Portugal (POR),0,0,1,1
84,78,Trinidad and Tobago (TTO),0,0,1,1
85,78,United Arab Emirates (UAE),0,0,1,1


## Exercise 67: Reading from a JSON File

1) Import the JSON file to a DF:

In [56]:
# Load the JSON records into a DataFrame and preview the first five rows.
df16 = pd.read_json(
    'movies.json',
)
df16.head(5)

Unnamed: 0,title,year,cast,genres
0,After Dark in Central Park,1900,[],[]
1,Boarding School Girls' Pajama Parade,1900,[],[]
2,Buffalo Bill's Wild West Parad,1900,[],[]
3,Caught,1900,[],[]
4,Clowns Spinning Hats,1900,[],[]


2) Extract the cast list for the 2012 Avengers movie:

In [57]:
# Title alone is not enough to identify the film, so filter on year as well;
# select the rows and the 'cast' column in one .loc call.
avengersCast = df16.loc[
    (df16['title'] == 'The Avengers') & (df16['year'] == 2012),
    'cast',
]

In [61]:
# Materialise the selected Series as a plain Python list (one entry per match).
avengersCast.tolist()

[['Robert Downey, Jr.',
  'Chris Evans',
  'Mark Ruffalo',
  'Chris Hemsworth',
  'Scarlett Johansson',
  'Jeremy Renner',
  'Tom Hiddleston',
  'Clark Gregg',
  'Cobie Smulders',
  'Stellan Skarsgård',
  'Samuel L. Jackson']]

In [62]:
# Same row filter as for the cast, this time selecting the 'genres' column.
avengersGenre = df16.loc[
    (df16['title'] == 'The Avengers') & (df16['year'] == 2012),
    'genres',
]

In [63]:
# Materialise the selected Series as a plain Python list (one entry per match).
avengersGenre.tolist()

[['Superhero']]

### Reading a Stata File

In [64]:
# Load a Stata .dta file straight into a DataFrame.
df17 = pd.read_stata(
    'wu-data.dta',
)
df17

Unnamed: 0,id,year,province,totalpop,totalso2,reg_GDP,time,treatment,provincearea,group,SO2PC,SO2PGDP,GDPPC,GDPPC2,pop_density
0,Beijing,1991,Beijing,1094.0,210000,598.900024,1.0,0,16800,1,191.956131,191.956131,0.547441,0.299691,0.065119
1,Beijing,1992,Beijing,1102.0,200000,709.099976,2.0,0,16800,1,181.488205,181.488205,0.643466,0.414049,0.065595
2,Beijing,1993,Beijing,1112.0,203736,863.530029,3.0,0,16800,1,183.215820,183.215820,0.776556,0.603039,0.066190
3,Beijing,1994,Beijing,1125.0,175616,1084.030029,4.0,0,16800,1,156.103104,156.103104,0.963582,0.928491,0.066964
4,Beijing,1995,Beijing,1251.0,214899,1394.890015,5.0,0,16800,1,171.781769,171.781769,1.115020,1.243270,0.074464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,Zhejiang,2003,Zhejiang,4680.0,707271,9705.019531,13.0,0,101800,0,151.126282,151.126282,2.073722,4.300323,0.045972
506,Zhejiang,2004,Zhejiang,4720.0,789000,11648.700195,14.0,0,101800,0,167.161011,167.161011,2.467945,6.090752,0.046365
507,Zhejiang,2005,Zhejiang,4898.0,831000,15742.509766,15.0,0,101800,0,169.661087,169.661087,3.214069,10.330239,0.048114
508,Zhejiang,2006,Zhejiang,4980.0,829000,15742.509766,16.0,0,101800,0,166.465866,166.465866,3.161147,9.992848,0.048919


## Exercise 68: Reading Tabular Data from a PDF File

In [70]:
import urllib3
import pytest
import flake8
import distro
import pathlib
import tabula

In [65]:
from tabula import read_pdf

1) Find the PDF file, retrieve the tables from the two pages, and join to one table.

In [71]:
# Extract the table(s) on page 1 of the PDF; header=None is forwarded to
# pandas so the first extracted row is kept as data.
# Needs the `java` command on PATH (see the JavaNotFoundError in the output).
df18_1 = read_pdf(
    'Housing_data.pdf',
    pages=[1],
    pandas_options={'header': None},
)
df18_1

JavaNotFoundError: `java` command is not found from this Python process.Please ensure Java is installed and PATH is set for `java`