In [2]:
import numpy as np
import pandas as pd

# Working with Data in Pandas
### Reading & Writing Text Files

In [5]:
# Load the comma-delimited file; its first row supplies the column names.
df1 = pd.read_csv(filepath_or_buffer='Test_File.csv')
df1

Unnamed: 0,q,r,s,t,apple
0,2,3,4,5,pear
1,a,s,d,f,rabbit
2,5,2,5,7,dog


Note what happened. We read a comma-delimited file into a DataFrame, and pandas used the first row as the column names. If we pass the `header=None` keyword argument, the first row is treated as data instead, and the columns are labelled with integers.

In [12]:
# Read the same file, but keep the top row as data rather than as column
# names; the columns receive integer labels instead.
df2 = pd.read_csv('Test_File.csv', header=None)
df2

Unnamed: 0,0,1,2,3,4
0,q,r,s,t,apple
1,2,3,4,5,pear
2,a,s,d,f,rabbit
3,5,2,5,7,dog


We can also import using a more general function. `pd.read_table()` can read a CSV file in the same way, but we need to specify the delimiter, because its default delimiter is a tab rather than a comma.

In [11]:
# Same import through the general-purpose reader, naming the comma
# delimiter explicitly.
df3 = pd.read_table('Test_File.csv', header=None, sep=',')
df3

Unnamed: 0,0,1,2,3,4
0,q,r,s,t,apple
1,2,3,4,5,pear
2,a,s,d,f,rabbit
3,5,2,5,7,dog


We can also limit the import to a specific number of rows with the `nrows` keyword argument.

In [14]:
# Read only the first two rows of the file; with no header row, both are data.
df4 = pd.read_csv('Test_File.csv', nrows=2, header=None)
df4

Unnamed: 0,0,1,2,3,4
0,q,r,s,t,apple
1,2,3,4,5,pear


**Note:** Apart from the first export below, which really does write a file to disk, the following commands pass `sys.stdout` as the destination, so the cell output shows what *would* have been written to the file.

In [16]:
import sys

In [19]:
# Write the DataFrame to disk as a comma-delimited file.
df2.to_csv(path_or_buf='DataFrame_Test_Output.csv')

In [20]:
# Send the CSV text to standard output instead of a file on disk.
df2.to_csv(path_or_buf=sys.stdout)

,0,1,2,3,4
0,q,r,s,t,apple
1,2,3,4,5,pear
2,a,s,d,f,rabbit
3,5,2,5,7,dog


In [21]:
# The delimiter is configurable: emit tab-separated text this time.
df2.to_csv(path_or_buf=sys.stdout, sep='\t')

	0	1	2	3	4
0	q	r	s	t	apple
1	2	3	4	5	pear
2	a	s	d	f	rabbit
3	5	2	5	7	dog


In [23]:
# Same export, using a pipe character as the delimiter.
df2.to_csv(path_or_buf=sys.stdout, sep='|')

|0|1|2|3|4
0|q|r|s|t|apple
1|2|3|4|5|pear
2|a|s|d|f|rabbit
3|5|2|5|7|dog


In [25]:
# Restrict the export to the columns labelled 0, 1 and 2.
df2.to_csv(path_or_buf=sys.stdout, columns=[0, 1, 2])

,0,1,2
0,q,r,s
1,2,3,4
2,a,s,d
3,5,2,5


### Reading & Writing JSON Data

In [26]:
import json

In [28]:
# Sample JSON document held as a multi-line string. It mixes the value kinds
# used in the cells below: plain strings, a list ("food"), a null ("clothes"),
# and a list holding one nested object ("diet").
json_obj = """
{   "zoo_animal": "Lion",
    "food": ["Meat", "Veggies", "Honey"],
    "fur": "Golden",
    "clothes": null, 
    "diet": [{"zoo_animal": "Gazelle", "food":"grass", "fur": "Brown"}]
}
"""

In [32]:
# Parse the JSON text into Python objects and display the result
# (the JSON null shows up as None in the output).
json_data = json.loads(json_obj)
json_data

{'clothes': None,
 'diet': [{'food': 'grass', 'fur': 'Brown', 'zoo_animal': 'Gazelle'}],
 'food': ['Meat', 'Veggies', 'Honey'],
 'fur': 'Golden',
 'zoo_animal': 'Lion'}

In [33]:
# Serialize the parsed Python object back into a JSON-formatted string;
# the string is displayed, not stored.
json.dumps(json_data)

'{"zoo_animal": "Lion", "clothes": null, "food": ["Meat", "Veggies", "Honey"], "diet": [{"zoo_animal": "Gazelle", "food": "grass", "fur": "Brown"}], "fur": "Golden"}'

In [37]:
# Build a DataFrame from the list of records stored under the 'diet' key.
df5 = pd.DataFrame(data=json_data['diet'])
df5

Unnamed: 0,food,fur,zoo_animal
0,grass,Brown,Gazelle


### XML and HTML

In [2]:
# Address of the FDIC failed-bank list page (HTML). Only the URL is stored
# here; nothing in this cell downloads or parses the page.
url = 'https://www.fdic.gov/bank/individual/failed/banklist.html'

We'll come back and update this section at a later date. At the time of writing, the Beautiful Soup package we would use to parse this page was not yet available for our Python 3.5 environment.

### Microsoft Excel Files

In [4]:
# Open the Excel workbook as an ExcelFile object; the cells below use it to
# list the sheet names and to parse a sheet into a DataFrame.
xlsx = pd.ExcelFile('Sample_Excel.xlsx')

In [13]:
# List the names of the sheets in the opened workbook.
xlsx.sheet_names

['Sheet1']

In [9]:
# Parse the sheet named 'Sheet1' into a DataFrame; the row index is the
# default integer range, as the output shows.
df6 = xlsx.parse('Sheet1')
df6

Unnamed: 0,Store Number,Area,Region,District
0,1234,1,5,3
1,5678,4,8,18
2,1357,8,27,17
3,2468,14,62,5
4,9876,18,56,10


In [10]:
# Parse the same sheet again, this time promoting its first column
# (position 0) to the row index.
df7 = xlsx.parse('Sheet1', index_col=0)
df7

Unnamed: 0_level_0,Area,Region,District
Store Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1234,1,5,3
5678,4,8,18
1357,8,27,17
2468,14,62,5
9876,18,56,10


In [15]:
# Close the Excel file. The parentheses matter: a bare `xlsx.close` only
# references the bound method (the notebook just displays its repr) and never
# runs it, so the workbook would stay open.
xlsx.close()