In [1]:
#
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from numpy import nan as NA
import sys

In [2]:
#
import matplotlib.pylab as plt
from numpy.random import randn
%matplotlib inline

# Chapter 6: Data Loading, Storage, and File Formats

Display the content of examples/ex1.csv

In [3]:
!cat examples/ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

Create a DataFrame df from the ex1.csv file

In [4]:
# Load the comma-separated example file; its first line provides the column labels.
df = pd.read_csv(filepath_or_buffer='examples/ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


Create df again, but use read_table instead

In [5]:
# read_table assumes tab-separated input by default, so the comma delimiter
# has to be stated explicitly for this file.
df = pd.read_table(
    'examples/ex1.csv',
    delimiter=',',
)
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


Create a DataFrame from 'examples/ex2.csv', which has no header row

In [6]:
# header=None: the file has no header row, so pandas numbers the columns 0..4.
pd.read_csv(
    'examples/ex2.csv',
    header=None,
)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


Create the DataFrame above again, but now assign column names of ['a', 'b', 'c', 'd', 'message']

In [7]:
# The file carries no header row, so supply the column labels ourselves.
column_names = list('abcd') + ['message']
pd.read_csv('examples/ex2.csv', names=column_names, header=None)

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


Create the DataFrame from ex2.csv again, assign the same column names as in the previous question, and use 'message' as the index

In [8]:
# Label the columns by hand, then promote 'message' to the row index.
pd.read_csv(
    'examples/ex2.csv',
    names=['a', 'b', 'c', 'd', 'message'],
    header=None,
    index_col='message',
)

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


Display the content of 'examples/csv_mindex.csv'

In [9]:
!cat 'examples/csv_mindex.csv'

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


Create DataFrame 'parsed' from csv_mindex.csv, and use key1 and key2 as multilevel index

In [10]:
# Passing a list to index_col builds a hierarchical (key1, key2) row index.
index_levels = ['key1', 'key2']
parsed = pd.read_csv('examples/csv_mindex.csv', index_col=index_levels)
parsed

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


Display examples/ex3.txt to observe its delimiter

In [11]:
!cat examples/ex3.txt

            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491


Create a DataFrame 'result' from ex3.txt

In [12]:
# The fields are separated by a variable amount of whitespace, so split on a
# regular expression. The raw string keeps '\s' from being read as an
# (invalid) Python string escape, which newer interpreters warn about.
result = pd.read_table('examples/ex3.txt', sep=r'\s+')
result

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


Display examples/ex4.csv to observe which rows should be skipped.

In [13]:
!cat examples/ex4.csv

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

Create a DataFrame from examples/ex4.csv. Skip the comment rows.

In [14]:
# Lines 0, 2 and 3 (zero-based) are the '#' remarks; drop them by position.
comment_rows = [0, 2, 3]
pd.read_csv('examples/ex4.csv', skiprows=comment_rows)

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


Display examples/ex5.csv, and observe the missing data

In [15]:
!cat examples/ex5.csv

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

In [16]:
# Flag every cell that pandas parsed as missing (the 'NA' token and the empty field).
result = pd.read_csv('examples/ex5.csv')
pd.isna(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


Create a data frame from ex5.csv, and define 'foo' and 1 as missing values

In [17]:
# Treat the string 'foo' and the number 1 as missing, in addition to the
# markers pandas already recognises (the 'NA' cell still comes back missing).
extra_na = ['foo', 1]
result = pd.read_csv('examples/ex5.csv', na_values=extra_na)
result

Unnamed: 0,something,a,b,c,d,message
0,one,,2,3.0,4,
1,two,5.0,6,,8,world
2,three,9.0,10,11.0,12,


Create a data frame from ex5.csv. For the column of 'message', define 'foo' and 'NA' as missing values; for the column of 'something', define 'two' as missing value

In [18]:
# A dict of na_values applies each set of markers to its own column only.
na_by_column = {
    'message': ['foo', 'NA'],
    'something': 'two',
}
pd.read_csv('examples/ex5.csv', na_values=na_by_column)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


Create a data frame from the first 5 rows of examples/ex6.csv

In [19]:
# nrows limits parsing to the first five data rows of the file.
pd.read_csv(
    'examples/ex6.csv',
    nrows=5,
)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


Create a TextParser from ex6.csv by chunksize of 1000; then count the number of each unique value in the column of 'key'

In [20]:
# chunksize is documented as an integer row count; pass 1000 rather than the
# float 1e3 so the call does not rely on pandas coercing the value.
chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)

In [21]:
# Running total of 'key' frequencies accumulated across all chunks.
# An explicit dtype avoids the ambiguous default dtype of an empty Series
# (deprecated in pandas 1.x, object in 2.x) and keeps the float64 result.
# NOTE: this loop consumes chunker; re-create it before re-running this cell.
tot = Series([], dtype='float64')
for piece in chunker:
    # fill_value=0 treats a key missing from either side as a zero count.
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
tot

0    151.0
1    146.0
2    152.0
3    162.0
4    171.0
5    157.0
6    166.0
7    164.0
8    162.0
9    150.0
A    320.0
B    302.0
C    286.0
D    320.0
E    368.0
F    335.0
G    308.0
H    330.0
I    327.0
J    337.0
K    334.0
L    346.0
M    338.0
N    306.0
O    343.0
P    324.0
Q    340.0
R    318.0
S    308.0
T    304.0
U    326.0
V    328.0
W    305.0
X    364.0
Y    314.0
Z    288.0
dtype: float64

Create a data frame 'data' from examples/ex5.csv, and write it to out.csv, then display it

In [22]:
# Re-read ex5.csv with default settings into 'data' for the export examples.
data = pd.read_csv(filepath_or_buffer='examples/ex5.csv')

In [23]:
# Write the frame, row index included, as comma-separated text.
data.to_csv(path_or_buf='out.csv')

In [24]:
!cat out.csv

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


Write data to the screen (standard output), and set '|' as the delimiter

In [25]:
# Stream the CSV text to standard output with '|' between fields.
data.to_csv(
    sys.stdout,
    sep='|',
)

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


Write data to the screen (standard output), and display missing values as 'NULL'

In [26]:
# Missing values are rendered as the literal text NULL instead of an empty field.
data.to_csv(
    sys.stdout,
    na_rep='NULL',
)

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


Repeat the last question, but omit the index and the column headers

In [27]:
# Emit only the data rows: no index column and no header line.
export_options = dict(na_rep='NULL', index=False, header=False)
data.to_csv(sys.stdout, **export_options)

one,1,2,3.0,4,NULL
two,5,6,NULL,8,world
three,9,10,11.0,12,foo


Write data to the screen display; skip the index; only write the columns of 'a', 'b', and 'c'

In [28]:
# Restrict the output to columns a, b and c, without the row index.
selected = ['a', 'b', 'c']
data.to_csv(sys.stdout, columns=selected, index=False, na_rep='NULL')

a,b,c
1,2,3.0
5,6,NULL
9,10,11.0


Display examples/ex7.csv and observe its format

In [29]:
!cat examples/ex7.csv

"a","b","c"
"1","2","3"
"1","2","3"


Use csv package to convert ex7.csv to a dict {'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}. Then create a DataFrame df from this dict.

In [30]:
import csv

In [31]:
# Open with newline='' as the csv module requires, so that newlines embedded
# in quoted fields and '\r\n' line endings are handled by the csv reader.
with open('examples/ex7.csv', newline='') as f:
    reader = csv.reader(f)
    # Materialise all rows while the file is still open.
    lines = list(reader)

# The first row holds the column names; the remaining rows are the records.
headers = lines[0]
values = lines[1:]
# zip(*values) transposes rows into columns, pairing each header with a tuple
# of that column's values.
data_dic = dict(zip(headers, zip(*values)))

In [32]:
data_dic

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [33]:
# Each dict key becomes a column; the tuple under it supplies that column's rows.
pd.DataFrame(data=data_dic)

Unnamed: 0,a,b,c
0,1,2,3
1,1,2,3


Load examples/example.json by the json package, and return the result

In [34]:
import json

In [35]:
# JSON text is UTF-8 by specification; state the encoding explicitly so the
# read does not depend on the platform's default locale encoding.
with open('examples/example.json', encoding='utf-8') as f:
    result = json.load(f)

In [36]:
result

[{'a': 1, 'b': 2, 'c': 3}, {'a': 4, 'b': 5, 'c': 6}, {'a': 7, 'b': 8, 'c': 9}]

Now save result to a new json file examples/newJson.json

In [37]:
# Write with an explicit UTF-8 encoding so the output file is produced the
# same way regardless of the platform's default locale encoding.
with open('examples/newJson.json', 'w', encoding='utf-8') as f:
    json.dump(result, f)