# 2. Pandas - IO tools

In [2]:
# Explicit imports replace %pylab, whose star-imports flood the namespace
# and hide where names such as `np` come from.
import numpy as np
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
import pandas as pd

Using matplotlib backend: Qt4Agg
Populating the interactive namespace from numpy and matplotlib


## 讀寫文本格式的數據

將text轉換為DataFrame的函數，其選項分為:
- 索引
- 類型推斷 和 數據轉換
- 日期解析
- 迭代
- 不規整數據問題

類型推斷(type inference)是最重要的功能之一，不需要指定列的資料型態

In [3]:
# Print the raw contents of ex1.csv with the Unix `cat` shell command.
!cat ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [4]:
# Print the same file with `type`, the Windows counterpart of `cat`.
!type ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [5]:
# read_csv loads a comma-separated file into a DataFrame;
# the first line of the file supplies the column names.
df = pd.read_csv('ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [6]:
# read_table can load the same file, but the delimiter must be given explicitly
df = pd.read_table('ex1.csv', sep=',')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [7]:
# A file that has no header row
!type ex2.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [8]:
# By default the first line of the file is consumed as the header row
df = pd.read_csv('ex2.csv')
df

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


In [9]:
# Declare that the file has no header row; columns are numbered 0..n-1
df = pd.read_csv('ex2.csv', header=None)
df

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [10]:
# Supply custom column names for a header-less file
fields = list('abcd') + ['message']
df = pd.read_csv('ex2.csv', names=fields)
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [11]:
# index_col promotes one of the columns to be the DataFrame index
fields = [
    'a', 'b', 'c', 'd',
    'message',
]
df = pd.read_csv('ex2.csv', names=fields, index_col='message')
df

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [12]:
# 可以 使用 index_col 參數，將多個欄設定為DataFrame的層次化索引 
!type ex3.csv
df = pd.read_csv('ex3.csv', index_col = ['key1', 'key2'])
df

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [13]:
# 如果不是以固定的分隔符號來分隔字段，可以用 read_table + regex 作為 sep參數
# 由於列名比資料列的數量少，因此read_table推斷第一列應該是DataFrame的索引
# 以不定數量的空白做分隔
!type "ex3 - 1.csv"  
df = pd.read_table('ex3 - 1.csv', sep = '\s+')
df

	A	B	C
aaa  	-0.264  	-1.026  	-0.619
bbb	  0.927	  0.302	  -0.032
ccc          -0.265	   -0.385	    -0.217
	


Unnamed: 0,A,B,C
aaa,-0.264,-1.026,-0.619
bbb,0.927,0.302,-0.032
ccc,-0.265,-0.385,-0.217


In [14]:
# 讀檔時，可以用 skiprows 來跳過指定的 rows
!type ex4.csv
df = pd.read_csv('ex4.csv', skiprows = [0, 2, 3], index_col = 'message')
df

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who read CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [15]:
# 缺失數據的處理
# read_csv 會自動判斷，然後以NaN標示缺失數據的位置
!type ex5.csv
df = pd.read_csv('ex5.csv', index_col = 'something')
df

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo 


Unnamed: 0_level_0,a,b,c,d,message
something,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [16]:
# isnull() reports, element by element, whether a value is NaN
df.isnull()

Unnamed: 0_level_0,a,b,c,d,message
something,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,False,False,False,False,True
two,False,False,True,False,False
three,False,False,False,False,False


In [17]:
# The top-level function form gives the same boolean mask as df.isnull()
pd.isnull(df)

Unnamed: 0_level_0,a,b,c,d,message
something,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,False,False,False,False,True
two,False,False,True,False,False
three,False,False,False,False,False


In [18]:
# na_values lists strings that should be read as missing data
df = pd.read_csv(
    'ex5.csv',
    index_col='something',
    na_values=['NULL'],
)
df

Unnamed: 0_level_0,a,b,c,d,message
something,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [19]:
# A dict gives each column its own set of missing-value markers
sentinels = {
    'message': ['foo', 'NA'],
    'something': ['two'],
}
df = pd.read_csv('ex5.csv', na_values=sentinels)
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,foo


### 逐塊讀取文本文件

In [20]:
# 設定 nrows參數，設定讀入的列數
!type ex5.csv
df = pd.read_csv('ex5.csv', nrows = 2)
df

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo 


Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world


In [21]:
# 如果要逐塊讀取，則設定chunksize
!type ex5.csv
chunker = pd.read_csv('ex5.csv', chunksize = 2)
chunker

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo 


<pandas.io.parsers.TextFileReader at 0x85cfba8>

In [22]:
# Count how often each 'something' value occurs, accumulating chunk by chunk.
# The reader is re-created here because it is exhausted after one pass;
# without this, re-running the cell alone would produce an empty result.
chunker = pd.read_csv('ex5.csv', chunksize=2)
# Explicit dtype: an empty Series without one is deprecated and would become
# object dtype on newer pandas instead of the float64 counts shown below.
tot = Series([], dtype='float64')
for piece in chunker:
    # fill_value=0 treats a label missing on either side as a zero count
    tot = tot.add(piece['something'].value_counts(), fill_value=0)
tot = tot.sort_values(ascending=False)
tot

two      1.0
three    1.0
one      1.0
dtype: float64

## 將數據寫出到文本格式

In [23]:
# Show the raw file, then load it with the default integer index
!type ex5.csv
df = pd.read_csv('ex5.csv')
df

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo 


Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [24]:
# to_csv() writes the data out as a comma-separated file
df.to_csv('ex5-1.csv')
!type "ex5-1.csv"

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo 


In [25]:
# 寫出的時候，可以設定 sep 參數 指定其他的分隔符號
df.to_csv('ex5-1.csv', sep = '|')
!type "ex5-1.csv"

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo 


In [26]:
# 設定 na_rep 參數，以其他的符號 明確地標示 缺失值
df.to_csv('ex5-1.csv', na_rep = 'NULL')
!type "ex5-1.csv"

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo 


In [27]:
# 可以禁止列出 row, column的標籤
# 不輸出index、header
df.to_csv('ex5-1.csv', na_rep = 'NULL', index = False, header = False) 
!type "ex5-1.csv"

one,1,2,3.0,4,NULL
two,5,6,NULL,8,world
three,9,10,11.0,12,foo 


In [28]:
# 不輸出index
df.to_csv('ex5-1.csv', na_rep = 'NULL', index = False) 
!type "ex5-1.csv"

something,a,b,c,d,message
one,1,2,3.0,4,NULL
two,5,6,NULL,8,world
three,9,10,11.0,12,foo 


In [29]:
# 設定 cols 參數，只寫出一部分的欄位
df
df.to_csv("ex5-1.csv", index = False, cols = ['a', 'b', 'c']) # 好像無效呢?
!type "ex5-1.csv"

something,a,b,c,d,message
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo 


In [30]:
# Series has a to_csv method as well; begin by building a daily date index
dates = pd.date_range(start='1/1/2000', periods=7)
dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07'],
              dtype='datetime64[ns]', freq='D')

In [31]:
# Integers 0..6 labelled by the seven dates built above
ts = Series(
    np.arange(7),
    index=dates,
)
ts

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
Freq: D, dtype: int32

In [32]:
# Series物件 也有to_csv方法
ts.to_csv('treseries.csv')
!type "treseries.csv"

2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6


In [33]:
# Series類別 也有to_csv方法 (頂層)
Series.to_csv(ts, 'treseries.csv')
!type "treseries.csv

2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6


In [34]:
# Read the file back into a Series.
# Series.from_csv is deprecated and absent from current pandas; read_csv with
# header=None and index_col=0 is the equivalent. The first column holds
# dates, so parse_dates=True turns the index into datetimes.
ts = pd.read_csv(
    'treseries.csv',
    header=None,
    index_col=0,
    parse_dates=True,
).iloc[:, 0]
# Clear the positional labels left by header=None so the Series is unnamed
ts.index.name = ts.name = None
ts

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
dtype: int64

## JSON(JavaScript Object Notation)數據

In [35]:
# Sample JSON document held as a Python string: it has a string, an array,
# a null, and an array of nested objects.
obj = """
{
"name": "Wes", 
"place_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}, {"name": "Wei", "age": 25, "pet": "Cisco"}]
}
"""

In [36]:
# json.loads parses a JSON string back into a Python dict
import json

result = json.loads(obj)
result

{'name': 'Wes',
 'pet': None,
 'place_lived': ['United States', 'Spain', 'Germany'],
 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'},
  {'age': 25, 'name': 'Wei', 'pet': 'Cisco'}]}

In [37]:
# The parsed JSON object is a plain dict
type(result)

dict

In [38]:
# Chained indexing reaches values nested inside the dict
type(result['siblings'][0]['age'])

int

In [39]:
# json.dumps serialises a dict into a JSON string.
# Keep the two apart: the JSON *string* is text, while the parsed
# JSON *object* is simply a dict.
json.dumps(result)

'{"place_lived": ["United States", "Spain", "Germany"], "siblings": [{"pet": "Zuko", "age": 25, "name": "Scott"}, {"pet": "Cisco", "age": 25, "name": "Wei"}], "pet": null, "name": "Wes"}'

In [40]:
# The 'siblings' entry is a list of dicts, one per sibling
result['siblings']

[{'age': 25, 'name': 'Scott', 'pet': 'Zuko'},
 {'age': 25, 'name': 'Wei', 'pet': 'Cisco'}]

In [41]:
# Build a DataFrame from the parsed JSON records, then transpose it so that
# each sibling becomes a column
df_siblings = DataFrame(
    result['siblings'],
    columns=['age', 'name', 'pet'],
).transpose()
df_siblings

Unnamed: 0,0,1
age,25,25
name,Scott,Wei
pet,Zuko,Cisco


In [42]:
# DataFrame.to_json() serialises the frame into a JSON string
siblings_json_string = df_siblings.to_json()
siblings_json_string

'{"0":{"age":25,"name":"Scott","pet":"Zuko"},"1":{"age":25,"name":"Wei","pet":"Cisco"}}'

In [43]:
# Parse the JSON string back into a dict of dicts
siblings_json = json.loads(siblings_json_string)
siblings_json

{'0': {'age': 25, 'name': 'Scott', 'pet': 'Zuko'},
 '1': {'age': 25, 'name': 'Wei', 'pet': 'Cisco'}}

In [44]:
# DataFrame.from_dict() rebuilds the frame from the dict (deserialisation)
df_siblings = DataFrame.from_dict(siblings_json)
df_siblings

Unnamed: 0,0,1
age,25,25
name,Scott,Wei
pet,Zuko,Cisco


## Web訊息收集

[Yahoo股票資料抓取](../%E7%B7%B4%E7%BF%92%20-%20%E8%82%A1%E7%A5%A8%E8%B3%87%E6%96%99%E5%BD%99%E6%95%B4_Yahoo%E8%82%A1%E5%B8%82%20-%20%E5%95%8F%E9%A1%8C.ipynb)

## 二進制數據格式

In [45]:
# pandas objects can be stored on disk in pickle format (done with
# to_pickle in a later cell); first load a frame to work with.
df = pd.read_csv('ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [46]:
# Confirm the object is a DataFrame before pickling it
type(df)

pandas.core.frame.DataFrame

In [47]:
# Write the frame to a pickle file, then remove the name from the namespace
# so the next cell has to restore it from disk
import pickle
df.to_pickle('ex1.pickle')
del df

In [48]:
# Restore the object from the pickle file.
# The `with` block closes the file handle, which a bare open() left dangling.
# NOTE: unpickling can execute arbitrary code, so only load trusted files;
# this one was written by df.to_pickle() earlier in the notebook.
with open('ex1.pickle', 'rb') as pickle_file:
    df = pickle.load(pickle_file)
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [49]:
# The restored object is a DataFrame again
type(df)

pandas.core.frame.DataFrame

### 讀取 Microsoft Excel文件

In [50]:
# Open the workbook with ExcelFile and parse one sheet into a DataFrame.
# ExcelFile() has no `header` argument (header handling belongs to parse()),
# so the stray keyword is dropped; the sheet's first row still supplies the
# column names, as in the output below. The `with` block closes the workbook.
with pd.ExcelFile('test.xls') as xls_file:
    table = xls_file.parse('Sheet1')
table

Unnamed: 0,時間,溫度,濕度
0,2016-02-01 10:35:00.000,12,40
1,2016-02-01 10:36:00.000,13,41
2,2016-02-01 10:36:59.995,14,42


In [51]:
# parse() returns an ordinary DataFrame
type(table)

pandas.core.frame.DataFrame

## 使用數據庫

In [52]:
# SQLite3 walk-through: create a table, fill it, and read it back

import sqlite3

# Open a throw-away in-memory database
con = sqlite3.connect(':memory:')

# Create the table; the connection's context manager commits on success
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER);
"""
with con:
    con.execute(query)

# Insert the rows through a parameterised statement
data = [
    ('Atlanta', 'Georgia', 1.25, 6),
    ('Tallahassee', 'Florida', 2.6, 3),
    ('Sacramento', 'California', 1.7, 5),
]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
with con:
    con.executemany(stmt, data)

# Query everything back as a list of tuples
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [53]:
# cursor.description carries the column metadata; the first item of each
# tuple is the column name
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

In [54]:
# Build a DataFrame from the fetched rows, taking the column names from the
# first item of each cursor.description entry
df = DataFrame(
    rows,
    columns=[name for name, *_ in cursor.description],
)
df

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


In [55]:
# pandas.io.sql runs the query and builds the DataFrame in a single step
import pandas.io.sql as sql
df = sql.read_sql(sql='select * from test', con=con)
df

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5
