# 5. 數據的檢索、加工與存儲

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd
from pandas import Series, DataFrame

## 5.1 用 NumPy 和 pandas 對 csv 文件進行操作

In [3]:
import numpy as np
import pandas as pd

np.random.seed(42)

a = np.random.randn(3, 4)
a

array([[ 0.49671415, -0.1382643 ,  0.64768854,  1.52302986],
       [-0.23415337, -0.23413696,  1.57921282,  0.76743473],
       [-0.46947439,  0.54256004, -0.46341769, -0.46572975]])

In [4]:
a[2][2] = np.nan
a

array([[ 0.49671415, -0.1382643 ,  0.64768854,  1.52302986],
       [-0.23415337, -0.23413696,  1.57921282,  0.76743473],
       [-0.46947439,  0.54256004,         nan, -0.46572975]])

In [5]:
np.savetxt('np.csv', a, fmt='%.2f', delimiter=',', header=" #1,  #2,  #3,  #4")

In [6]:
!cat np.csv

#  #1,  #2,  #3,  #4
0.50,-0.14,0.65,1.52
-0.23,-0.23,1.58,0.77
-0.47,0.54,nan,-0.47


In [7]:
df = pd.DataFrame(a)
df

Unnamed: 0,0,1,2,3
0,0.496714,-0.138264,0.647689,1.52303
1,-0.234153,-0.234137,1.579213,0.767435
2,-0.469474,0.54256,,-0.46573


In [8]:
df.to_csv('pd.csv', float_format='%.2f', na_rep="NAN!")
# na_rep 參數指定 在 csv檔案中，用那一種符號代表 NaN

In [9]:
!cat pd.csv
# 比起 numpy.savetxt() 所儲存的格式，pd.to_csv() 多出 index 欄位

,0,1,2,3
0,0.50,-0.14,0.65,1.52
1,-0.23,-0.23,1.58,0.77
2,-0.47,0.54,NAN!,-0.47


## 5.2 NumPy.npy 與 pandas DataFrame

In [10]:
import numpy as np
import pandas as pd
from tempfile import NamedTemporaryFile
from os.path import getsize

np.random.seed(42)
a = np.random.randn(365, 4)

tmpf = NamedTemporaryFile()
np.savetxt(tmpf, a, delimiter=',')  # savetxt 存成一個text檔案
print("Size CSV file", getsize(tmpf.name))

Size CSV file 32616


In [11]:
tmpf = NamedTemporaryFile()
np.save(tmpf, a)  # save 存成一個 .npy 的 binary 檔案
tmpf.seek(0)
loaded = np.load(tmpf)  # 從 .npy 的 binary 檔案讀回來
print("Shape", loaded.shape)
print("Size .npy file", getsize(tmpf.name))

Shape (365, 4)
Size .npy file 11760


In [17]:
tmpf = NamedTemporaryFile()
tmpf.close()
df = pd.DataFrame(a)
df.to_pickle(tmpf.name)  # 用 pandas 轉成 pickle
print("Size pickled dataframe", getsize(tmpf.name))

# 從 pickle 讀回來成為 pandas.dataframe
# print("DF from pickle\n", pd.read_pickle(tmpf.name))  
pd.read_pickle(tmpf.name)

Size pickled dataframe 12250


Unnamed: 0,0,1,2,3
0,0.496714,-0.138264,0.647689,1.523030
1,-0.234153,-0.234137,1.579213,0.767435
2,-0.469474,0.542560,-0.463418,-0.465730
3,0.241962,-1.913280,-1.724918,-0.562288
4,-1.012831,0.314247,-0.908024,-1.412304
5,1.465649,-0.225776,0.067528,-1.424748
6,-0.544383,0.110923,-1.150994,0.375698
7,-0.600639,-0.291694,-0.601707,1.852278
8,-0.013497,-1.057711,0.822545,-1.220844
9,0.208864,-1.959670,-1.328186,0.196861


## 5.3 使用 PyTables 儲存數據

In [19]:
import numpy as np
import tables
from tempfile import NamedTemporaryFile
from os.path import getsize

np.random.seed(42)
a = np.random.randn(365, 4)

tmpf = NamedTemporaryFile()
print('tmpf.name:',tmpf.name)
tmpf.close()

h5file = tables.open_file(tmpf.name, mode='w', title="NumPy Array")
root = h5file.root

# root 像是一個資料夾，用 create_array 在 root 這個資料夾中建立一個 name 是 'array' 的 array a 
h5file.create_array(where = root, name = "array", obj = a)  

h5file.close()

tmpf.name: C:\Users\Wei\AppData\Local\Temp\tmpe_n4tmmb


In [20]:
h5file = tables.open_file(tmpf.name, "r")
print(getsize(tmpf.name))

for node in h5file.iter_nodes(h5file.root):  # 抓取 root 這個資料夾中的所有 nodes
    b = node.read()  # 把每個 node 的資料讀出來
    print(type(b), b.shape)
    
h5file.close()

13824
<class 'numpy.ndarray'> (365, 4)


## 5.4 Pandas DataFrame 和 HDF5 倉庫之間的讀寫操作

In [21]:
import numpy as np
import pandas as pd
from tempfile import NamedTemporaryFile

np.random.seed(42)
a = np.random.randn(365, 4)

tmpf = NamedTemporaryFile()
tmpf

<tempfile._TemporaryFileWrapper at 0xb7220f0>

In [24]:
tmpf.name
tmpf.close()

In [25]:
store = pd.io.pytables.HDFStore(tmpf.name)  # 建立一個 HDFStore 物件, 基本上是一個 dict
print(store)  # 空的

<class 'pandas.io.pytables.HDFStore'>
File path: C:\Users\Wei\AppData\Local\Temp\tmp6yzaar0_
Empty


In [26]:
type(store)  # HDFStore 類別物件

pandas.io.pytables.HDFStore

In [27]:
pd.io.pytables.HDFStore.__mro__  # HDFStore 的繼承架構

(pandas.io.pytables.HDFStore, pandas.core.base.StringMixin, object)

In [28]:
df = pd.DataFrame(a)
df.tail()

Unnamed: 0,0,1,2,3
360,0.662881,1.173474,0.181022,-1.296832
361,0.399688,-0.651357,-0.528617,0.586364
362,1.238283,0.021272,0.308833,1.702215
363,0.240753,2.601683,0.56551,-1.760763
364,0.753342,0.381158,1.289753,0.673181


In [29]:
store['df'] = df  # 儲存一個 DataFrame
print(store)

<class 'pandas.io.pytables.HDFStore'>
File path: C:\Users\Wei\AppData\Local\Temp\tmp6yzaar0_
/df            frame        (shape->[365,4])


In [30]:
d = store.get('df')
d.tail()

Unnamed: 0,0,1,2,3
360,0.662881,1.173474,0.181022,-1.296832
361,0.399688,-0.651357,-0.528617,0.586364
362,1.238283,0.021272,0.308833,1.702215
363,0.240753,2.601683,0.56551,-1.760763
364,0.753342,0.381158,1.289753,0.673181


In [31]:
print("Get", store.get('df').shape)  # 用 dict.get()

Get (365, 4)


In [32]:
print("Lookup", store['df'].shape)  # 用 dict[]

Lookup (365, 4)


In [33]:
print("Dotted", store.df.shape)  # 用 dict.xx

Dotted (365, 4)


In [34]:
del store['df']  # 刪除資料屬性
print("After del\n", store)

After del
 <class 'pandas.io.pytables.HDFStore'>
File path: C:\Users\Wei\AppData\Local\Temp\tmp6yzaar0_
Empty


In [35]:
print("Before close, store.is_open:", store.is_open)
store.close()  # 關閉
print("After close,store.is_open:", store.is_open)

Before close, store.is_open: True
After close,store.is_open: False


In [36]:
df.to_hdf(tmpf.name, mode = 'w', key = 'data', format='table')  # 直接將 DataFrame 的資料 寫到一個 HDF5 檔案
# print(pd.read_hdf(tmpf.name, 'data')  # 直接讀取底層的 HDF5 檔案，並做過濾
pd.read_hdf(tmpf.name, key = 'data', where=['index>360'])

Unnamed: 0,0,1,2,3
361,0.399688,-0.651357,-0.528617,0.586364
362,1.238283,0.021272,0.308833,1.702215
363,0.240753,2.601683,0.56551,-1.760763
364,0.753342,0.381158,1.289753,0.673181


## 5.5 使用 pandas 讀寫 Excel 文件

In [40]:
import numpy as np
import pandas as pd
from tempfile import NamedTemporaryFile 

tmpf = NamedTemporaryFile(suffix='.xlsx')
print(tmpf.name)
tmpf.close()

C:\Users\Wei\AppData\Local\Temp\tmpc02es1ve.xlsx


In [41]:
np.random.seed(42)
a = np.random.randn(365, 4)
df = pd.DataFrame(a)
df.tail()

Unnamed: 0,0,1,2,3
360,0.662881,1.173474,0.181022,-1.296832
361,0.399688,-0.651357,-0.528617,0.586364
362,1.238283,0.021272,0.308833,1.702215
363,0.240753,2.601683,0.56551,-1.760763
364,0.753342,0.381158,1.289753,0.673181


In [42]:
df.to_excel(tmpf.name, sheet_name='Random Data')  # 將資料寫入 Excel 檔案
# print("Means\n", pd.read_excel(tmpf.name, 'Random Data').mean())
pd.read_excel(tmpf.name, sheetname = 'Random Data').mean()  # 從 Excel 檔案 讀回資料

0    0.037860
1    0.024483
2    0.059836
3    0.058417
dtype: float64

## 5.6 使用 REST Web 服務 和 JSON

In [43]:
import json

json_str = '{"country":"Netherlands","dma_code":"0","timezone":"Europe\/Amsterdam","area_code":"0","ip":"46.19.37.108","asn":"AS196752","continent_code":"EU","isp":"Tilaa V.O.F.","longitude":5.75,"latitude":52.5,"country_code":"NL","country_code3":"NLD"}'

data = json.loads(json_str)  # json.loads() ： 將字串讀入成為物件
data

{'area_code': '0',
 'asn': 'AS196752',
 'continent_code': 'EU',
 'country': 'Netherlands',
 'country_code': 'NL',
 'country_code3': 'NLD',
 'dma_code': '0',
 'ip': '46.19.37.108',
 'isp': 'Tilaa V.O.F.',
 'latitude': 52.5,
 'longitude': 5.75,
 'timezone': 'Europe/Amsterdam'}

In [44]:
data["country"]

'Netherlands'

In [45]:
data["country"] = "Brazil"
print(json.dumps(data))  # json.dumps() ： 將資料轉成字串

{"longitude": 5.75, "ip": "46.19.37.108", "country": "Brazil", "dma_code": "0", "isp": "Tilaa V.O.F.", "asn": "AS196752", "country_code": "NL", "country_code3": "NLD", "timezone": "Europe/Amsterdam", "continent_code": "EU", "latitude": 52.5, "area_code": "0"}


## 5.7 使用 pandas 讀寫 JSON

In [46]:
import pandas as pd

json_str = '{"country":"Netherlands","dma_code":"0","timezone":"Europe\/Amsterdam","area_code":"0","ip":"46.19.37.108","asn":"AS196752","continent_code":"EU","isp":"Tilaa V.O.F.","longitude":5.75,"latitude":52.5,"country_code":"NL","country_code3":"NLD"}'

data = pd.read_json(json_str, typ='series')  # read_json(): 將字串轉成物件
data

area_code                        0
asn                       AS196752
continent_code                  EU
country                Netherlands
country_code                    NL
country_code3                  NLD
dma_code                         0
ip                    46.19.37.108
isp                   Tilaa V.O.F.
latitude                      52.5
longitude                     5.75
timezone          Europe/Amsterdam
dtype: object

In [47]:
data["country"] = "Brazil"
data.to_json()  # to_json()： 將物件轉成字串

'{"area_code":"0","asn":"AS196752","continent_code":"EU","country":"Brazil","country_code":"NL","country_code3":"NLD","dma_code":"0","ip":"46.19.37.108","isp":"Tilaa V.O.F.","latitude":52.5,"longitude":5.75,"timezone":"Europe\\/Amsterdam"}'

## 5.8 解析 RSS 和 Atom 訂閱

In [48]:
# import feedparser as fp

# rss = fp.parse("http://www.packtpub.com/rss.xml")
# print("# Entries", len(rss.entries))

# for i, entry in enumerate(rss.entries):
#     if "Python" in entry.summary:
#         print(i, entry.title)
#         print(entry.summary)

## 5.9 使用 BeautifulSoup 解析 html

In [49]:
from bs4 import BeautifulSoup
import re

soup = BeautifulSoup(open('loremIpsum.html'), 'html5lib')

soup.div  # First div

<div class="tile">
   <h4>Development</h4>
     0.10.1 - July 2014<br/>
 </div>

In [50]:
soup.div['class']  # First div class

['tile']

In [51]:
soup.dl.dt.dfn.text  # First dfn text

'Quare attende, quaeso.'

In [52]:
for link in soup.find_all('a'):
    print("Link text:", link.string, "URL", link.get('href'))

Link text: loripsum.net URL http://loripsum.net/
Link text: Poterat autem inpune; URL http://loripsum.net/
Link text: Is es profecto tu. URL http://loripsum.net/


In [53]:
# Omitting find_all
for i, div in enumerate(soup('div')):
    print(i, div.contents)

0 ['\n   ', <h4>Development</h4>, '\n     0.10.1 - July 2014', <br/>, '\n ']
1 ['\n     ', <h4>Official Release</h4>, '\n     0.10.0 June 2014', <br/>, '\n']
2 ['\n     ', <h4>Previous Release</h4>, '\n     0.09.1 June 2013', <br/>, '\n']


In [54]:
#Div with id=official
official_div = soup.find_all("div", id="official")
print("Official Version:", official_div[0].contents[2].strip())

Official Version: 0.10.0 June 2014


In [55]:
soup.find_all(class_=True)

[<div class="tile">
    <h4>Development</h4>
      0.10.1 - July 2014<br/>
  </div>, <div class="tile" id="official">
      <h4>Official Release</h4>
      0.10.0 June 2014<br/>
 </div>, <div class="notile">
      <h4>Previous Release</h4>
      0.09.1 June 2013<br/>
 </div>]

In [56]:
print("# elements with class:", len(soup.find_all(class_=True)))

# elements with class: 3


In [57]:
tile_class = soup.find_all("div", class_="tile")
tile_class

[<div class="tile">
    <h4>Development</h4>
      0.10.1 - July 2014<br/>
  </div>, <div class="tile" id="official">
      <h4>Official Release</h4>
      0.10.0 June 2014<br/>
 </div>]

In [58]:
print("# Tile classes:", len(tile_class))

# Tile classes: 2


In [59]:
soup.find_all("div", class_=re.compile("tile"))

[<div class="tile">
    <h4>Development</h4>
      0.10.1 - July 2014<br/>
  </div>, <div class="tile" id="official">
      <h4>Official Release</h4>
      0.10.0 June 2014<br/>
 </div>, <div class="notile">
      <h4>Previous Release</h4>
      0.09.1 June 2013<br/>
 </div>]

In [60]:
print("# Divs with class containing tile", len(soup.find_all("div", class_=re.compile("tile"))))

# Divs with class containing tile 3


In [61]:
soup.select('div.notile')  # Using CSS selector

[<div class="notile">
      <h4>Previous Release</h4>
      0.09.1 June 2013<br/>
 </div>]

In [62]:
print("Selecting ordered list list items\n", soup.select("ol > li")[:2])

Selecting ordered list list items
 [<li>Cur id non ita fit?</li>, <li>In qua si nihil est praeter rationem, sit in una virtute finis bonorum;</li>]


In [63]:
print("Second list item in ordered list", soup.select("ol > li:nth-of-type(2)"))

Second list item in ordered list [<li>In qua si nihil est praeter rationem, sit in una virtute finis bonorum;</li>]


In [64]:
print("Searching for text string", soup.find_all(text=re.compile("2014")))

Searching for text string ['\n     0.10.1 - July 2014', '\n     0.10.0 June 2014']
