In [2]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

1、文本格式数据的读写

In [3]:
# Reading tabular data into a DataFrame is one of pandas' key features.
# read_csv and read_table are the most frequently used functions.
# read_csv uses a comma as its default delimiter; read_table uses a tab ('\t').

In [4]:
# Show the raw text of the example CSV through a shell escape (needs a `cat` command on the host).
!cat examples/ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [5]:
# Read the CSV file into a DataFrame with read_csv.
df = pd.read_csv('examples/ex1.csv')
# read_table works as well when the delimiter is given explicitly:
#df = pd.read_table('examples/ex1.csv', sep=',')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [6]:
# 对于不包含表头行的文件，pandas可以自动默认分配，也可以手动指定列名
pd.read_csv('examples/ex2.csv', header=None) # 默认
pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'message'])

Unnamed: 0,a,b,c,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


1.2 将数据写入文本格式

In [10]:
import sys

# Load the example file, then write it to standard output as CSV text:
# only columns a, b and c, without the row index or the header line, and with
# missing values rendered as the string NULL.
data = pd.read_csv('examples/ex1.csv')
data.to_csv(
    sys.stdout,
    columns=['a', 'b', 'c'],
    header=False,
    index=False,
    na_rep='NULL',
)

1,2,3
5,6,7
9,10,11


1.3 使用分隔格式

In [11]:
import csv

# Open with newline='' as the csv module documentation asks for file objects
# handed to csv.reader, so that the reader does its own newline handling
# (this matters for newlines embedded in quoted fields).
# The handle is left open on purpose here: `reader` pulls rows from `f` lazily
# when it is iterated, so `f` may only be closed after the rows are consumed.
f = open('examples/ex3.csv', newline='')
reader = csv.reader(f)

In [12]:
# Iterating the reader yields each row as a list of strings.
for line in reader:
    print(line)
# All rows have been consumed, so release the file handle that backs `reader`
# instead of leaving it open for the rest of the kernel session.
f.close()

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']


In [13]:
# newline='' lets the csv module do its own newline handling, as its
# documentation recommends for file objects passed to csv.reader.
with open('examples/ex3.csv', newline='') as f:
    lines = list(csv.reader(f))
# The first parsed row holds the column names; the remaining rows are data.
header, values = lines[0], lines[1:]
header

['a', 'b', 'c']

In [14]:
# The data rows, i.e. every parsed line after the header row.
values

[['1', '2', '3'], ['1', '2', '3']]

In [16]:
# zip(*values) transposes the rows into per-column tuples; pairing those with
# the header names gives a mapping of column name -> column values.
data_dict = dict(zip(header, zip(*values)))
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

1.4 JSON数据

In [17]:
# A JSON document kept in a Python string; a later cell parses it with json.loads.
obj = """
{"name": "Wes",
 "places_lived": ["China", "Hong Kong", "Canada"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
              {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}]
 }
"""

In [19]:
import json

# json.loads parses a JSON string into the corresponding Python objects
# (dicts, lists, strings, numbers, None).
result = json.loads(obj)
result

{'name': 'Wes',
 'places_lived': ['China', 'Hong Kong', 'Canada'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},
  {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}

In [21]:
# json.dumps serializes a Python object back into a JSON string.
asjson = json.dumps(result)
asjson

'{"name": "Wes", "places_lived": ["China", "Hong Kong", "Canada"], "pet": null, "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]}, {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}]}'

In [22]:
# Build a DataFrame from the list of dicts stored under 'siblings', naming the
# columns to include (and their order) explicitly.
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age', 'pets'])
siblings

Unnamed: 0,name,age,pets
0,Scott,30,"[Zeus, Zuko]"
1,Katie,38,"[Sixes, Stache, Cisco]"


In [23]:
# By default, read_json assumes that each object in the JSON array is one row of the table.
data = pd.read_json('examples/ex4.json')
data

Unnamed: 0,a,b,c
0,1,2,3
1,1,2,3
2,1,2,3


In [24]:
print(data.to_json()) # default orientation: organized by column

{"a":{"0":1,"1":1,"2":1},"b":{"0":2,"1":2,"2":2},"c":{"0":3,"1":3,"2":3}}


In [27]:
print(data.to_json(orient='records')) # organized by row: one JSON object per row

[{"a":1,"b":2,"c":3},{"a":1,"b":2,"c":3},{"a":1,"b":2,"c":3}]


1.5 XML和HTML：网络抓取