# 读写CSV数据

## 将数据读取为行序列

In [1]:
%%writefile stocks.csv
Symbol,Price,Data,Time,Change,Volume
"AA",39.48,"6/11/2007","9:36am",-0.18,181800
"AIG",71.38,"6/11/2007","9:36am",-0.15,195500
"AXP",62.58,"6/11/2007","9:36am",-0.46,935000
"BA",98.31,"6/11/2007","9:36am",+0.12,104800
"C",53.08,"6/11/2007","9:36am",-0.25,360900
"CAT",78.29,"6/11/2007","9:36am",-0.23,225400

Overwriting stocks.csv


In [2]:
import csv
# Read stocks.csv as plain rows: the first record is the header line and
# every following record is a list of strings.
# newline='' hands line-ending handling to the csv module, as the csv
# documentation recommends; without it, newlines embedded inside quoted
# fields are not interpreted correctly.
with open('stocks.csv', newline='') as f:
    f_csv = csv.reader(f)
    headers = next(f_csv)  # consume the header row before looping over data
    print(headers)
    for row in f_csv:
        print(row)

['Symbol', 'Price', 'Data', 'Time', 'Change', 'Volume']
['AA', '39.48', '6/11/2007', '9:36am', '-0.18', '181800']
['AIG', '71.38', '6/11/2007', '9:36am', '-0.15', '195500']
['AXP', '62.58', '6/11/2007', '9:36am', '-0.46', '935000']
['BA', '98.31', '6/11/2007', '9:36am', '+0.12', '104800']
['C', '53.08', '6/11/2007', '9:36am', '-0.25', '360900']
['CAT', '78.29', '6/11/2007', '9:36am', '-0.23', '225400']


- 上面代码中，row是一个由字符串组成的列表（csv.reader返回的每一行都是列表，而不是元组），要访问特定的字段（和标题相对应）需要用到索引，如row[0]表示Symbol，row[4]表示Change

## 由于这样的索引容易混淆，因此可以考虑使用命名元组

In [3]:
from collections import namedtuple
# Wrap each record in a namedtuple built from the header row so that fields
# can be read by name (row.Symbol) instead of by position.
# newline='' hands line-ending handling to the csv module, as the csv
# documentation recommends for any file object passed to csv.reader.
with open('stocks.csv', newline='') as f:
    f_csv = csv.reader(f)
    headers = next(f_csv)
    # The field names come straight from the header row, so every header
    # must be a valid Python identifier or namedtuple() raises ValueError.
    Row = namedtuple('Row', headers)
    for r in f_csv:
        row = Row(*r)
        print(row.Symbol)

AA
AIG
AXP
BA
C
CAT


- 以上方法只有在每一列的标头都是合法的Python标识符时才起作用；如果不是，就必须先调整原始的标头

## 另一种可行的方式是将数据读取为字典序列

In [4]:
# Read each record as a mapping keyed by the header row, so a field is
# looked up by column name rather than by index.
# newline='' hands line-ending handling to the csv module, as the csv
# documentation recommends for any file object passed to a reader.
with open('stocks.csv', newline='') as f:
    f_csv = csv.DictReader(f)  # the first line of the file supplies the keys
    for row in f_csv:
        print(row['Symbol'])

AA
AIG
AXP
BA
C
CAT


## 写入CSV数据，用元组序列创建

In [5]:
# Column titles, written as the first line of the output file.
headers = ['Symbol', 'Price', 'Data', 'Time', 'Change', 'Volume']
# One tuple per record, with values in the same order as `headers`.
rows = [
    ("AA", 39.48, "6/11/2007", "9:36am", -0.18, 181800),
    ("AIG", 71.38, "6/11/2007", "9:36am", -0.15, 195500),
    ("AXP", 62.58, "6/11/2007", "9:36am", -0.46, 935000),
    ("BA", 98.31, "6/11/2007", "9:36am", +0.12, 104800),
    ("C", 53.08, "6/11/2007", "9:36am", -0.25, 360900),
    ("CAT", 78.29, "6/11/2007", "9:36am", -0.23, 225400),
]

# newline='' is required when writing: csv.writer emits its own '\r\n' line
# terminator, and without it platforms that translate '\n' on write would
# produce '\r\r\n', which shows up as a blank line after every record.
with open('stock1.csv', 'w', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)
    f_csv.writerows(rows)

## 写入CSV数据，使用字典序列

In [6]:
# Column titles; DictWriter uses them both as the header line and as the
# keys it looks up in every row dictionary.
headers = ['Symbol', 'Price', 'Data', 'Time', 'Change', 'Volume']
rows = [{'Symbol':"AA", 'Price':39.48, 'Data':"6/11/2007", 'Time':"9:36am", 'Change':-0.18, 'Volume':180}]
# newline='' is required when writing: the csv writer emits its own '\r\n'
# line terminator, and without it platforms that translate '\n' on write
# would produce '\r\r\n', which shows up as a blank line after every record.
with open('stock2.csv', 'w', newline='') as f:
    f_csv = csv.DictWriter(f, headers)
    f_csv.writeheader()
    f_csv.writerows(rows)

## 将CSV编码微调为其他的格式（例如修改分隔符）

读取以空格分隔的数据

In [7]:
%%writefile test.csv
a b c
1 2 3

Overwriting test.csv


In [8]:
# Read a file whose fields are separated by a single space instead of a
# comma by overriding the reader's delimiter.
# newline='' hands line-ending handling to the csv module, as the csv
# documentation recommends for any file object passed to csv.reader.
with open('test.csv', newline='') as f:
    f_csv = csv.reader(f, delimiter=' ')
    for row in f_csv:
        print(row)

['a', 'b', 'c']
['1', '2', '3']


## 读取CSV数据并转换为命名元组时，验证标题列

In [9]:
%%writefile test.csv
Street Address,Num-Premises,Latitude,Longitude
5412 N CLARK,10,41.123123,-87.2314

Overwriting test.csv


以上文件标题列含有非法的标识符

使用正则表达式整理标题，对非法的标识符进行正则替换

In [10]:
import re
# Build namedtuple rows from a file whose header names are not valid Python
# identifiers by rewriting the offending characters first.
# newline='' hands line-ending handling to the csv module, as the csv
# documentation recommends for any file object passed to csv.reader.
with open('test.csv', newline='') as f:
    f_csv = csv.reader(f)
    # Replace every character that is not an ASCII letter, digit or
    # underscore with '_'. Digits are kept on purpose: stripping them would
    # turn headers such as 'Col1' and 'Col2' into the same name and make
    # namedtuple() fail on the duplicate field.
    headers = [re.sub(r'[^a-zA-Z0-9_]', '_', h) for h in next(f_csv)]
    Row = namedtuple('ROW', headers)
    for r in f_csv:
        row = Row(*r)
        print(row.Num_Premises)

10


In [11]:
import re
# Demonstrate what happens when the raw header row is handed to namedtuple
# without sanitizing it: headers that are not valid identifiers are rejected.
# newline='' hands line-ending handling to the csv module, as the csv
# documentation recommends for any file object passed to csv.reader.
with open('test.csv', newline='') as f:
    f_csv = csv.reader(f)
    headers = next(f_csv)
    try:
        Row = namedtuple('ROW', headers)
    except ValueError as exc:
        # The failure is the point of this cell, so report it in the usual
        # "ExceptionName: message" form instead of aborting the notebook run.
        print('{}: {}'.format(type(exc).__name__, exc))
    else:
        for r in f_csv:
            row = Row(*r)
            print(row.Num_Premises)

ValueError: Type names and field names must be valid identifiers: 'Street Address'

## CSV模块不会尝试去解释数据，或者将数据转化为除了字符串以外的类型。如果这样的转换很重要，那么我们需要自行处理。

In [12]:
# One converter per column, listed in the same order as the fields of each
# record in stocks.csv.
col_types = [str, float, str, str, float, int]
# newline='' hands line-ending handling to the csv module, as the csv
# documentation recommends for any file object passed to csv.reader.
with open('stocks.csv', newline='') as f:
    f_csv = csv.reader(f)
    headers = next(f_csv)  # skip the header row; it must not be converted
    for row in f_csv:
        # Pair each converter with the string found in its column and apply
        # it; a value the converter cannot parse raises ValueError here.
        row = tuple(convert(value) for convert, value in zip(col_types, row))
        print(row[1])
    

39.48
71.38
62.58
98.31
53.08
78.29


## 以字典方式读取数据，并只转换选中的字段

In [13]:
# Columns that need a non-string type, each paired with the callable that
# converts its text value.
field_type = [('Price', float), ('Change', float), ('Volume', int)]
# newline='' hands line-ending handling to the csv module, as the csv
# documentation recommends for any file object passed to a reader.
with open('stocks.csv', newline='') as f:
    for row in csv.DictReader(f):
        # Overwrite only the listed keys with their converted values; every
        # other field in the row stays a string.
        row.update((key, conversion(row[key])) for key, conversion in field_type)
        print(row)

OrderedDict([('Symbol', 'AA'), ('Price', 39.48), ('Data', '6/11/2007'), ('Time', '9:36am'), ('Change', -0.18), ('Volume', 181800)])
OrderedDict([('Symbol', 'AIG'), ('Price', 71.38), ('Data', '6/11/2007'), ('Time', '9:36am'), ('Change', -0.15), ('Volume', 195500)])
OrderedDict([('Symbol', 'AXP'), ('Price', 62.58), ('Data', '6/11/2007'), ('Time', '9:36am'), ('Change', -0.46), ('Volume', 935000)])
OrderedDict([('Symbol', 'BA'), ('Price', 98.31), ('Data', '6/11/2007'), ('Time', '9:36am'), ('Change', 0.12), ('Volume', 104800)])
OrderedDict([('Symbol', 'C'), ('Price', 53.08), ('Data', '6/11/2007'), ('Time', '9:36am'), ('Change', -0.25), ('Volume', 360900)])
OrderedDict([('Symbol', 'CAT'), ('Price', 78.29), ('Data', '6/11/2007'), ('Time', '9:36am'), ('Change', -0.23), ('Volume', 225400)])


这样的数据转换一定要非常小心。一般来说，现实世界的CSV文件可能会缺少某些值或者数据损坏，还可能出现其他会使类型转换失败的情况，因此需要加上适当的异常处理代码。

**最后，如果我们的目标是通过读取CSV数据来进行数据分析和统计，那么应该看看Pandas这个Python包（http://pandas.pydata.org ）**

pandas中有一个方便的函数pandas.read_csv(),能够将csv数据加载到DataFrame对象中，之后可以对数据做各种高级操作。