## Pandas 学习笔记（博客）

In [1]:
# 参考: https://blog.csdn.net/Refrain__WG/article/details/88678303
# TODO: 整理 xmind 思维导图

## Pandas 数据结构

### Series -- 一维数据

In [4]:
# pandas data structures: Series (1-D), DataFrame (2-D), and MultiIndex for
# higher-dimensional data.
# Series -- one-dimensional labelled data.
import pandas as pd

# No index is supplied, so pandas assigns the default integer labels 0..3.
obj = pd.Series(data=[4, 7, -5, 6])
obj

0    4
1    7
2   -5
3    6
dtype: int64

In [12]:
# Supply `index=` to attach custom labels in place of the default integers.
obj2 = pd.Series(
    data=[4, 7, -5, 6],
    index=['a', 'b', 'c', 'd'],
)
obj2

a    4
b    7
c   -5
d    6
dtype: int64

In [8]:
# .values exposes the Series data as a NumPy array (labels are not included).
obj.values

array([ 4,  7, -5,  6])

In [14]:
# With no explicit labels, the index is a RangeIndex (0..3 here).
obj.index

RangeIndex(start=0, stop=4, step=1)

In [13]:
# Custom string labels are held in a plain Index.
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [16]:
# Select a single value by its label.
obj2['d']

6

In [17]:
# A list of labels selects several entries and returns a Series.
obj2[['a', 'b']]

a    4
b    7
dtype: int64

In [18]:
# Boolean mask: keep only the entries whose value is positive; labels are preserved.
obj2[obj2 > 0]

a    4
b    7
d    6
dtype: int64

In [19]:
# Scalar arithmetic is applied element-wise; the index is unchanged.
obj2 * 2

a     8
b    14
c   -10
d    12
dtype: int64

In [33]:
# A Series' index can be replaced by assigning a new list of labels to .index.
# This changes `obj` in place, so any cell run afterwards sees the new labels.
obj.index = ['Bob', 'Steve', 'Jeff', 'Ali']
obj

Bob      4
Steve    7
Jeff    -5
Ali      6
dtype: int64

In [21]:
import numpy as np
# Element-wise e**x (e ≈ 2.71828). The result is a float Series that keeps obj2's labels.
np.exp(obj2)

a      54.598150
b    1096.633158
c       0.006738
d     403.428793
dtype: float64

In [22]:
# `in` tests membership among the index labels, like a dict's keys.
'b' in obj2

True

In [23]:
# Build a Series from a dict: keys become the index labels, values the data.
data = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = pd.Series(data=data)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [24]:
# Passing `index=` together with a dict selects and orders entries by those labels:
#   - labels missing from the dict ('California') get NaN,
#   - dict keys not listed ('Utah') are dropped,
#   - the NaN forces the dtype from int64 to float64.
# NOTE: relies on `data` still being the population dict; a later cell rebinds
# `data` to a DataFrame, so run this cell before that one.
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(data, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [26]:
# pd.isnull / pd.notnull flag missing values element-wise.
# isnull returns True where a value is missing (NaN).
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [27]:
# notnull is the complement of isnull: True where a value is present.
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [28]:
# Same check as pd.isnull(obj4), called as a method on the Series.
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [32]:
# Both the Series and its index have a `name` attribute; each appears in the repr
# (index name above the labels, Series name on the last line).
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

### DataFrame  -- 二维数据

In [34]:
# Simulate daily price changes for 10 stocks over 5 days, drawn from a
# standard normal distribution (mean 0, standard deviation 1).
import pandas as pd
import numpy as np

# Use a seeded generator so that Restart & Run All reproduces the same matrix.
# NOTE: outputs captured before the seed was introduced will differ until the
# notebook is re-executed.
SEED = 42
rng = np.random.default_rng(SEED)

stock_change = rng.normal(0, 1, (10, 5))  # rows: 10 stocks, columns: 5 days
stock_change

array([[-0.73457842,  0.57166844, -0.0856376 ,  0.69667265,  0.4365187 ],
       [-0.34046531, -0.11646408, -0.68002591, -0.1335528 , -1.00771834],
       [ 0.7419575 ,  0.39700695, -0.82321464,  0.47205805,  0.32054357],
       [-0.54871943,  0.01615495,  0.90671372, -1.04362143, -0.46346137],
       [-0.55905372,  0.61771197,  0.08786458,  0.80188893, -1.3400693 ],
       [-1.71131334,  0.6300688 ,  1.86108262,  2.54546247,  0.1087077 ],
       [ 0.62033781, -0.60082891, -0.28290567,  0.8012135 , -1.20358139],
       [ 0.11240283, -0.64721729, -0.20679359,  1.29618567,  1.48331288],
       [-2.43833686,  1.98772482, -0.61045156, -2.04470197, -0.16739907],
       [ 0.50075525, -0.55174008, -2.12969016, -0.04282519,  1.25483053]])

In [35]:
# Wrap the raw matrix in a DataFrame by attaching row labels (stock codes)
# and column labels (trading dates).

# Row index: one label per row of the matrix.
stock_code = list(map("股票{}".format, range(len(stock_change))))

# Column index: one business day per column; freq='B' skips weekends.
date = pd.date_range(start='2019-03-01', periods=stock_change.shape[1], freq='B')

data = pd.DataFrame(data=stock_change, index=stock_code, columns=date)
data

Unnamed: 0,2019-03-01 00:00:00,2019-03-04 00:00:00,2019-03-05 00:00:00,2019-03-06 00:00:00,2019-03-07 00:00:00
股票0,-0.734578,0.571668,-0.085638,0.696673,0.436519
股票1,-0.340465,-0.116464,-0.680026,-0.133553,-1.007718
股票2,0.741958,0.397007,-0.823215,0.472058,0.320544
股票3,-0.548719,0.016155,0.906714,-1.043621,-0.463461
股票4,-0.559054,0.617712,0.087865,0.801889,-1.340069
股票5,-1.711313,0.630069,1.861083,2.545462,0.108708
股票6,0.620338,-0.600829,-0.282906,0.801214,-1.203581
股票7,0.112403,-0.647217,-0.206794,1.296186,1.483313
股票8,-2.438337,1.987725,-0.610452,-2.044702,-0.167399
股票9,0.500755,-0.55174,-2.12969,-0.042825,1.254831


In [36]:
# Common DataFrame attributes and methods.
# .shape is a (rows, columns) tuple.
data.shape

(10, 5)

In [37]:
# Row labels.
data.index

Index(['股票0', '股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9'], dtype='object')

In [38]:
# Column labels -- here a DatetimeIndex of business days.
data.columns

DatetimeIndex(['2019-03-01', '2019-03-04', '2019-03-05', '2019-03-06',
               '2019-03-07'],
              dtype='datetime64[ns]', freq='B')

In [39]:
# The underlying 2-D NumPy array, without row or column labels.
data.values

array([[-0.73457842,  0.57166844, -0.0856376 ,  0.69667265,  0.4365187 ],
       [-0.34046531, -0.11646408, -0.68002591, -0.1335528 , -1.00771834],
       [ 0.7419575 ,  0.39700695, -0.82321464,  0.47205805,  0.32054357],
       [-0.54871943,  0.01615495,  0.90671372, -1.04362143, -0.46346137],
       [-0.55905372,  0.61771197,  0.08786458,  0.80188893, -1.3400693 ],
       [-1.71131334,  0.6300688 ,  1.86108262,  2.54546247,  0.1087077 ],
       [ 0.62033781, -0.60082891, -0.28290567,  0.8012135 , -1.20358139],
       [ 0.11240283, -0.64721729, -0.20679359,  1.29618567,  1.48331288],
       [-2.43833686,  1.98772482, -0.61045156, -2.04470197, -0.16739907],
       [ 0.50075525, -0.55174008, -2.12969016, -0.04282519,  1.25483053]])

In [40]:
# First 5 rows.
data.head(5)

Unnamed: 0,2019-03-01 00:00:00,2019-03-04 00:00:00,2019-03-05 00:00:00,2019-03-06 00:00:00,2019-03-07 00:00:00
股票0,-0.734578,0.571668,-0.085638,0.696673,0.436519
股票1,-0.340465,-0.116464,-0.680026,-0.133553,-1.007718
股票2,0.741958,0.397007,-0.823215,0.472058,0.320544
股票3,-0.548719,0.016155,0.906714,-1.043621,-0.463461
股票4,-0.559054,0.617712,0.087865,0.801889,-1.340069


In [41]:
# Last 5 rows.
data.tail(5)

Unnamed: 0,2019-03-01 00:00:00,2019-03-04 00:00:00,2019-03-05 00:00:00,2019-03-06 00:00:00,2019-03-07 00:00:00
股票5,-1.711313,0.630069,1.861083,2.545462,0.108708
股票6,0.620338,-0.600829,-0.282906,0.801214,-1.203581
股票7,0.112403,-0.647217,-0.206794,1.296186,1.483313
股票8,-2.438337,1.987725,-0.610452,-2.044702,-0.167399
股票9,0.500755,-0.55174,-2.12969,-0.042825,1.254831


In [52]:
# Select one column by label; the result is a Series indexed by the row labels.
data['2019-03-01']

股票0   -0.734578
股票1   -0.340465
股票2    0.741958
股票3   -0.548719
股票4   -0.559054
股票5   -1.711313
股票6    0.620338
股票7    0.112403
股票8   -2.438337
股票9    0.500755
Name: 2019-03-01 00:00:00, dtype: float64

In [55]:
# Select one row by label; the result is a Series indexed by the column labels.
data.loc['股票1']

2019-03-01   -0.340465
2019-03-04   -0.116464
2019-03-05   -0.680026
2019-03-06   -0.133553
2019-03-07   -1.007718
Freq: B, Name: 股票1, dtype: float64

In [57]:
# .loc takes (row label, column label): the row comes first, then the column.
data.loc['股票1', '2019-03-01']

-0.34046531399978786

In [59]:
# Chained lookup: data[col] selects the column first, then [row] picks the entry.
# The order matters -- plain [] on a DataFrame looks up columns, so data['股票1'] fails.
# Fine for reading; prefer data.loc[row, col] when assigning.
data['2019-03-01']['股票1']

-0.34046531399978786

In [60]:
# 修改数据 -- 赋值操作

In [None]:
# 删除数据 -- df.drop([...], axis=1)   # drop 是 DataFrame 的方法（不是 pd.drop）；axis=1: 删除列, axis=0: 删除行

In [None]:
# 排序 -- sort_values() & sort_index()

## 运算

In [None]:
# 算术运算 & 逻辑运算 & 统计运算

## 文件的读取和存储

In [None]:
# csv 文件： read_csv & to_csv

In [None]:
# hdf 文件：read_hdf & to_hdf

In [None]:
# json 文件：read_json & to_json

## 数据高级处理

In [None]:
# 缺失值

In [None]:
# 离散化

In [None]:
# 合并

In [None]:
# 分组聚合