# Pandas 数据结构

- 一个用于分析结构化数据的强大工具集
- 基础是 NumPy

In [13]:
import pandas as pd
import numpy as np

## Series

In [3]:
# Build a Series from the integers 10..19 (a range, not a literal list);
# no index is passed, so pandas supplies the default 0..9 RangeIndex.
ser_obj = pd.Series(range(10, 20))

# Show the type of the object that was just created.
print(type(ser_obj))

<class 'pandas.core.series.Series'>


In [4]:
# Print the underlying data (.values) and then the index (.index),
# one per line.
print(ser_obj.values, ser_obj.index, sep="\n")

[10 11 12 13 14 15 16 17 18 19]
RangeIndex(start=0, stop=10, step=1)


In [5]:
# Preview the data: show only the first three entries.
print(ser_obj.head(n=3))

0    10
1    11
2    12
dtype: int32


In [7]:
# Look up individual values by their index label.
for label in (0, 8):
    print(ser_obj[label])

10
18


In [12]:
# Array-style operations keep the index-to-value pairing in their result:
# both the scaled Series and the filtered Series carry the original labels.
doubled = ser_obj * 2
above_15 = ser_obj[ser_obj > 15]
print(doubled)
print(above_15)

0    20
1    22
2    24
3    26
4    28
5    30
6    32
7    34
8    36
9    38
dtype: int32
6    16
7    17
8    18
9    19
dtype: int32


In [10]:
# Build a Series from a dict: the keys become the index labels and the
# dict values become the data.
year_data = {
    2001: 17.8,
    2002: 20.1,
    2003: 16.5,
}
ser_obj2 = pd.Series(year_data)
print(ser_obj2)

2001    17.8
2002    20.1
2003    16.5
dtype: float64


In [11]:
# The `name` attribute: both the index and the Series itself can be named.
# The index name is printed above the labels and the Series name appears
# in the footer of the printed output.
ser_obj2.index.name = 'year'
ser_obj2.name = 'temp'
print(ser_obj2.head())

year
2001    17.8
2002    20.1
2003    16.5
Name: temp, dtype: float64


## DataFrame

In [15]:
# Build a DataFrame from a 5x4 ndarray of random normal samples.
# No row or column labels are given, so both axes get default integer labels.
# NOTE: no seed is set, so the numbers differ on every run.
array = np.random.randn(5, 4)
df_obj = pd.DataFrame(array)

# Print the raw array first, then the DataFrame wrapping it, for comparison.
print(array)
print(df_obj)

[[-0.11376364  0.25226737 -0.65904278 -0.15669658]
 [ 0.33697774 -0.54559189  1.5196303  -0.88137892]
 [-0.63776002 -0.05550711 -0.29385835  0.00727016]
 [-1.39622271 -0.63804359 -0.73287162 -0.26950658]
 [ 1.07647488  0.10686802  0.56679886  1.27693586]]
          0         1         2         3
0 -0.113764  0.252267 -0.659043 -0.156697
1  0.336978 -0.545592  1.519630 -0.881379
2 -0.637760 -0.055507 -0.293858  0.007270
3 -1.396223 -0.638044 -0.732872 -0.269507
4  1.076475  0.106868  0.566799  1.276936


In [21]:
# Build a DataFrame from a dict: each key becomes a column.
# Scalar entries ('A', 'B', 'F') are repeated on every row, while the
# array-like entries ('C', 'D', 'E') each supply four values.
dict_data = {
    'A': 1.,
    'B': pd.Timestamp('20180104'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["Python", "Java", "C++", "C#"]),
    'F': 'Skye',
}
df_obj2 = pd.DataFrame(dict_data)
# A bare trailing expression lets the notebook render the frame as a table.
df_obj2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2018-01-04,1.0,3,Python,Skye
1,1.0,2018-01-04,1.0,3,Java,Skye
2,1.0,2018-01-04,1.0,3,C++,Skye
3,1.0,2018-01-04,1.0,3,C#,Skye


In [22]:
# Select a single column by its label; the result is a Series.
column_a = df_obj2['A']
print(column_a)
print(type(column_a))

# The same column is also reachable through attribute access.
print(df_obj2.A)

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64
<class 'pandas.core.series.Series'>
0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64


In [23]:
# Add a column: assigning to a new label creates column 'G',
# computed element-wise from the existing column 'D'.
df_obj2['G'] = df_obj2.D + 4
# Display the frame with the new column.
df_obj2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2018-01-04,1.0,3,Python,Skye,7
1,1.0,2018-01-04,1.0,3,Java,Skye,7
2,1.0,2018-01-04,1.0,3,C++,Skye,7
3,1.0,2018-01-04,1.0,3,C#,Skye,7


In [24]:
# Delete a column: `del` removes column 'G' from the DataFrame in place.
del df_obj2['G']
df_obj2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2018-01-04,1.0,3,Python,Skye
1,1.0,2018-01-04,1.0,3,Java,Skye
2,1.0,2018-01-04,1.0,3,C++,Skye
3,1.0,2018-01-04,1.0,3,C#,Skye


## 索引对象 Index

In [25]:
# Show the concrete Index class behind the Series and behind the DataFrame.
for index_obj in (ser_obj.index, df_obj2.index):
    print(type(index_obj))

# Print the DataFrame's index object itself.
print(df_obj2.index)

<class 'pandas.core.indexes.range.RangeIndex'>
<class 'pandas.core.indexes.numeric.Int64Index'>
Int64Index([0, 1, 2, 3], dtype='int64')


In [26]:
# Index objects are immutable: item assignment raises TypeError.
# The error is caught and printed, rather than left to propagate, so that
# this demonstration does not abort a "Restart Kernel -> Run All".
try:
    df_obj2.index[0] = 2
except TypeError as exc:
    print("{}: {}".format(type(exc).__name__, exc))

TypeError: Index does not support mutable operations