In [2]:
import pandas as pd
import numpy as np

# Series
Series可以看成一个定长的有序字典，能够保存任何数据类型;
最重要的一个功能：会根据运算的索引标签自动对齐数据；类似join的操作
## 数据访问 索引

In [28]:
# Build a Series with explicit string labels (without `index=` the labels default to 0..n-1).
s = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
print(s.index, s.values)
# Label-based access; .get() returns the supplied default when the label is missing.
print("通过索引获取", s['a'], s.get('e', np.nan))
# Position-based access goes through .iloc: plain s[[3, 2]] on a string-labelled
# Series relies on a deprecated positional fallback of Series.__getitem__.
print("通过下标获取\n", s.iloc[[3, 2]])
# Boolean-mask selection: keep the entries whose value is positive.
print("通过条件获取\n", s[s.values > 0])
s["e"] = 100  # assigning to a label that does not exist yet appends a new entry
s.rename("demo")  # returns a renamed copy for display; `s` itself is not modified

Index(['d', 'b', 'a', 'c'], dtype='object') [ 4  7 -5  3]
通过索引获取 -5 nan
通过下标获取
 c    3
a   -5
dtype: int64
通过条件获取
 d    4
b    7
c    3
dtype: int64


d      4
b      7
a     -5
c      3
e    100
Name: demo, dtype: int64

## 运算

In [27]:
s = pd.Series(data=[4, 7, -5, 3], index=list('dbac'))
print(2 * s)       # scalar arithmetic is applied element-wise
print(np.exp(s))   # NumPy ufuncs work element-wise and keep the labels
# Arithmetic between two Series aligns on labels, not positions: only 'b' and 'a'
# occur in both slices, so 'c' and 'd' come out as NaN (and the result is float).
print(s.iloc[1:] + s.iloc[:-1])

d     8
b    14
a   -10
c     6
dtype: int64
d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64
a   -10.0
b    14.0
c     NaN
d     NaN
dtype: float64


## 时间和日期
常用频率别名：H 小时、T（或 min）分钟、S 秒、L（或 ms）毫秒、D 天、W 周、M 月、Q 季度。注意：pandas 2.2 起 H/T/S/L 已弃用，请改用小写的 h/min/s/ms；M、Q 请改用 ME、QE（月末、季末）。

In [45]:
# Five timestamps starting at 2011-01-01, spaced two days apart (freq='2D').
print(pd.date_range('1/1/2011', periods=5, freq='2D'))

DatetimeIndex(['2011-01-01', '2011-01-03', '2011-01-05', '2011-01-07',
               '2011-01-09'],
              dtype='datetime64[ns]', freq='2D')


# DataFrame
一个表格型的数据结构

## 数据获取
### 从文件中获取

In [5]:
# Load a DataFrame from a CSV file.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which suggests the
# file carries a saved index column — consider index_col=0; confirm against the file.
data = pd.read_csv('../data/Income1.csv')
data

Unnamed: 0.1,Unnamed: 0,Education,Income
0,1,10.0,26.658839
1,2,10.401338,27.306435
2,3,10.842809,22.13241
3,4,11.244147,21.169841
4,5,11.645485,15.192634
5,6,12.086957,26.398951
6,7,12.488294,17.435307
7,8,12.889632,25.507885
8,9,13.29097,36.884595
9,10,13.732441,39.666109


### 从字典中获取

In [43]:
# Column data keyed by column name; every list holds one value per row.
d1 = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
# `columns` fixes the column order; 'date' is not a key of d1, so it starts out empty (NaN).
df = pd.DataFrame(
    data=d1,
    index='one two three four five six'.split(),
    columns=['year', 'state', 'pop', 'date'],
)
# Fill the placeholder column with six dates spaced two days apart.
df["date"] = pd.date_range(start="20200808", periods=6, freq="2D")
df

Unnamed: 0,year,state,pop,date
one,2000,Ohio,1.5,2020-08-08
two,2001,Ohio,1.7,2020-08-10
three,2002,Ohio,3.6,2020-08-12
four,2001,Nevada,2.4,2020-08-14
five,2002,Nevada,2.9,2020-08-16
six,2003,Nevada,3.2,2020-08-18


## 数据访问
和Series类似，和二维数组切片一致

### 列操作

In [None]:
# Six consecutive daily timestamps, used as the row index.
dates = pd.date_range(start='20130101', periods=6)
# The integers 0..23 laid out row by row as a 6x4 grid with columns A-D.
df = pd.DataFrame(
    data=np.arange(24).reshape(6, 4),
    columns=['A', 'B', 'C', 'D'],
    index=dates,
)
df

In [None]:
# Select one column: a bare label returns a Series; wrapping labels in a list
# (df[[...]]) selects several columns and returns a DataFrame.
df["A"]

In [None]:
df[["A", "B"]]  # select several columns at once

In [22]:
# Add a column by assigning a list with one value per row.
df['E'] = 'one two three four five six'.split()
# Derive a column from existing ones (element-wise sum of A and B).
df["F"] = df["A"].add(df["B"])
# insert() places a new column at a given position (0 = leftmost).
df.insert(loc=0, column="第一列", value=['one', 'two', 'three', 'four', 'five', "six"])

In [24]:
del df["E"]  # delete a column in place
df.pop("F")  # remove a column and return it (the returned Series is discarded here)
df = df.drop("第一列", axis=1)  # drop by label along an axis: axis=1 removes a column, axis=0 a row

### 行操作

In [None]:
df[1:3]  # rows at positions 1 and 2 (half-open slice: the end is excluded)

In [None]:
df.loc['20130102']  # select a row by label; a single row comes back as a Series

In [None]:
df['20130102':'20130104']  # rows for Jan 2, 3 and 4: label slices include both endpoints

In [None]:
# Add a row by assigning to a label that does not exist yet (setting with enlargement).
# The label is a Timestamp rather than an arbitrary string so the index stays a
# DatetimeIndex; a string such as "new_row" turns it into an object index, which
# breaks the date-string slicing used by the cells that follow.
new_label = pd.Timestamp('20130107')
df.loc[new_label] = [n + 5 for n in range(4)]
# Remove the row again; axis=0 (rows) is the default for drop().
df = df.drop(new_label, axis=0)

### 选择行和列

In [None]:
df.iloc[:3, -2:]  # positional selection: first three rows, last two columns

In [None]:
# Positional selection with explicit lists: rows at positions 1, 2, 4 and columns at positions 0, 2.
df.iloc[[1, 2, 4], [0, 2]]

In [None]:
# Label-based selection: the rows in the date slice '20130102':'20130104', columns A and B.
df.loc['20130102':'20130104', ['A', 'B']]

### 按条件选择

In [None]:
df[df.loc[:, "A"] > 0]  # keep only the rows whose value in column A is greater than 0

## 方法和属性

In [None]:
# Recreate the 6x4 demo frame (values 0..23, daily DatetimeIndex, columns A-D).
dates = pd.date_range(start='20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape(-1, 4), columns=list('ABCD'), index=dates)
df

In [40]:
print(df.size, df.shape, df.ndim)  # number of elements, shape, number of dimensions

24 (6, 4) 2


In [None]:
# Row index, column labels, underlying NumPy data and per-column dtypes, one per line.
print(df.index, df.columns, df.values, df.dtypes, sep="\n")

In [None]:
# Non-null values per row (axis=1); count() defaults to axis=0, i.e. per column.
print(df.count(axis=1))
# Frequency of each distinct value in one column. This frame only has columns
# A-D; the former "pop" lookup referred to a column of an earlier example frame
# and raised KeyError here.
print(df["A"].value_counts())

### 统计
常用函数：
1 	count()	    Number of non-null observations
2 	sum()	    Sum of values
3 	mean()	    Mean of Values
4 	median()	Median of Values
5 	mode()	    Mode of values
6 	std()	    Standard Deviation of the Values
7 	min()	    Minimum Value
8 	max()	    Maximum Value
9 	abs()	    Absolute Value
10	prod()	    Product of Values
11	cumsum()	Cumulative Sum
12	cumprod()	Cumulative Product

In [57]:
# Twelve people, each with a name, an age and a rating.
d = {
    'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith',
                       'Jack', 'Lee', 'David', 'Gasper', 'Betina', 'Andres']),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23, 34, 40, 30, 51, 46]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6,
                         3.8, 3.78, 2.98, 4.80, 4.10, 3.65]),
}
# Keep only the two numeric columns (Age, Rating) for the statistics that follow.
df = pd.DataFrame(d).loc[:, ['Age', 'Rating']]

In [58]:
df.sum(0)  # column totals; axis 0 is the default

Age       382.00
Rating     44.92
dtype: float64

### 对元素应用函数

In [None]:
# apply() with axis=0 calls the function once per column; here it yields each column's mean.
df.apply(np.mean, axis=0)

In [61]:
# Value range (max - min) of each column; `x` is one column at a time.
df.apply(lambda x: x.max() - x.min())

Age       28.00
Rating     2.24
dtype: float64

In [63]:
# Add 100 to every single element.
# NOTE(review): applymap is deprecated in newer pandas (2.1+) in favour of DataFrame.map —
# confirm the target pandas version before switching.
df.applymap(lambda x: x + 100)

Unnamed: 0,Age,Rating
0,125,104.23
1,126,103.24
2,125,103.98
3,123,102.56
4,130,103.2
5,129,104.6
6,123,103.8
7,134,103.78
8,140,102.98
9,130,104.8


### 迭代
迭代只适用于读取数据：迭代器返回的是原始对象的副本（而不是视图），因此对迭代结果所做的修改不会反映到原始对象上

In [None]:
# 4x3 frame of random samples; no seed is set here, so the values differ between runs.
df = pd.DataFrame(np.random.randn(4, 3), columns=['col1', 'col2', 'col3'])

In [None]:
# Iterate column by column: each step yields (column name, column as a Series).
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
for key, value in df.items():
    print(key, value, sep="\n")

In [None]:
# Iterate row by row: each step yields the row's index label and the row itself.
for row_index, row in df.iterrows():
    print(row_index, row, sep="\n")

In [None]:
# Iterate over rows as named tuples: itertuples() yields one named tuple per row,
# whose first element is the row's index value and whose remaining elements are the row's values.
for row in df.itertuples():
    print(row)

### 排序

In [None]:
# 10x2 random frame whose row labels and column names are deliberately out of order,
# so the sorting calls below have something to rearrange.
df = pd.DataFrame(np.random.randn(10, 2), index=[1, 4, 6, 2, 3, 5, 9, 8, 0, 7], columns=['col2', 'col1'])
df

In [None]:
df.sort_index(axis=0, ascending=False)  # sort rows by index label, descending

In [None]:
# Sort rows by value: col1 first, ties broken by col2. The default `kind` is quicksort.
# NOTE(review): the pandas docs say `kind` is only applied when sorting on a single
# column, so it may have no effect with two `by` columns — confirm.
df.sort_values(by=['col1', 'col2'], kind="mergesort")

## 分组 groupby

In [13]:
# Toy league table: one row per (team, year) with its rank and points.
ipl_data = dict(
    Team=['Riders', 'Riders', 'Devils', 'Devils', 'Kings', 'Kings',
          'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
    Rank=[1, 2, 2, 3, 3, 4, 1, 1, 2, 4, 1, 2],
    Year=[2014, 2015, 2014, 2015, 2014, 2015, 2016, 2017, 2016, 2014, 2015, 2017],
    Points=[876, 789, 863, 673, 741, 812, 756, 788, 694, 701, 804, 690],
)
df = pd.DataFrame(ipl_data)
# Split the rows by team; a list such as ['Team', 'Year'] would group on several keys.
grouped = df.groupby(by='Team')
# Iterating a GroupBy yields (group key, sub-frame) pairs.
for team, members in grouped:
    print(team, members, sep="\n")

# .groups is a dict-like mapping from each group key to the row labels in that group.
groups = grouped.groups
for team, row_labels in groups.items():
    print(team, row_labels)

Devils
     Team  Rank  Year  Points
2  Devils     2  2014     863
3  Devils     3  2015     673
Kings
    Team  Rank  Year  Points
4  Kings     3  2014     741
5  Kings     4  2015     812
6  Kings     1  2016     756
7  Kings     1  2017     788
Riders
      Team  Rank  Year  Points
0   Riders     1  2014     876
1   Riders     2  2015     789
8   Riders     2  2016     694
11  Riders     2  2017     690
Royals
      Team  Rank  Year  Points
9   Royals     4  2014     701
10  Royals     1  2015     804
Devils Int64Index([2, 3], dtype='int64')
Kings Int64Index([4, 5, 6, 7], dtype='int64')
Riders Int64Index([0, 1, 8, 11], dtype='int64')
Royals Int64Index([9, 10], dtype='int64')


In [12]:
print(grouped.get_group("Kings"))  # select a single group by its key
# Aggregate with function names instead of NumPy callables: passing np.sum/np.mean/np.std
# to agg is deprecated in newer pandas, while the string names give the same result.
print(grouped['Points'].agg('mean'))
print(grouped['Points'].agg(['sum', 'mean', 'std']))  # several aggregations at once
print(grouped.transform(lambda x: (x * 10)))  # transform: result keeps the original row labels
print(grouped.filter(lambda x: len(x) >= 3))  # filter: keep only groups with at least 3 rows

    Team  Rank  Year  Points
4  Kings     3  2014     741
5  Kings     4  2015     812
6  Kings     1  2016     756
7  Kings     1  2017     788
Team
Devils    768.00
Kings     774.25
Riders    762.25
Royals    752.50
Name: Points, dtype: float64
         sum    mean         std
Team                            
Devils  1536  768.00  134.350288
Kings   3097  774.25   31.899582
Riders  3049  762.25   88.567771
Royals  1505  752.50   72.831998
    Rank   Year  Points
0     10  20140    8760
1     20  20150    7890
2     20  20140    8630
3     30  20150    6730
4     30  20140    7410
5     40  20150    8120
6     10  20160    7560
7     10  20170    7880
8     20  20160    6940
9     40  20140    7010
10    10  20150    8040
11    20  20170    6900
      Team  Rank  Year  Points
0   Riders     1  2014     876
1   Riders     2  2015     789
4    Kings     3  2014     741
5    Kings     4  2015     812
6    Kings     1  2016     756
7    Kings     1  2017     788
8   Riders     2  2016    

### 聚合

In [None]:
# 10x4 frame of the integers 0..39 on a daily DatetimeIndex.
df = pd.DataFrame(np.arange(40).reshape(10, 4),
                  index=pd.date_range('1/1/2000', periods=10),
                  columns=['A', 'B', 'C', 'D'])
print(df)
# Rolling window over 3 consecutive rows; min_periods=1 lets the first rows
# aggregate over a shorter window instead of producing NaN.
r = df.rolling(window=3, min_periods=1)
# Aggregations are given by name ('sum', 'mean') rather than as NumPy callables,
# which newer pandas deprecates; the results are the same.
print("全部元素都进行聚合=====")
print(r.aggregate('sum'))  # rolling sum over every column
print("单个列上应用聚合====")
print(r['A'].aggregate('sum'))
print("多个列上应用聚合=======")
print(r[['A', 'B']].aggregate('sum'))
print("多个列上应用多个函数====")
# Several columns must be selected with a list: the tuple form r['A', "B"] is
# rejected by pandas 2.x.
print(r[['A', 'B']].aggregate(['sum', 'mean']))
print("将不同的函数应用于不同列")
print(r.aggregate({'A': 'sum', 'B': 'mean'}))

## 连接 join

In [14]:
# Two small tables with identical column names, used below to demonstrate merges.
left = pd.DataFrame(dict(
    id=[1, 2, 3, 4, 5],
    Name=['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    subject_id=['sub1', 'sub2', 'sub4', 'sub6', 'sub5'],
))
right = pd.DataFrame(dict(
    id=[1, 2, 3, 4, 5],
    Name=['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    subject_id=['sub2', 'sub4', 'sub3', 'sub6', 'sub5'],
))
print(left, right, sep="\n")

   id    Name subject_id
0   1    Alex       sub1
1   2     Amy       sub2
2   3   Allen       sub4
3   4   Alice       sub6
4   5  Ayoung       sub5
   id   Name subject_id
0   1  Billy       sub2
1   2  Brian       sub4
2   3   Bran       sub3
3   4  Bryce       sub6
4   5  Betty       sub5


In [16]:
# Join on one shared key; clashing non-key columns receive _x / _y suffixes.
print(left.merge(right, on='id'))
# Join on several keys: a row pair must match on every listed column.
print(left.merge(right, on=['id', 'subject_id']))
# `how` selects the join type: 'left', 'right', 'outer', or 'inner' (the default).
print(left.merge(right, how='inner', on='subject_id'))

   id  Name_x subject_id_x Name_y subject_id_y
0   1    Alex         sub1  Billy         sub2
1   2     Amy         sub2  Brian         sub4
2   3   Allen         sub4   Bran         sub3
3   4   Alice         sub6  Bryce         sub6
4   5  Ayoung         sub5  Betty         sub5
   id  Name_x subject_id Name_y
0   4   Alice       sub6  Bryce
1   5  Ayoung       sub5  Betty
   id_x  Name_x subject_id  id_y Name_y
0     2     Amy       sub2     1  Billy
1     3   Allen       sub4     2  Brian
2     4   Alice       sub6     4  Bryce
3     5  Ayoung       sub5     5  Betty


## Concat连接

In [64]:
# Two score tables that share both their columns and their row labels (1..5).
one = pd.DataFrame(
    data=dict(
        Name=['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
        subject_id=['sub1', 'sub2', 'sub4', 'sub6', 'sub5'],
        Marks_scored=[98, 90, 87, 69, 78],
    ),
    index=list(range(1, 6)),
)
two = pd.DataFrame(
    data=dict(
        Name=['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
        subject_id=['sub2', 'sub4', 'sub3', 'sub6', 'sub5'],
        Marks_scored=[89, 80, 79, 97, 88],
    ),
    index=list(range(1, 6)),
)

### 重建索引

In [66]:
# Stack the two frames and renumber the rows 0..n-1. The former keys=['x', 'y']
# argument is dropped: keys only label the pieces in the result index, which
# ignore_index=True discards, so the argument had no effect.
pd.concat([one, two], ignore_index=True)

Unnamed: 0,Name,subject_id,Marks_scored
0,Alex,sub1,98
1,Amy,sub2,90
2,Allen,sub4,87
3,Alice,sub6,69
4,Ayoung,sub5,78
5,Billy,sub2,89
6,Brian,sub4,80
7,Bran,sub3,79
8,Bryce,sub6,97
9,Betty,sub5,88


## 处理缺失值NaN

In [7]:
# 5x3 frame of the integers 0..14 whose row labels skip some letters.
df = pd.DataFrame(
    data=np.arange(15).reshape(5, 3),
    columns=['one', 'two', 'three'],
    index=list('acefh'),
)
# Reindexing onto a..h creates all-NaN rows for the labels that were missing (b, d, g).
df = df.reindex(index=list('abcdefgh'))
print(df)
# df['one'].isnull()  # test which entries are NaN

    one   two  three
a   0.0   1.0    2.0
b   NaN   NaN    NaN
c   3.0   4.0    5.0
d   NaN   NaN    NaN
e   6.0   7.0    8.0
f   9.0  10.0   11.0
g   NaN   NaN    NaN
h  12.0  13.0   14.0


In [8]:
# sum() skips NaN, i.e. missing values count as zero.
# NOTE(review): the original note said an all-NaN column sums to NA; current pandas
# returns 0 unless min_count is set — confirm against the pandas version in use.
df['one'].sum()

30.0

### 用标量值替换NaN

In [None]:
print(df.fillna(0))  # replace every NaN with 0
print(df.replace({np.nan: 111, 10.: 1000, 11.: 1100}))  # replace specific values, NaN included
# ffill()/bfill() replace fillna(method="pad"/"backfill"), whose `method`
# argument is deprecated in newer pandas; the results are the same.
print(df.ffill())  # forward fill: carry the previous valid value down
print(df.bfill())  # backward fill: pull the next valid value up

In [None]:
print(df.dropna())  # drop missing values

# 索引对象
就类似数据库表中的主键id
负责管理轴标签和其他元数据（比如轴名称等）

append 连接另一个Index对象,产生一个新的Index
difference 计算差集,并得到一个Index
intersection 计算交集
union 计算并集
isin 计算一个指示各值是否都包含在参数集合中的布尔型数组
delete 删除索引i处的元素,并得到新的Index
drop 删除传入的值,并得到新的Index
insert 将元素插入到索引i处,并得到新的Index
is_monotonic 当各元素均大于等于前一个元素时,返回True（pandas 2.0 起已移除，请改用 is_monotonic_increasing）
is_unique 当Index没有重复值时,返回True
unique 计算Index中唯一值的数组

In [38]:
# Without an explicit index, a Series gets a default integer index.
s = pd.Series(data=[4, 7, -5, 3])
print(s.index)

RangeIndex(start=0, stop=4, step=1)


## 重新索引
reindex函数的参数：
    index 用作索引的新序列。
    method 插值(填充)方式(如 ffill、bfill),具体参数请参见表5-4
    fill_value 在重新索引的过程中,需要引入缺失值时使用的替代值
    limit 前向或后向填充时的最大填充量
    tolerance 向前或向后填充时,填充不准确匹配项的最大间距(绝对值距离)
    level 在MultiIndex的指定级别上匹配简单索引,否则选取其子集
    copy 默认为True,无论如何都复制;如果为False,则新旧相等就不复制

In [45]:
s = pd.Series(data=[4.5, 7.2, -5.3, 3.6])
# Forward fill while reindexing: the new labels 4 and 5 take the last known value.
# Suited to ordered data such as time series.
print(s.reindex(index=range(6), method="ffill"))

0    4.5
1    7.2
2   -5.3
3    3.6
4    3.6
5    3.6
dtype: float64


# 看到5.2.2节了