# 加载数据

In [28]:
import pandas as pd
import numpy as np

In [36]:
weather = pd.read_csv('./DataFolder/PRSA_data.csv')
weather.head()

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
1,2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
2,3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
3,4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
4,5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0


In [37]:
# 数据预处理
date = pd.date_range(start="20100101", periods=weather.shape[0])
weather = pd.DataFrame(np.array(weather), index=date, columns=weather.columns)
weather = weather.drop(["No", "year", "month", "day", "hour", "cbwd", "Is", "Ir"], axis=1)
weather.head()

Unnamed: 0,pm2.5,DEWP,TEMP,PRES,Iws
2010-01-01,,-21,-11.0,1021.0,1.79
2010-01-02,,-21,-12.0,1020.0,4.92
2010-01-03,,-21,-11.0,1019.0,6.71
2010-01-04,,-21,-14.0,1019.0,9.84
2010-01-05,,-20,-12.0,1018.0,12.97


# 索引操作

## 直接使用行列索引（先列后行）

In [41]:
# 获取2010年1月1日4点的“DEWP”值
# 不可以先行后列，会报错
weather["DEWP"]["2010-01-05"]

-20

## 结合loc和iloc进行索引（先行后列）

In [44]:
# 使用loc：索引名称
weather.loc["2021-01-10":"2021-01-15", "Iws"]

2021-01-10    1.79
2021-01-11    0.89
2021-01-12    1.78
2021-01-13    2.67
2021-01-14    3.13
2021-01-15    6.26
Freq: D, Name: Iws, dtype: object

In [46]:
# 使用iloc：索引下标
# 前五行，前五列
weather.iloc[:5, :5]

Unnamed: 0,pm2.5,DEWP,TEMP,PRES,Iws
2010-01-01,,-21,-11.0,1021.0,1.79
2010-01-02,,-21,-12.0,1020.0,4.92
2010-01-03,,-21,-11.0,1019.0,6.71
2010-01-04,,-21,-14.0,1019.0,9.84
2010-01-05,,-20,-12.0,1018.0,12.97


## 通过使用ix进行索引（先行后列）
**早些版本的pandas中可以使用，为了严谨性，新版本的pandas已经不支持ix索引**

In [54]:
# 两种索引方式的混合
# weather.ix[0:4, ("DEWP", "Iws")]
# 推荐使用loc或者iloc进行索引操作
weather.loc[weather.index[11:15], ["TEMP", "PRES", "Iws"]]

Unnamed: 0,TEMP,PRES,Iws
2010-01-12,-5.0,1017.0,34.43
2010-01-13,-5.0,1015.0,37.56
2010-01-14,-3.0,1015.0,40.69
2010-01-15,-2.0,1014.0,43.82


In [55]:
# 使用weather.columns.get_indexer()的方法获取该列索引名称的索引位置
weather.iloc[11:15, weather.columns.get_indexer(["TEMP", "PRES", "Iws"])]

Unnamed: 0,TEMP,PRES,Iws
2010-01-12,-5.0,1017.0,34.43
2010-01-13,-5.0,1015.0,37.56
2010-01-14,-3.0,1015.0,40.69
2010-01-15,-2.0,1014.0,43.82


# 赋值操作

In [59]:
# 将1015赋值给"PRES"所在列的所有数据
weather["PRES"] = 1015
weather.PRES = 1015 # 效果同上
weather

Unnamed: 0,pm2.5,DEWP,TEMP,PRES,Iws
2010-01-01,,-21,-11.0,1015,1.79
2010-01-02,,-21,-12.0,1015,4.92
2010-01-03,,-21,-11.0,1015,6.71
2010-01-04,,-21,-14.0,1015,9.84
2010-01-05,,-20,-12.0,1015,12.97
...,...,...,...,...,...
2129-12-22,8.0,-23,-2.0,1015,231.97
2129-12-23,10.0,-22,-3.0,1015,237.78
2129-12-24,10.0,-22,-3.0,1015,242.7
2129-12-25,8.0,-22,-4.0,1015,246.72


# 排序
* 排序有两种方式：按照**索引排序**以及按照**内容**排序
* 使用sort_values(by= , ascending= )，其中ascending是一个bool值，False代表降序，True代表升序,；默认升序

## 按照内容排序

In [60]:
weather.sort_values(by="PRES", ascending=False).head()

Unnamed: 0,pm2.5,DEWP,TEMP,PRES,Iws
2010-01-01,,-21,-11.0,1015,1.79
2089-12-31,75.0,4,21.0,1015,3.13
2089-12-23,53.0,2,14.0,1015,45.6
2089-12-24,57.0,3,14.0,1015,49.62
2089-12-25,72.0,5,14.0,1015,52.75


In [62]:
# 先按照"DEWP"排序，若是相等，则按照“TEMP”进行排序
weather.sort_values(by=["DEWP", "TEMP"])

Unnamed: 0,pm2.5,DEWP,TEMP,PRES,Iws
2108-03-20,6.0,-40,2.0,1015,104.18
2108-03-19,4.0,-39,2.0,1015,96.13
2108-04-11,6.0,-38,0.0,1015,80.02
2108-03-21,6.0,-38,1.0,1015,113.12
2108-03-17,6.0,-37,1.0,1015,80.03
...,...,...,...,...,...
2047-05-18,235.0,28,31.0,1015,3.58
2023-10-13,244.0,28,32.0,1015,3.13
2047-05-16,206.0,28,32.0,1015,0.89
2047-06-05,337.0,28,32.0,1015,4.92


## 按照索引排序

In [64]:
# 这里就是按照日期排序
weather.sort_index()

Unnamed: 0,pm2.5,DEWP,TEMP,PRES,Iws
2010-01-01,,-21,-11.0,1015,1.79
2010-01-02,,-21,-12.0,1015,4.92
2010-01-03,,-21,-11.0,1015,6.71
2010-01-04,,-21,-14.0,1015,9.84
2010-01-05,,-20,-12.0,1015,12.97
...,...,...,...,...,...
2129-12-22,8.0,-23,-2.0,1015,231.97
2129-12-23,10.0,-22,-3.0,1015,237.78
2129-12-24,10.0,-22,-3.0,1015,242.7
2129-12-25,8.0,-22,-4.0,1015,246.72
