### Pandas 数据统计函数
- 汇总类统计
- 唯一去重和按值计数
- 相关系数和协方差

In [2]:
import pandas as pd

In [3]:
# Read the CSV file into a DataFrame
fpath = './datas/beijing_tianqi/beijing_tianqi_2018.csv'
df = pd.read_csv(fpath)

In [4]:
# Preview the first three rows of the raw data
df.head(3)

Unnamed: 0,ymd,bWendu,yWendu,tianqi,fengxiang,fengli,aqi,aqiInfo,aqiLevel
0,2018-01-01,3℃,-6℃,晴~多云,东北风,1-2级,59,良,2
1,2018-01-02,2℃,-5℃,阴~多云,东北风,1-2级,49,优,1
2,2018-01-03,2℃,-5℃,多云,北风,1-2级,28,优,1


In [5]:
# Strip the trailing '℃' unit from both temperature columns and convert them to int64.
# Use plain column assignment (df[col] = ...) for both columns: since pandas 2.0,
# `df.loc[:, col] = values` writes into the existing object-dtype column in place,
# so the column would keep dtype object and be left out of numeric summaries.
# regex=False makes the literal match explicit regardless of the pandas default.
df['bWendu'] = df['bWendu'].str.replace('℃', '', regex=False).astype('int64')
df['yWendu'] = df['yWendu'].str.replace('℃', '', regex=False).astype('int64')

In [6]:
# Preview the first three rows again to check the result of the suffix removal
df.head(3)

Unnamed: 0,ymd,bWendu,yWendu,tianqi,fengxiang,fengli,aqi,aqiInfo,aqiLevel
0,2018-01-01,3,-6,晴~多云,东北风,1-2级,59,良,2
1,2018-01-02,2,-5,阴~多云,东北风,1-2级,49,优,1
2,2018-01-03,2,-5,多云,北风,1-2级,28,优,1


#### 汇总类统计

In [7]:
# Show all summary statistics at once; only numeric columns are summarized
df.describe()

Unnamed: 0,bWendu,yWendu,aqi,aqiLevel
count,365.0,365.0,365.0,365.0
mean,18.665753,8.358904,82.183562,2.090411
std,11.858046,11.755053,51.936159,1.029798
min,-5.0,-12.0,21.0,1.0
25%,8.0,-3.0,46.0,1.0
50%,21.0,8.0,69.0,2.0
75%,29.0,19.0,104.0,3.0
max,38.0,27.0,387.0,6.0


In [9]:
# Summarize the object (non-numeric) columns instead: count, unique, top, freq
df.describe(include='object')

Unnamed: 0,ymd,tianqi,fengxiang,fengli,aqiInfo
count,365,365,365,365,365
unique,365,31,8,6,6
top,2018-01-01,晴,南风,1-2级,良
freq,1,101,92,236,150


In [10]:
# Inspect the statistics of a single Series: mean of bWendu
df['bWendu'].mean()

18.665753424657535

In [11]:
# Maximum value of the bWendu column
df['bWendu'].max()

38

In [12]:
# Minimum value of the bWendu column
df['bWendu'].min()

-5

#### 非数值唯一去重和按值统计

In [13]:
# Distinct values (duplicates removed) of the fengxiang column
df['fengxiang'].unique()

array(['东北风', '北风', '西北风', '西南风', '南风', '东南风', '东风', '西风'], dtype=object)

In [14]:
# Distinct values of the tianqi column
df['tianqi'].unique()

array(['晴~多云', '阴~多云', '多云', '阴', '多云~晴', '多云~阴', '晴', '阴~小雪', '小雪~多云',
       '小雨~阴', '小雨~雨夹雪', '多云~小雨', '小雨~多云', '大雨~小雨', '小雨', '阴~小雨',
       '多云~雷阵雨', '雷阵雨~多云', '阴~雷阵雨', '雷阵雨', '雷阵雨~大雨', '中雨~雷阵雨', '小雨~大雨',
       '暴雨~雷阵雨', '雷阵雨~中雨', '小雨~雷阵雨', '雷阵雨~阴', '中雨~小雨', '小雨~中雨', '雾~多云',
       '霾'], dtype=object)

In [16]:
# Count how often each distinct value occurs.
# A cell only renders its last expression, so the first result is shown explicitly
# with display(); otherwise the tianqi counts would be computed and silently discarded.
display(df['tianqi'].value_counts())
df['fengxiang'].value_counts()

南风     92
西南风    64
北风     54
西北风    51
东南风    46
东北风    38
东风     14
西风      6
Name: fengxiang, dtype: int64

#### 相关系数和协方差
用途：  
      1.两只股票，是不是同涨同跌，程度多大，正相关还是负相关？  
      2.产品的销量波动，与哪些因素正相关、负相关？

来自知乎：  
    1.协方差：**衡量同向反向的程度**，如果协方差为正，说明X,Y同向变化，协方差越大说明同向程度越高  
    2.相关系数：特殊的协方差，即除以X、Y的标准差，是标准化后的协方差，**衡量相似度程度**；相关系数越接近1，说明两个变量的正向相似度越大

In [17]:
# Covariance matrix of the numeric columns.
# The non-numeric columns (dates, weather text, ...) are dropped explicitly: since
# pandas 2.0 DataFrame.cov() no longer skips them by default and raises on string
# data. select_dtypes works on both older and newer pandas versions.
df.select_dtypes(include='number').cov()

Unnamed: 0,bWendu,yWendu,aqi,aqiLevel
bWendu,140.613247,135.529633,47.462622,0.879204
yWendu,135.529633,138.181274,16.186685,0.264165
aqi,47.462622,16.186685,2697.364564,50.749842
aqiLevel,0.879204,0.264165,50.749842,1.060485


In [18]:
# Correlation-coefficient matrix of the numeric columns.
# The non-numeric columns are dropped explicitly: since pandas 2.0 DataFrame.corr()
# no longer skips them by default and raises on string data. select_dtypes works
# on both older and newer pandas versions.
df.select_dtypes(include='number').corr()

Unnamed: 0,bWendu,yWendu,aqi,aqiLevel
bWendu,1.0,0.972292,0.077067,0.071999
yWendu,0.972292,1.0,0.026513,0.021822
aqi,0.077067,0.026513,1.0,0.948883
aqiLevel,0.071999,0.021822,0.948883,1.0


In [19]:
# Look at the correlation between air quality (aqi) and the highest temperature (bWendu) on its own
df['aqi'].corr(df['bWendu'])

0.07706705916811069

In [20]:
# Correlation between aqi and yWendu
df['aqi'].corr(df['yWendu'])

0.02651328267296889

In [21]:
# Relationship between air quality and the temperature difference (bWendu - yWendu)
df['aqi'].corr(df['bWendu']-df['yWendu'])
# This shows that feature engineering is very important for machine learning

0.2165225757638205