# 数据预处理

In [1]:
import numpy as np
import pandas as pd

# 1. 导入数据

In [2]:
# Path to the raw air-quality CSV, relative to the notebook's working directory.
dataPath = "data/AQ/AQ_data.csv"

# Read the file with pandas' default parsing options and preview the first rows.
data = pd.read_csv(filepath_or_buffer=dataPath)
data.head(n=5)

Unnamed: 0,日期,质量等级,AQI指数,当天AQI排名,PM2.5,PM10,So2,No2,Co,O3
0,2014-01-01,轻度污染,149,140,114,161,61,40,1.5,9
1,2014-01-02,中度污染,167,131,134,187,62,40,1.35,11
2,2014-01-03,重度污染,208,144,158,229,56,52,1.62,15
3,2014-01-04,重度污染,205,170,156,226,42,54,1.73,21
4,2014-01-05,重度污染,210,148,160,213,37,46,1.64,16


In [3]:
# (rows, columns) of the frame exactly as loaded, before any cleaning.
data.shape

(2846, 10)

In [4]:
# The CSV body contains stray copies of the header line; those rows carry the
# literal column name 'O3' in the O3 column. Keep every other row, then
# renumber the index from 0 so it is contiguous again.
data = data.loc[data['O3'].ne('O3')].reset_index(drop=True)
data.shape

(2755, 10)

# 2. 添加“年”、“月”、“日”

## 2.1 切分日期

In [5]:
# Break each 'YYYY-MM-DD' date string on '-' into separate parts.
# expand=True returns a DataFrame whose columns are labelled 0, 1, 2.
data_temp = data['日期'].str.split(pat='-', expand=True)
data_temp.head(n=5)

Unnamed: 0,0,1,2
0,2014,1,1
1,2014,1,2
2,2014,1,3
3,2014,1,4
4,2014,1,5


## 2.2 合并数据

In [6]:
# Attach the split date parts to the main frame, matching rows by index
# (left join, so every row of `data` is kept).
data = data.merge(data_temp, how='left', left_index=True, right_index=True)
data.head(n=5)

Unnamed: 0,日期,质量等级,AQI指数,当天AQI排名,PM2.5,PM10,So2,No2,Co,O3,0,1,2
0,2014-01-01,轻度污染,149,140,114,161,61,40,1.5,9,2014,1,1
1,2014-01-02,中度污染,167,131,134,187,62,40,1.35,11,2014,1,2
2,2014-01-03,重度污染,208,144,158,229,56,52,1.62,15,2014,1,3
3,2014-01-04,重度污染,205,170,156,226,42,54,1.73,21,2014,1,4
4,2014-01-05,重度污染,210,148,160,213,37,46,1.64,16,2014,1,5


## 2.3 修改列名

In [7]:
# Give the split-out columns 0 / 1 / 2 readable labels:
# year (年), month (月) and day (日). The frame is modified in place.
data.rename({0: '年', 1: '月', 2: '日'}, axis='columns', inplace=True)
data

Unnamed: 0,日期,质量等级,AQI指数,当天AQI排名,PM2.5,PM10,So2,No2,Co,O3,年,月,日
0,2014-01-01,轻度污染,149,140,114,161,61,40,1.5,9,2014,01,01
1,2014-01-02,中度污染,167,131,134,187,62,40,1.35,11,2014,01,02
2,2014-01-03,重度污染,208,144,158,229,56,52,1.62,15,2014,01,03
3,2014-01-04,重度污染,205,170,156,226,42,54,1.73,21,2014,01,04
4,2014-01-05,重度污染,210,148,160,213,37,46,1.64,16,2014,01,05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2750,2021-08-27,优,17,33,10,15,6,22,0.57,43,2021,08,27
2751,2021-08-28,优,16,24,9,16,6,22,0.61,30,2021,08,28
2752,2021-08-29,优,18,44,10,18,6,23,0.62,33,2021,08,29
2753,2021-08-30,优,28,144,13,25,7,22,0.7,48,2021,08,30


# 3. 类型转换

## 3.1 数值类型

In [8]:
# Cast every measurement and date-part column from its loaded (string) form to
# a numeric dtype in a single pass.
#
# Each column is converted from ITSELF. The previous version assigned
# data['No2'].astype(int) to data['So2'], which silently replaced the So2
# readings with a copy of the No2 readings.
#
# 'Co' holds fractional values (e.g. 1.35), so it becomes a float; all other
# listed columns are whole numbers.
data = data.astype({
    'AQI指数': int,
    '当天AQI排名': int,
    'PM2.5': int,
    'PM10': int,
    'So2': int,
    'No2': int,
    'Co': np.double,
    'O3': int,
    '年': int,
    '月': int,
    '日': int,
})

## 3.2 类型转换结果

In [9]:
# Print per-column dtypes and non-null counts to verify the conversion above.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2755 entries, 0 to 2754
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   日期       2755 non-null   object 
 1   质量等级     2755 non-null   object 
 2   AQI指数    2755 non-null   int32  
 3   当天AQI排名  2755 non-null   int32  
 4   PM2.5    2755 non-null   int32  
 5   PM10     2755 non-null   int32  
 6   So2      2755 non-null   int32  
 7   No2      2755 non-null   int32  
 8   Co       2755 non-null   float64
 9   O3       2755 non-null   int32  
 10  年        2755 non-null   int32  
 11  月        2755 non-null   int32  
 12  日        2755 non-null   int32  
dtypes: float64(1), int32(10), object(2)
memory usage: 172.3+ KB


# 4. 写入新文件

In [10]:
# Persist the processed frame as a comma-separated UTF-8 file:
# the header row is written, the row index is not.
data.to_csv(
    "data/AQ/AQ_data_new.csv",
    sep=',',
    header=True,
    index=False,
    encoding='utf-8',
)