In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the raw monitoring CSV. `header=0` consumes the file's own header row,
# and `names` replaces those labels with the column names listed below.
column_names = [
    '监测时间',
    'SO2监测浓度(μg/m3)',
    'NO2监测浓度(μg/m3)',
    'PM10监测浓度(μg/m3)',
    'PM2.5监测浓度(μg/m3)',
    'O3监测浓度(μg/m3)',
    'CO监测浓度(mg/m3)',
]
df1 = pd.read_csv(
    r'E:\Projects\OneDrive - somebottle\文档Documents\大数据分析Spark\期末大作业\气象监测数据\data2.csv',
    header=0,
    names=column_names,
)

In [5]:
# Number of rows read from the file
len(df1)

819

In [6]:
# Peek at the last five rows to confirm the file was parsed to the end
df1.iloc[-5:]

Unnamed: 0,监测时间,SO2监测浓度(μg/m3),NO2监测浓度(μg/m3),PM10监测浓度(μg/m3),PM2.5监测浓度(μg/m3),O3监测浓度(μg/m3),CO监测浓度(mg/m3)
814,2021-7-8,5,15,18,3,64,0.4
815,2021-7-9,7,17,28,12,146,0.4
816,2021-7-10,6,13,20,5,81,0.4
817,2021-7-11,6,11,20,3,63,0.3
818,2021-7-12,6,11,17,5,81,0.4


In [7]:
# Discard every row whose timestamp ('监测时间') is missing; all other
# columns are ignored when deciding which rows to drop.
df1 = df1.dropna(axis='index', how='any', subset=['监测时间'])

In [8]:
# Row count after removing rows without a timestamp
len(df1)

819

In [9]:
# Coerce every column except the first one (the timestamp column) to a
# numeric dtype. Values that cannot be parsed become NaN (errors='coerce'),
# so they can be located and handled afterwards instead of raising here.
numeric_columns = df1.columns[1:]
df1[numeric_columns] = df1[numeric_columns].apply(
    pd.to_numeric, errors='coerce')

In [10]:
# Collect the rows in which at least one numeric column is NaN, i.e. rows
# that held a non-numeric value before the coercion step.
has_missing_value = df1[numeric_columns].isna().any(axis='columns')
invalid_rows = df1.loc[has_missing_value]

In [11]:
# Display the rows collected in `invalid_rows`
invalid_rows

Unnamed: 0,监测时间,SO2监测浓度(μg/m3),NO2监测浓度(μg/m3),PM10监测浓度(μg/m3),PM2.5监测浓度(μg/m3),O3监测浓度(μg/m3),CO监测浓度(mg/m3)


In [12]:
# Fill missing values by carrying the previous row's value forward.
# `DataFrame.ffill()` is used instead of `fillna(method='ffill')` because the
# `method` argument of `fillna` is deprecated in recent pandas releases.
df1 = df1.ffill()

In [13]:
# Show the rows recorded after 2019-04-23 13:00. The timestamp column still
# holds strings at this point, so it is parsed before comparing: a plain
# string comparison is lexicographic and would, for example, rank
# '2019-10-1' before '2019-4-23' and silently drop those rows.
df1[pd.to_datetime(df1['监测时间']) > pd.Timestamp('2019-04-23 13:00')]

Unnamed: 0,监测时间,SO2监测浓度(μg/m3),NO2监测浓度(μg/m3),PM10监测浓度(μg/m3),PM2.5监测浓度(μg/m3),O3监测浓度(μg/m3),CO监测浓度(mg/m3)
8,2019-4-24,5,20,35,16,85,0.6
9,2019-4-25,7,23,36,19,79,0.7
10,2019-4-26,8,32,44,22,67,0.7
11,2019-4-27,6,48,22,12,56,0.7
12,2019-4-28,6,42,34,22,78,0.7
...,...,...,...,...,...,...,...
814,2021-7-8,5,15,18,3,64,0.4
815,2021-7-9,7,17,28,12,146,0.4
816,2021-7-10,6,13,20,5,81,0.4
817,2021-7-11,6,11,20,3,63,0.3


In [14]:
# Collect the rows in which at least one numeric column is negative,
# then report how many such rows exist.
has_negative = df1[numeric_columns].lt(0).any(axis='columns')
negative_rows = df1.loc[has_negative]
len(negative_rows)

0

In [15]:
# Turn every negative reading into NaN. The mask is applied to the numeric
# columns only, so the timestamp column is never part of the comparison.
df1[numeric_columns] = df1[numeric_columns].mask(df1[numeric_columns] < 0)
# Fill the gaps just created with the previous row's value.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')`.
# NOTE(review): a NaN in the very first row has no predecessor and stays NaN.
df1 = df1.ffill()

In [16]:
# Sanity check after cleaning: count the rows that still contain a negative
# value or a missing value in any numeric column.
negative_row_count = int((df1[numeric_columns] < 0).any(axis=1).sum())
missing_row_count = int(df1[numeric_columns].isnull().any(axis=1).sum())
print('负值行数:', negative_row_count)
print('缺失值行数:', missing_row_count)

负值行数: 0
缺失值行数: 0


In [17]:
# Parse the timestamp strings into datetime values and write them back
# over the original '监测时间' column.
parsed_times = pd.to_datetime(df1['监测时间'])
df1['监测时间'] = parsed_times

In [18]:
# Write the cleaned frame to disk; index=False keeps the row index out of
# the exported file.
output_path = r'E:\Projects\OneDrive - somebottle\文档Documents\大数据分析Spark\期末大作业\气象监测数据\preprocessed_by_day.csv'
df1.to_csv(output_path, index=False)