In [1]:
import netCDF4 as nc
import os
import numpy as np
import pandas as pd
import pickle

# test(查看某个文件中数据时间格式)

In [3]:
filepath = r'D:\Wind_correction\nanhai\data\wrf\wrf_202102.nc'
dataset = nc.Dataset(filepath, mode='r')

In [5]:
dataset['time']

<class 'netCDF4._netCDF4.Variable'>
float64 time(time)
    long_name: time
    units: hours since 2021-02-01 00:00:00 
    calendar: standard
unlimited dimensions: 
current shape = (672,)
filling on, default _FillValue of 9.969209968386869e+36 used

## Feature---WRF数据

In [6]:
import re
def find_time_origin(string):
    # 正则表达式模式，匹配日期时间格式  
    pattern = r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'  
    # 使用re.search()查找匹配项  
    match = re.search(pattern, string)  
    # 如果找到匹配项，则提取匹配的日期时间字符串  
    if match:  
        date_time = match.group()
        print(date_time)
        return date_time
    else:  
        print("没有找到匹配的日期时间")

In [7]:
# 构建读取wrf数据的函数
def load_wrf(data_dir, variable_str):
    filename = os.listdir(data_dir)
    dict_target = {}
    for i, filename in enumerate(filename):
        filepath = os.path.join(data_dir, filename)
        dataset = nc.Dataset(filepath, mode='r')

        for variable in variable_str:
            data_temp = np.squeeze(dataset[variable][:])
            if variable == 'time':
                # 提取待处理字符串
                time_string = dataset[variable].units
                # 使用正则表达式提取时间
                datetime_origin = find_time_origin(time_string)
                # 时间戳转换
                data_temp = pd.to_datetime(data_temp, unit='h', origin=datetime_origin)
                
            if i == 0:
                dict_target[variable] = data_temp
            else:
                dict_target[variable] = np.concatenate((dict_target[variable], data_temp))
            
    return dict_target

In [8]:
# 依据风速分量u、v 合成速度大小velocity 和 风向degree
def get_vd(u, v):
    # 计算风速大小
    velocity = np.sqrt((u**2 + v**2))
    # 计算风向（以度表示）
    deg = 180.0/np.pi
    degrees = 180.0 + np.arctan2(u, v)*deg
    return velocity, degrees

In [9]:
## 处理WRF数据
# 构建加载参数
wrf_data_dir = r'D:\Wind_correction\nanhai\data\wrf'
wrf_variable_str = ['time', 'u10', 'v10', 'rh2', 't2', 'slp', 'lon', 'lat']
# 加载数据
dict_wrf = load_wrf(wrf_data_dir, wrf_variable_str)

2021-02-01 00:00:00
2021-03-01 00:00:00
2021-04-01 00:00:00
2021-05-01 00:00:00
2021-06-01 00:00:00
2021-07-01 00:00:00
2021-08-01 00:00:00
2021-09-01 00:00:00
2021-10-01 00:00:00
2021-11-01 00:00:00
2021-12-01 00:00:00
2022-01-01 00:00:00
2022-02-01 00:00:00
2022-03-01 00:00:00
2022-04-01 00:00:00
2022-05-01 00:00:00
2023-04-01 00:00:00


In [6]:
# 计算矢量风速和方向
dict_wrf['velocity'], dict_wrf['degree'] = get_vd(dict_wrf['u10'], dict_wrf['v10'])

In [7]:
## 经纬度lon/lat
wrf_path_202102 = r'D:\Wind_correction\nanhai\data\wrf\wrf_202102.nc'
ds_wrf_202102 = nc.Dataset(wrf_path_202102 , mode = 'r')
lon = ds_wrf_202102['lon'][:]
lat = ds_wrf_202102['lat'][:]
dict_wrf['lon'] = lon
dict_wrf['lat'] = lat

In [8]:
dict_wrf.keys()

dict_keys(['time', 'u10', 'v10', 'rh2', 't2', 'slp', 'lon', 'lat', 'velocity', 'degree'])

In [9]:
dict_wrf['time']

array(['2021-02-01T22:00:00.000000000', '2021-02-01T23:00:00.000000000',
       '2021-02-02T00:00:00.000000000', ...,
       '2023-05-01T19:00:00.000000000', '2023-05-01T20:00:00.000000000',
       '2023-05-01T21:00:00.000000000'], dtype='datetime64[ns]')

## Label---实测数据

In [6]:
# 构建读取实测数据的函数
def load_real(data_dir, variable_str, station_str):
    df_target_list = [] 
    filenames = os.listdir(data_dir)
    print(filenames)
    for i, filename in enumerate(filenames):
        filepath = os.path.join(data_dir, filename)
        df_temp = pd.read_excel(filepath, engine='openpyxl') 
        # 特殊处理站位编号
        df_temp[station_str] = i
        # 初始化字典容器
        dict_target = {}
        for variable in variable_str:
            data_temp = df_temp[variable]
            dict_target[variable] = data_temp 
        # 依据字典创建dataframe对象并添加到列表
        df_target_list.append(pd.DataFrame(dict_target))
    # 依据df_target_list合并各df对象
    df_merge = pd.concat(df_target_list)
    return df_merge

In [7]:
# 依据风速高度转换风速--计算经验系数k
def get_k(h0, h, z0=0.003):
    k = (np.log10(h) - np.log10(z0))/(np.log10(h0) - np.log10(z0))
    return k

In [8]:
## 处理标签数据
# 构建加载数据参数
label_data_dir = r'../data_real'
label_variable_str = ['站位编号', '获取时间', '经度', '纬度', '平均风速', '平均风向']
label_station_str = '站位编号'
# 加载数据
label_df_target = load_real(label_data_dir, label_variable_str, label_station_str)

['A地大_A点.xlsx', 'B深圳大学_B点.xlsx', 'C地大_C点.xlsx', 'D深圳大学_D点.xlsx']


In [20]:
# 计算系数k
k = get_k(2.8, 10)
print(k)
# 计算10m平均风速
label_df_target['平均风速_10m'] = label_df_target['平均风速'] * k

1.1861397720759732


In [21]:
label_df_target.keys()

Index(['站位编号', '获取时间', '经度', '纬度', '平均风速', '平均风向', '平均风速_10m'], dtype='object')

## 处理时间戳

In [22]:
# 处理wrf时间戳---Feature
dict_wrf['time'] = pd.to_datetime(dict_wrf['time']).astype('int64').to_numpy()/10**9

In [23]:
## 处理Real时间戳---Label
time_label = label_df_target['获取时间']
#转为numpy数组并以秒作为时间戳进制
time_label_target = time_label.astype('int64').to_numpy() / 10 ** 9
#覆盖原始数据
label_df_target['获取时间'] = time_label_target

In [24]:
dict_wrf['time']

array([1.6122168e+09, 1.6122204e+09, 1.6122240e+09, ..., 1.6829676e+09,
       1.6829712e+09, 1.6829748e+09])

In [25]:
label_df_target['获取时间']

0        1.648173e+09
1        1.648172e+09
2        1.648172e+09
3        1.648171e+09
4        1.648170e+09
             ...     
34540    1.629161e+09
34541    1.629160e+09
34542    1.629160e+09
34543    1.629159e+09
34544    1.629158e+09
Name: 获取时间, Length: 96254, dtype: float64

#### 时间设为单调递增

In [26]:
list(label_df_target.keys())

['站位编号', '获取时间', '经度', '纬度', '平均风速', '平均风向', '平均风速_10m']

In [27]:
label_df_target = label_df_target.groupby('站位编号').apply(lambda x: x.sort_values('获取时间'))

  label_df_target = label_df_target.groupby('站位编号').apply(lambda x: x.sort_values('获取时间'))


In [28]:
label_df_target

Unnamed: 0_level_0,Unnamed: 1_level_0,站位编号,获取时间,经度,纬度,平均风速,平均风向,平均风速_10m
站位编号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,22260,0,1.630541e+09,113.45481,21.56127,4.8,0.3,5.693471
0,22259,0,1.630542e+09,113.45481,21.56127,4.9,108.9,5.812085
0,22258,0,1.630543e+09,113.45481,21.56127,4.7,73.1,5.574857
0,22257,0,1.630543e+09,113.45481,21.56127,4.5,157.1,5.337629
0,22256,0,1.630544e+09,113.45481,21.56127,5.2,52.5,6.167927
...,...,...,...,...,...,...,...,...
3,4,3,1.651366e+09,113.77972,20.87623,,121.5,
3,3,3,1.651366e+09,113.77972,20.87623,,121.5,
3,2,3,1.651367e+09,113.77972,20.87623,,121.5,
3,1,3,1.651369e+09,113.77972,20.87623,,121.5,


In [29]:
# 重置索引
df_target = label_df_target.reset_index(drop=True)

In [30]:
df_target

Unnamed: 0,站位编号,获取时间,经度,纬度,平均风速,平均风向,平均风速_10m
0,0,1.630541e+09,113.45481,21.56127,4.8,0.3,5.693471
1,0,1.630542e+09,113.45481,21.56127,4.9,108.9,5.812085
2,0,1.630543e+09,113.45481,21.56127,4.7,73.1,5.574857
3,0,1.630543e+09,113.45481,21.56127,4.5,157.1,5.337629
4,0,1.630544e+09,113.45481,21.56127,5.2,52.5,6.167927
...,...,...,...,...,...,...,...
96249,3,1.651366e+09,113.77972,20.87623,,121.5,
96250,3,1.651366e+09,113.77972,20.87623,,121.5,
96251,3,1.651367e+09,113.77972,20.87623,,121.5,
96252,3,1.651369e+09,113.77972,20.87623,,121.5,


## 剔除Lable中平均风速为NaN的值

In [31]:
# 平均风速为NaN的索引
null_indices = df_target[df_target['平均风速_10m'].isnull()].index

In [32]:
# 剔除平均风速中为NaN的行
df = df_target.drop(null_indices, axis=0)

In [33]:
# 重置索引
df_target = df.reset_index(drop=True)

In [34]:
df_target

Unnamed: 0,站位编号,获取时间,经度,纬度,平均风速,平均风向,平均风速_10m
0,0,1.630541e+09,113.45481,21.56127,4.80,0.3,5.693471
1,0,1.630542e+09,113.45481,21.56127,4.90,108.9,5.812085
2,0,1.630543e+09,113.45481,21.56127,4.70,73.1,5.574857
3,0,1.630543e+09,113.45481,21.56127,4.50,157.1,5.337629
4,0,1.630544e+09,113.45481,21.56127,5.20,52.5,6.167927
...,...,...,...,...,...,...,...
70073,3,1.641001e+09,113.77757,20.87745,0.00,133.5,0.000000
70074,3,1.641002e+09,113.77757,20.87745,0.00,121.5,0.000000
70075,3,1.641239e+09,113.77650,20.87868,36.72,121.5,43.555052
70076,3,1.646503e+09,113.78186,20.87623,30.18,121.5,35.797698


## 保存统一时间戳后的原始Feature、Label数据

In [35]:
## 保存WRF数据
with open('../data/wrf_dataset_init', 'wb') as f:
    pickle.dump(dict_wrf , f)

In [36]:
## 保存Label数据
with open('../data/real_dataset_init', 'wb') as f:
    pickle.dump(df_target , f)

## 异常值处理

#### 加载数据

In [37]:
## 读取WRF数据
with open('../data/wrf_dataset_init', 'rb') as f:
    wrf_init = pickle.load(f)

In [38]:
wrf_init.keys()

dict_keys(['time', 'u10', 'v10', 'rh2', 't2', 'slp', 'lon', 'lat', 'velocity', 'degree'])

In [39]:
## 读取Lable数据
with open('../data/real_dataset_init', 'rb') as f:
    real_init = pickle.load(f)

In [40]:
real_init.keys()

Index(['站位编号', '获取时间', '经度', '纬度', '平均风速', '平均风向', '平均风速_10m'], dtype='object')

#### 检查real10m平均风速范围是否在[0,75]

In [41]:
real_velocity = real_init['平均风速_10m']
real_velocity.dtype

dtype('float64')

In [43]:
# 检查所有数值是否都在0到75范围内
range_V = (real_velocity >= 0) & (real_velocity <= 75*k)#75为海面数据，转为10米风速
if range_V.all():
    print("所有数据都在0到75之间")
else:
    print("存在不在0到75范围内的数据")

# 打印不在范围内的索引
out_of_range_indices = real_velocity[(real_velocity > 75) | (real_velocity < 0)].index
out_of_range_indices

所有数据都在0到75之间


Index([], dtype='int64')

#### 检查real平均风向范围是否在[0,360)

In [44]:
real_degree = real_init['平均风向']
real_degree.dtype

dtype('float64')

In [45]:
# 找出非NaN值且不在0到360度范围内的索引及其对应值
out_of_range_indices_and_values = real_degree[(np.isfinite(real_degree)) & ((real_degree > 360) | (real_degree < 0))]
#print("不在0到360度范围内的索引及对应值：", out_of_range_indices_and_values)

# 索引
out_of_range_indices = out_of_range_indices_and_values.index
print("不在0到360度范围内的索引：", out_of_range_indices)

不在0到360度范围内的索引： Index([], dtype='int64')


#### real10m平均风速的时间一致性检验---20

In [46]:
# 加载数据
real_velocity = real_init['平均风速_10m'].values
real_velocity.shape

(70078,)

In [47]:
def quality_control(data, H):
    # 初始化结果列表，用于存放不满足条件的数据序列
    non_conforming_sequences = []
    # 遍历一维数据及其对应的序列号，检查相邻元素差值
    for i in range(len(data) - 1):
        difference = abs(data[i] - data[i + 1])
        # 如果差值大于阈值，则将序列号加入结果列表
        if difference > H:
            non_conforming_sequences.append(i+1)
    return  non_conforming_sequences

In [48]:
# 设置阈值H
H = 20  
# V
problematic_sequences_V = quality_control(real_velocity, H)
problematic_sequences_V

[5740,
 14704,
 14705,
 20513,
 20514,
 20998,
 20999,
 24278,
 24279,
 32226,
 32227,
 45392,
 45393,
 70075,
 70077]

In [49]:
real_init.shape

(70078, 7)

In [50]:
# 剔除原始real数据中不符合要求的值
real_target = real_init.drop(problematic_sequences_V)

In [51]:
real_target.shape

(70063, 7)

In [52]:
# 重置索引
real_target = real_target.reset_index(drop=True)

## 剔除Station03中风速为0的值

In [53]:
# 加载数据
df_group = real_target.groupby('站位编号')
real_velocity_03 = df_group.get_group(3)['平均风速_10m'].values
# 找到0值的索引
zero_indices_alternative = np.where(real_velocity_03 == 0)
# 将元组转为列表
zero_indices = list(zero_indices_alternative)[0]
print(zero_indices)
print(len(zero_indices))
zero_indices_list = list(zero_indices)
#print(zero_indices_list)

[ 1296  1306  3838 ... 18584 18585 18586]
10801


In [54]:
def find_continuous_sequences(lst):
    result = []
    temp_sequence = [lst[0]] if lst else []
    for index, value in enumerate(lst[1:], start=1):  # 从索引1开始，避免重复第一个元素
        if value == lst[index - 1] + 1:
            temp_sequence.append(value)
        else:
            if temp_sequence:
                result.append(temp_sequence)
                temp_sequence = [value]
    # 处理列表末尾可能存在的连续序列
    if temp_sequence:
        result.append(temp_sequence)
    return result

In [55]:
list_all = find_continuous_sequences(zero_indices_list)
print(len(list_all))
index_drop = list_all[4]
len(index_drop)

5


10797

In [56]:
## 剔除[7790,:](index_drop)

##选取03站位的表格重新编号并剔除
real_03 = df_group.get_group(3)
real_03 = real_03.reset_index(drop=True)
real_03_target = real_03.drop(index_drop)
real_03_target = real_03_target.reset_index(drop=True)

##将原表格中的03剔除[51475,70062]
real_target_1 = real_target.drop(range(51475, 70063))
##将03站位加入表格中并重新排序
real_target_end = pd.concat([real_target_1 , real_03_target], axis=0)
real_target_end = real_target_end.reset_index(drop=True)

In [57]:
real_target_end

Unnamed: 0,站位编号,获取时间,经度,纬度,平均风速,平均风向,平均风速_10m
0,0,1.630541e+09,113.45481,21.56127,4.80,0.30000,5.693471
1,0,1.630542e+09,113.45481,21.56127,4.90,108.90000,5.812085
2,0,1.630543e+09,113.45481,21.56127,4.70,73.10000,5.574857
3,0,1.630543e+09,113.45481,21.56127,4.50,157.10000,5.337629
4,0,1.630544e+09,113.45481,21.56127,5.20,52.50000,6.167927
...,...,...,...,...,...,...,...
59261,3,1.634357e+09,113.77757,20.87868,8.20,109.50000,9.726346
59262,3,1.634358e+09,113.77757,20.87868,7.70,136.00000,9.133276
59263,3,1.634359e+09,113.77757,20.87868,8.80,95.79999,10.438030
59264,3,1.634359e+09,113.77757,20.87868,9.00,96.70001,10.675258


In [58]:
# 保存real数据
with open('../data/real_dataset_noabnormal', 'wb') as f:
    pickle.dump(real_target_end , f)