#### 数据表说明：降雨数据表 rain_data（降雨时间、降雨量）；观测站表 observe_abute（观测站属性）；地区降雨表 region_rain

In [2]:
import findspark
findspark.init()
from pyspark import SparkConf
from pyspark.sql import HiveContext,SparkSession
import pandas as pd
import numpy as np
import datetime


# Spark application name and the standalone cluster master to connect to.
appname = "rain data cleaning"
master = "spark://master:7077"
# spark.driver.maxResultSize limits the total size of the results that the
# workers may send back to the driver for one action (e.g. toPandas()).
conf = (
    SparkConf()
    .setAppName(appname)
    .setMaster(master)
    .set('spark.driver.maxResultSize', '8g')
)

spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()
# HiveContext is a legacy entry point whose constructor takes a SparkContext,
# so pass the session's context instead of the SparkSession object itself.
hive_cont = HiveContext(spark.sparkContext)
# Turn on Arrow for the Spark <-> pandas conversions used in this notebook.
spark.sql("set spark.sql.execution.arrow.enabled=true")

DataFrame[key: string, value: string]

## 降雨数据处理
### 从头遍历一次降雨数据
#### 维护唯一的 S_STATIONID，并记录在 observe_abute 观测站属性表中

In [4]:
# Build the station attribute table: exactly one row per S_STATIONID, taken
# from the most recent record (largest D_TIME) of that station.
from pyspark.sql import Window
from pyspark.sql import functions as F

start_time = '2013-01-01 00:00:00'
end_time = '2022-01-01 00:00:00'

# Read the SQL template; the context manager closes the file even on error.
with open(file='./sql/rain.sql') as rain_sql:
    list_text = rain_sql.readlines()
sql_text = " ".join(list_text)
sql_text = sql_text.format(start_time=start_time, end_time=end_time)
# Fetch the raw readings from Hive.
hive_data = spark.sql(sql_text)

observe_abute = hive_data.select('S_STATIONID', 'D_TIME', 'S_STATIONNAME', 'S_DIST', 'S_XIANGZHEN')
# dropDuplicates() after orderBy() does not guarantee which duplicate is kept,
# so rank the rows of every station by time and keep the newest one explicitly.
latest_first = Window.partitionBy('S_STATIONID').orderBy(F.col('D_TIME').desc())
observe_abute = (
    observe_abute
    .withColumn('_row_no', F.row_number().over(latest_first))
    .filter(F.col('_row_no') == 1)
    .drop('_row_no', 'D_TIME')
)

observe_abute.write.format("hive").mode("overwrite").saveAsTable('rcxljjs.t_observe_abute')

#### 按时间，S_STATIONID 遍历数据，记录每一场降雨情况

In [None]:
from decimal import Decimal

# Rain event table `rain_data`: one row per rain event with the station id,
# start/end time, duration in minutes, accumulated rainfall, rank and district.


def _classify_rain_rank(duration, rain_fall):
    """Return the rain rank (1 = lightest ... 6 = heaviest) of one event.

    duration is the event length in minutes, rain_fall the accumulated amount.
    """
    # NOTE(review): duration is in minutes, yet the limits 5/10, 15/25, 30/50
    # look like 12-hour / 24-hour rainfall classes -- confirm whether the
    # boundary should be 720 (minutes) instead of 12.
    limits = (5, 15, 30) if duration <= 12 else (10, 25, 50)
    for rank, limit in enumerate(limits, start=1):
        if rain_fall <= limit:
            return rank
    if rain_fall <= 100:
        return 4
    if rain_fall <= 250:
        return 5
    return 6


def _scan_station_readings(readings, carry):
    """Extract the finished rain events from one station's sorted readings.

    readings: DataFrame with S_STATIONID, D_TIME, N_RAINVALUE and S_DIST for a
        single station, ordered by D_TIME.
    carry: None, or (start_time, rain_fall, rain_time) of an event that was
        still open when the previous batch of this station ended.

    Returns (events, carry): the list of finished event rows and the state of
    the event that is still open at the end of `readings` (None if there is
    none).
    """
    events = []
    raining = carry is not None
    start_time, rain_fall, rain_time = carry if raining else (None, 0.0, 0)

    for value in readings.itertuples(index=False):
        amount = value.N_RAINVALUE
        if amount >= 0.2:
            if not raining:                       # rain starts
                raining = True
                start_time = value.D_TIME
                rain_fall, rain_time = 0.0, 0
            rain_fall += amount
            rain_time += 1
        elif amount < 0.2 and raining:            # rain stops
            # An event made of a single wet reading is discarded.
            if rain_time > 1:
                end_time = value.D_TIME
                # total_seconds() keeps whole days; `.seconds` would drop them.
                duration = (end_time - start_time).total_seconds() / 60
                events.append([
                    value.S_STATIONID, start_time, end_time, duration,
                    rain_fall, _classify_rain_rank(duration, rain_fall),
                    value.S_DIST,
                ])
            raining, rain_fall, rain_time = False, 0.0, 0
        # Dry reading while it is not raining, or a missing (NaN) value:
        # nothing to do.

    return events, ((start_time, rain_fall, rain_time) if raining else None)


# The data is pulled one year at a time because toPandas() should not return
# too much data at once.  Maps the first year of each range to its end year.
years = {str(year): str(year + 1) for year in range(2013, 2022)}

# Station ids that are allowed to appear in the result.
observe_abute = spark.sql('select * from rcxljjs.t_observe_abute').cache().toPandas()
station_ids = observe_abute['S_STATIONID']

rain_columns = ['S_STATIONID', 'START_TIME', 'END_TIME', 'DURATION', 'N_RAINVALUE', 'RARANK', 'S_DIST']
reading_columns = ['S_STATIONID', 'D_TIME', 'N_RAINVALUE', 'S_DIST']
rain_rows = []       # finished events, appended in year / station order
open_events = {}     # station id -> state of an event still open at year end

# The SQL template never changes, so read it once.
with open(file='./sql/rain.sql') as rain_sql:
    sql_template = " ".join(rain_sql.readlines())

for start_year, end_year in years.items():
    year_start_time = start_year + '-01-01 00:00:00'
    year_end_time = end_year + '-01-01 00:00:00'
    sql_text = sql_template.format(start_time=year_start_time, end_time=year_end_time)

    # Each yearly result is collected exactly once, so it is not cached;
    # caching would only pin nine large datasets in executor memory.
    hive_data = spark.sql(sql_text)
    year_rain_data = hive_data.toPandas()
    year_rain_data['S_DIST'] = year_rain_data['S_DIST'].fillna('无')
    year_rain_data['N_RAINVALUE'] = pd.to_numeric(year_rain_data['N_RAINVALUE'])

    # Group once per year instead of masking the whole frame for every station.
    by_station = year_rain_data[reading_columns].groupby('S_STATIONID', sort=False)
    for station_id in station_ids:
        if station_id not in by_station.groups:
            continue
        readings = by_station.get_group(station_id).sort_values('D_TIME')
        # The open-event state is kept per station, so an event can neither
        # leak into another station nor get lost at a year boundary.
        events, carry = _scan_station_readings(readings, open_events.pop(station_id, None))
        rain_rows.extend(events)
        if carry is not None:
            open_events[station_id] = carry

rain_data = pd.DataFrame(rain_rows, columns=rain_columns)

# Store rain_data in Hive table rcxljjs.t_rain_data.
# pandas Timestamps are converted to datetime objects and numpy scalars to
# plain Python numbers before the rows are handed to Spark.
rain_data_value = [
    [station, begin.to_pydatetime(), end.to_pydatetime(), float(minutes), float(amount), int(rank), dist]
    for station, begin, end, minutes, amount, rank, dist in rain_rows
]
rain_data_columns = list(rain_data.columns)

rain_data_spark = spark.createDataFrame(rain_data_value, rain_data_columns)
rain_data_spark.write.format("hive").mode("overwrite").saveAsTable('rcxljjs.t_rain_data')

In [11]:
# Replace the N_RAINVALUE column by the result of pd.to_numeric() on it.
year_rain_data['N_RAINVALUE'] = pd.to_numeric(year_rain_data['N_RAINVALUE'])

In [13]:
# Inspect the type of the N_RAINVALUE entry stored under row label 3.
type(year_rain_data.loc[3,'N_RAINVALUE'])

numpy.float64

In [1]:
# Write the pandas frame `rain_data` to Hive table rcxljjs.t_rain_data.
rain_data_columns = list(rain_data.columns)

# Columns 1 and 2 hold pandas Timestamps; they are turned into datetime
# objects before the rows are given to Spark.
rain_data_value = [
    row[:1] + [row[1].to_pydatetime(), row[2].to_pydatetime()] + row[3:]
    for row in rain_data.values.tolist()
]

rain_data_spark = spark.createDataFrame(rain_data_value, rain_data_columns)
(
    rain_data_spark.write
    .format("hive")
    .mode("overwrite")
    .saveAsTable('rcxljjs.t_rain_data')
)

NameError: name 'rain_data' is not defined

In [12]:
# Debug cell: load only the first year range of `years` into pandas.
# Read the SQL template; the context manager closes the file even on error.
with open(file='./sql/rain.sql') as rain_sql:
    list_text = rain_sql.readlines()

# Slicing to the first item replaces a loop that always ended with `break`.
for start_year, end_year in list(years.items())[:1]:
    # toPandas() should not return too much data, so query one year only.
    year_start_time = start_year + '-01-01 00:00:00'
    year_end_time = end_year + '-01-01 00:00:00'

    sql_text = " ".join(list_text)
    sql_text = sql_text.format(start_time=year_start_time, end_time=year_end_time)

    # Fetch the readings from Hive.
    hive_data = spark.sql(sql_text)
    hive_data.cache()
    year_rain_data = hive_data.toPandas()
    # Missing districts get the placeholder value.
    year_rain_data['S_DIST'] = year_rain_data['S_DIST'].fillna('无')

In [14]:
# Debug cell: slice and sort the readings of the first station id only.
for station_id in station_ids:
    year_rain_station = year_rain_data.loc[
        year_rain_data['S_STATIONID'] == station_id,
        ['S_STATIONID', 'D_TIME', 'N_RAINVALUE', 'S_DIST'],
    ]
    year_rain_station_sort = year_rain_station.sort_values(by='D_TIME')
    # Stop after the first station.
    break

In [16]:
# Display the rain_data frame as the cell output.
rain_data

Unnamed: 0,S_STATIONID,START_TIME,END_TIME,DURATION,N_RAINVALUE,RARANK,S_DIST
0,MH099105,2013-02-05 02:45:00,2013-02-05 03:05:00,20.0,1.4,1,无
1,MH099105,2013-02-09 09:25:00,2013-02-09 09:35:00,10.0,1.0,1,无
2,MH099105,2013-03-01 01:15:00,2013-03-01 01:45:00,30.0,3.0,1,无
3,MH099105,2013-03-01 03:55:00,2013-03-01 04:45:00,50.0,4.6,1,无
4,MH099105,2013-03-01 08:05:00,2013-03-01 08:15:00,10.0,0.6,1,无
5,MH099105,2013-03-13 04:15:00,2013-03-13 04:25:00,10.0,0.6,1,无
6,MH099105,2013-03-13 04:55:00,2013-03-13 05:15:00,20.0,1.4,1,无
7,MH099105,2013-03-13 05:25:00,2013-03-13 05:45:00,20.0,1.4,1,无
8,MH099105,2013-03-13 08:35:00,2013-03-13 08:55:00,20.0,1.4,1,无
9,MH099105,2013-03-17 15:55:00,2013-03-17 16:35:00,40.0,3.4,1,无


#### 统计降雨并将数据 插入雨量站属性表 observe_abute

In [None]:
# Per-station rain statistics for one year, written into the station
# attribute frame as columns RAINFALL, DURATION and FREQU.
rain_data = spark.sql('select * from rcxljjs.t_rain_data').cache().toPandas()
observe_abute = spark.sql('select * from rcxljjs.t_observe_abute').cache().toPandas()

# rain_data holds the events of all years; index it by start time so that the
# events of a single year can be selected by label.
rain_data['START_TIME'] = pd.to_datetime(rain_data['START_TIME'])
rain_data.set_index('START_TIME', inplace=True)

year_rain_data = rain_data.loc['2019']

observe_abute.set_index('S_STATIONID', inplace=True)
rainTimeFall = year_rain_data[['S_STATIONID', 'DURATION', 'N_RAINVALUE']]

# Total rainfall and total rain duration per station.
rainTimeFall_sum = rainTimeFall.groupby('S_STATIONID').sum()
# Number of rain events per station, counted over the same year as the sums
# (the earlier list.count() ran over the events of all years).
observe_frq = rainTimeFall.groupby('S_STATIONID').size()

observe_index = rainTimeFall_sum.index

# Create the three result columns one by one; assigning to
# observe_abute['RAINFALL', 'DURATION', 'FREQU'] would add a single column
# whose name is that tuple.
for column in ('RAINFALL', 'DURATION', 'FREQU'):
    observe_abute[column] = ''
for index in observe_index:
    observe_abute.loc[index, 'RAINFALL'] = rainTimeFall_sum.loc[index, 'N_RAINVALUE']
    observe_abute.loc[index, 'DURATION'] = rainTimeFall_sum.loc[index, 'DURATION']
    observe_abute.loc[index, 'FREQU'] = int(observe_frq.loc[index])

