In [24]:
import os
import pandas as pd

# States kept in the output; every other row of a yearly workbook is discarded.
_TARGET_STATES = (
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Michigan', 'Minnesota',
    'Missouri', 'Nebraska', 'North Dakota', 'Ohio', 'South Dakota', 'Wisconsin',
)

# Progress percentages whose dates are reported.
_MILESTONES = (25, 50, 75)


def _nearest_date(series, target):
    '''Return the index label whose value is closest to ``target`` (NaT if the series has no valid value).'''
    gaps = (series - target).abs().dropna()
    if gaps.empty:
        return pd.NaT
    # idxmin returns the first label of the minimum, so ties resolve to the earliest date.
    return gaps.idxmin()


def process_phenology(input_folderpath,output_folderpath,yearlist,filename = '物候总表.xlsx'):
    '''
    Build a table of the dates on which crop progress is closest to 25, 50 and 75
    from per-year phenology workbooks, and write the combined table to one Excel file.

    For every year the workbook ``<year>年物候表.xlsx`` is read from the input folder.
    '(NA)' cells become missing values and '-' cells become 0, surrounding whitespace
    is stripped from the state names, the '年份' column is set to the year, 'Unnamed:'
    columns are dropped and only rows of the target states are kept. The first three
    columns are treated as identifier columns (the original author's note names them
    as year, state and phenological stage -- confirm against the workbooks); every
    remaining column label must be parseable as a date.

    The dated observations are resampled to daily frequency and filled with a
    third-order polynomial interpolation; negative results are clipped to 0. For
    each series everything up to and including the last 0, and everything from the
    first 100 onward, is blanked out before the milestone dates are looked up.

    Parameters
    ----------
    input_folderpath : string. Folder containing the yearly workbooks.
    output_folderpath : string. Folder the combined workbook is written to.
    yearlist : list. Years to process, e.g. 2003-2021. Must not be empty.
    filename : string. Suffix of the output file name; the output file is named
        ``<first year>-<last year><filename>``.

    Returns
    ----------
    string. The literal 'processed'.

    Raises
    ----------
    ValueError
        If ``yearlist`` is empty.
    '''
    yearlist = list(yearlist)
    if not yearlist:
        raise ValueError('yearlist must contain at least one year')

    print('processing...')
    yearly_tables = []
    for year in yearlist:
        raw = pd.read_excel(os.path.join(input_folderpath, str(year) + '年物候表.xlsx'))
        # '(NA)' -> missing value, '-' -> 0.
        table = raw.replace(['(NA)', '-'], [None, 0])
        # Strip stray whitespace from state names so the filter below matches them.
        table['州'] = table['州'].map(lambda s: s.strip() if isinstance(s, str) else s)
        table['年份'] = year
        # Drop the empty 'Unnamed:' columns produced by blank spreadsheet columns.
        table = table.drop(columns=list(table.filter(regex='Unnamed:')))
        table = table[table['州'].isin(list(_TARGET_STATES))]

        # After transposing, each column is one series and each row one workbook column.
        wide = table.T
        id_rows = wide.iloc[0:3]
        progress = wide.iloc[3:].copy()
        progress.index = pd.to_datetime(progress.index)

        # Upsample to daily frequency and fill the gaps with a cubic polynomial.
        daily = (
            progress.resample('D').asfreq()
            .apply(pd.to_numeric)
            .interpolate(method='polynomial', order=3)
        )
        # The interpolation can undershoot below zero.
        daily = daily.clip(lower=0)

        # Blank out everything up to the last 0 and everything from the first 100 on.
        for col in daily.columns:
            zero_dates = daily.index[daily[col] == 0]
            if len(zero_dates):
                daily.loc[:zero_dates[-1], col] = float('nan')
            full_dates = daily.index[daily[col] == 100]
            if len(full_dates):
                daily.loc[full_dates[0]:, col] = float('nan')

        # Date at which each series is closest to each milestone percentage.
        milestones = pd.DataFrame(index=list(_MILESTONES), columns=daily.columns)
        for q in _MILESTONES:
            for col in daily.columns:
                milestones.loc[q, col] = _nearest_date(daily[col], q)

        yearly_tables.append(pd.concat([id_rows, milestones]).T)

    combined = pd.concat(yearly_tables)
    out_name = '{}-{}'.format(min(yearlist), max(yearlist)) + filename
    combined.to_excel(os.path.join(output_folderpath, out_name), index=False)
    return 'processed'

In [25]:
# Soybean: read the raw yearly workbooks and write the combined milestone table.
inputdir = 'D:/论文-产量趋势利用/数据/物候数据/raw/soybean/'
outputdir = 'D:/论文-产量趋势利用/数据/物候数据/processed/soybean/'
yearlist = list(range(2003, 2022))  # 2003 through 2021 inclusive
process_phenology(
    input_folderpath=inputdir,
    output_folderpath=outputdir,
    yearlist=yearlist,
)

processing...


'processed'

In [26]:
# Maize: read the raw yearly workbooks and write the combined milestone table.
inputdir = 'D:/论文-产量趋势利用/数据/物候数据/raw/maize/'
outputdir = 'D:/论文-产量趋势利用/数据/物候数据/processed/maize/'
yearlist = list(range(2003, 2022))  # 2003 through 2021 inclusive
process_phenology(
    input_folderpath=inputdir,
    output_folderpath=outputdir,
    yearlist=yearlist,
)

processing...


'processed'