In [215]:
import os


import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn'

def area_processing(input_dir, output_dir, cols = ['State','County','Year','Value','Domain Category'], cropland = 'cropland'):
    '''
    It is a processing function for the area records of census in 1997, 2002, 2007, 2012, and 2017
    downloaded from the National Agricultural Statistics Service (NASS).

    Every CSV file found under input_dir (sub-directories included) is read, the files are
    stacked into one table, cleaned, and written to <output_dir>/<cropland>.csv.

    Parameters
    ----------
    input_dir: string, the directory of raw area records. e.g. 'C:/Data/raw/soybean/'.
    output_dir: string, an existing directory where the processed area data is saved.
    cols: list, the columns that will be read.
          The default is ['State','County','Year','Value','Domain Category'].
          'State', 'County', 'Value' and 'Domain Category' are used by the processing steps.
    cropland: string, the prefix of the output column '<cropland> area' and the name of the
              output file, e.g. 'cropland' or 'irrigated cropland'. The default is 'cropland'.

    Returns
    ----------
    None

    Notes
    ----------
    1. The input directory should only contain CSV files of area records.
    2. If output_dir does not exist, or no CSV file is found, a message is printed and
       nothing is written.
    3. The withheld value '(D)' becomes NaN and is then filled with the mean of the same
       county and 'Domain Category', and after that with the mean of the same state and
       'Domain Category'. Any other non-numeric value raises ValueError.
    4. Rows whose area equals 0 are removed after the values are converted to numbers, so
       zeros that were stored as text are removed as well.
    '''
    # Check if the output directory exists.
    if not os.path.isdir(output_dir):
        print('Please create a new outout directory.')
        return

    area_col = cropland + ' area'
    frames = []
    print("Processing...")

    # os.walk descends into sub-directories, so join with the directory it reports
    # (root) rather than with input_dir.
    for root, _, files in os.walk(input_dir):
        for file in files:
            # Process csv files only.
            if not file.endswith(".csv"):
                continue
            area_data = pd.read_csv(os.path.join(root, file), engine='python', usecols=cols)

            # Make the names of states and counties lowercase.
            area_data["State"] = area_data["State"].str.lower()
            area_data["County"] = area_data["County"].str.lower()

            # Concatenate state name and county name as sta_con.
            area_data["sta_con"] = area_data["State"] + "_" + area_data["County"]
            frames.append(area_data)

    if not frames:
        print('No CSV file was found in the input directory.')
        return

    # Concatenate all csv files once. A fresh index is required: with the per-file
    # indexes kept, row labels repeat and label-based row removal hits rows of other files.
    df = pd.concat(frames, ignore_index=True)

    # Convert the raw text to float: strip blanks, remove thousands separators and map
    # the hidden value '(D)' to np.nan (a sentinel such as -999 would also work).
    values = df['Value'].astype(str).str.strip().str.replace(',', '', regex=False)
    df[area_col] = values.where(values != '(D)', np.nan).astype(float)
    df = df.drop(columns='Value')

    # Delete rows containing 'other (combined) counties' and 0 (NaN rows are kept).
    keep = (df['County'] != 'other (combined) counties') & (df[area_col] != 0)
    df = df[keep].copy()

    # Fill the nan using the historical value of the county first, then the value of
    # the state range. Only the area column is aggregated.
    for keys in (['sta_con', 'Domain Category'], ['State', 'Domain Category']):
        group_mean = df.groupby(keys)[area_col].transform('mean')
        df[area_col] = df[area_col].fillna(group_mean)

    # Save new csv file of area
    df.to_csv(os.path.join(output_dir, cropland + '.csv'), index=False)
    print("Processed!")
    return

def calculate_PIC(cropland_area_path, cropland_irri_path, croptype = 'cropland'):
    '''
    It is a processing function for calculating the percentage of irrigated cropland (PIC) using census data.

    Parameters
    ----------
    cropland_area_path: string, the path of the processed area file. e.g. "D:/cropland/cropland.csv".
                        It must contain the columns 'Year', 'sta_con' and '<croptype> area'.
    cropland_irri_path: string, the path of the processed irrigated area file.
                        It must contain the columns 'Year', 'sta_con' and 'irrigated <croptype> area'.
    croptype: string, the prefix used in the area column names. The default is 'cropland'.

    Returns
    ----------
    The DF with one row per ('Year', 'sta_con') present in both files, holding the summed
    numeric columns of both files and the PIC column (irrigated area / area, capped at 1).
    '''
    keys = ['Year', 'sta_con']
    area_col = croptype + ' area'
    irri_col = 'irrigated ' + croptype + ' area'

    # read area data
    area = pd.read_csv(cropland_area_path, engine='python')
    irri_area = pd.read_csv(cropland_irri_path, engine='python')

    # Adding together the areas of different value ranges (Domain Category).
    # numeric_only=True keeps text columns (State, County, Domain Category) out of the
    # sum; without it recent pandas concatenates their strings into meaningless columns.
    area = area.drop_duplicates().groupby(keys, sort=False, as_index=False).sum(numeric_only=True)
    irri_area = irri_area.drop_duplicates().groupby(keys, sort=False, as_index=False).sum(numeric_only=True)

    # merge; validate guards against duplicated (Year, sta_con) keys on either side
    cropland_integ = irri_area.merge(area, on=keys, how='inner', validate='one_to_one')

    # Percentage of irrigated cropland
    cropland_integ['PIC'] = cropland_integ[irri_col] / cropland_integ[area_col]
    # Upper limit set to 1
    cropland_integ.loc[cropland_integ['PIC'] > 1, 'PIC'] = 1

    return cropland_integ

def resample_PIC(data, output_dir,croptype = 'cropland'):
    '''
    It is a function which resamples the PIC from the census years to every year from 2003 to 2021.

    Each non-census year receives a copy of the records of one census year:
    2003-2004 from 2002, 2005-2006 and 2008-2009 from 2007, 2010-2011 and 2013-2014 from
    2012, 2015-2016 and 2018-2021 from 2017. The census years 2007, 2012 and 2017 keep
    their own records.

    Parameters
    ----------
    data: DF data with a 'Year' column (e.g. the output of calculate_PIC). It is not modified.
    output_dir: string, the directory where save processed area data.
    croptype: string, the prefix of the output file '<croptype>_PIC_resampled.csv'.
              The default is 'cropland'.

    Returns
    ----------
    None

    '''
    # (census year whose records are copied, years that receive the copy); the order of
    # the entries is the order of the rows in the output file.
    year_plan = [
        (2002, [2003, 2004]),
        (2007, [2005, 2006, 2008, 2009]),
        (2012, [2010, 2011, 2013, 2014]),
        (2017, [2015, 2016, 2018, 2019, 2020, 2021]),
        # The census years themselves must be part of the yearly series as well.
        (2007, [2007]),
        (2012, [2012]),
        (2017, [2017]),
    ]

    frames = []
    for census_year, target_years in year_plan:
        census_rows = data.loc[data['Year'] == census_year, :]
        for n in target_years:
            # assign returns a new frame, so neither data nor census_rows is altered
            frames.append(census_rows.assign(Year=n))
    integ_data = pd.concat(frames, ignore_index=True)

    # Save new csv file of PIC
    integ_data.to_csv(os.path.join(output_dir, croptype+'_'+'PIC_resampled.csv'), index=False)
    print("Processed!")
    return


In [216]:
# cropland
# Step 1: preprocess the cropland area records of the 1997, 2002, 2007, 2012, and 2017 census.
input_dir, output_dir = (
    "D:/论文-产量趋势利用/数据/面积数据/收获面积/raw/cropland/",
    "D:/论文-产量趋势利用/数据/面积数据/收获面积/processed/cropland/",
)
area_processing(input_dir=input_dir, output_dir=output_dir, cropland='cropland')

# Step 2: preprocess the irrigated cropland area records of the same census years.
input_dir, output_dir = (
    "D:/论文-产量趋势利用/数据/面积数据/灌溉面积/raw/cropland/",
    "D:/论文-产量趋势利用/数据/面积数据/灌溉面积/processed/cropland/",
)
area_processing(input_dir=input_dir, output_dir=output_dir, cropland='irrigated cropland')

# Step 3: get the PIC for every year from 2003 to 2021 from the two processed files.
cropland_area_path, cropland_irri_path, output_dir = (
    "D:/论文-产量趋势利用/数据/面积数据/收获面积/processed/cropland/cropland.csv",
    "D:/论文-产量趋势利用/数据/面积数据/灌溉面积/processed/cropland/irrigated cropland.csv",
    "D:/论文-产量趋势利用/数据/面积数据/",
)
PIC = calculate_PIC(cropland_area_path=cropland_area_path, cropland_irri_path=cropland_irri_path)
resample_PIC(data=PIC, output_dir=output_dir)

Processing...
Processed!
Processing...
Processed!
Processed!


In [217]:
# soybean
# Step 1: preprocess the soybean area records of the 1997, 2002, 2007, 2012, and 2017 census.
input_dir, output_dir = (
    "D:/论文-产量趋势利用/数据/面积数据/收获面积/raw/soybean/",
    "D:/论文-产量趋势利用/数据/面积数据/收获面积/processed/soybean/",
)
area_processing(input_dir=input_dir, output_dir=output_dir, cropland='soybean')

# Step 2: preprocess the irrigated soybean area records of the same census years.
input_dir, output_dir = (
    "D:/论文-产量趋势利用/数据/面积数据/灌溉面积/raw/soybean/",
    "D:/论文-产量趋势利用/数据/面积数据/灌溉面积/processed/soybean/",
)
area_processing(input_dir=input_dir, output_dir=output_dir, cropland='irrigated soybean')

# Step 3: get the PIC for every year from 2003 to 2021 from the two processed files.
cropland_area_path, cropland_irri_path, output_dir = (
    "D:/论文-产量趋势利用/数据/面积数据/收获面积/processed/soybean/soybean.csv",
    "D:/论文-产量趋势利用/数据/面积数据/灌溉面积/processed/soybean/irrigated soybean.csv",
    "D:/论文-产量趋势利用/数据/面积数据/",
)
PIC = calculate_PIC(
    cropland_area_path=cropland_area_path,
    cropland_irri_path=cropland_irri_path,
    croptype='soybean',
)
resample_PIC(data=PIC, output_dir=output_dir, croptype='soybean')

Processing...
Processed!
Processing...
Processed!
Processed!


In [218]:
# maize
# Step 1: preprocess the maize area records of the 1997, 2002, 2007, 2012, and 2017 census.
input_dir, output_dir = (
    "D:/论文-产量趋势利用/数据/面积数据/收获面积/raw/maize/",
    "D:/论文-产量趋势利用/数据/面积数据/收获面积/processed/maize/",
)
area_processing(input_dir=input_dir, output_dir=output_dir, cropland='maize')

# Step 2: preprocess the irrigated maize area records of the same census years.
input_dir, output_dir = (
    "D:/论文-产量趋势利用/数据/面积数据/灌溉面积/raw/maize/",
    "D:/论文-产量趋势利用/数据/面积数据/灌溉面积/processed/maize/",
)
area_processing(input_dir=input_dir, output_dir=output_dir, cropland='irrigated maize')

# Step 3: get the PIC for every year from 2003 to 2021 from the two processed files.
cropland_area_path, cropland_irri_path, output_dir = (
    "D:/论文-产量趋势利用/数据/面积数据/收获面积/processed/maize/maize.csv",
    "D:/论文-产量趋势利用/数据/面积数据/灌溉面积/processed/maize/irrigated maize.csv",
    "D:/论文-产量趋势利用/数据/面积数据/",
)
PIC = calculate_PIC(
    cropland_area_path=cropland_area_path,
    cropland_irri_path=cropland_irri_path,
    croptype='maize',
)
resample_PIC(data=PIC, output_dir=output_dir, croptype='maize')

Processing...
Processed!
Processing...
Processed!
Processed!
