In [12]:
import os
import pandas as pd

def yield_processing(input_dir, output_dir, cols = ['State','County','Year','Value'], crop = 'soybean', sta_con_dir = ''):
    '''
    Merge the raw yield CSV files downloaded from the National Agricultural
    Statistics Service (NASS) into one table and export the list of counties.

    Every ``.csv`` file found under ``input_dir`` (sub-directories included) is
    read, the state and county names are lower-cased, a ``sta_con`` key
    ("<state>_<county>") is added and, for a known crop, a ``yield(t/ha)``
    column is derived from ``Value``. Two files are written:

    * ``<output_dir>/<crop>.csv``: all processed records.
    * ``<sta_con_dir>/<crop>_sta_con.csv``: the distinct ``sta_con`` keys,
      sorted so that the file is identical from one run to the next.

    Parameters
    ----------
    input_dir: string, the directory of raw yield records. e.g. 'C:/Data/raw/soybean/'.
    output_dir: string, an existing directory where the processed data is saved.
    cols: list, the columns that will be read. The default is ['State','County','Year','Value'].
        'State', 'County' and 'Value' are required by the processing below.
    crop: string. 'soybean' or 'maize'. The default is 'soybean'. Any other value is
        still used to name the output files, but no ``yield(t/ha)`` column is added.
    sta_con_dir: string. The directory of the sta_con.csv file. The default '' makes
        the file land in the current working directory.

    Returns
    ----------
    None

    Notes
    ----------
    1. Files that do not end with ".csv" are ignored.
    2. If ``output_dir`` does not exist, or no CSV file is found, a message is
       printed and nothing is written.
    '''
    # Check if the output directory exists.
    if not os.path.isdir(output_dir):
        print('Please create a new outout directory.')
        return

    # Multipliers applied to the raw "Value" column to obtain "yield(t/ha)".
    # NOTE(review): presumably bushels/acre -> tonnes/hectare; confirm the source unit.
    unit_factors = {'soybean': 0.0672, 'maize': 0.062719012}
    factor = unit_factors.get(crop)

    frames = []
    print("Processing...")

    for root, dirs, files in os.walk(input_dir):
        # Sorting makes the traversal (and so the row order of the output) reproducible.
        dirs.sort()
        for file in sorted(files):
            # Process csv files only.
            if not file.endswith(".csv"):
                continue

            # Join with `root`, not `input_dir`: os.walk also yields files that
            # live in sub-directories.
            # NOTE(review): engine='python' is kept from the original code; the reason
            # for choosing it is not visible here.
            yield_data = pd.read_csv(os.path.join(root, file), engine='python', usecols = cols)

            # Make the names of states and counties lowercase.
            yield_data["State"] = yield_data["State"].str.lower()
            yield_data["County"] = yield_data["County"].str.lower()

            # Concatenate state name and county name as sta_con.
            yield_data["sta_con"] = yield_data["State"] + "_" + yield_data["County"]

            # Change the unit of yield records (skipped for an unknown crop).
            if factor is not None:
                yield_data["yield(t/ha)"] = yield_data["Value"] * factor

            frames.append(yield_data)

    # Nothing to merge: stop here instead of failing on undefined variables.
    if not frames:
        print('No CSV files were found in the input directory.')
        return

    # Concatenate all csv files in one go (avoids repeated copying inside the loop).
    df = pd.concat(frames)

    # Save sta_con.csv: one row per distinct key, in sorted order.
    sta_con = df[["sta_con"]].drop_duplicates().sort_values("sta_con")
    sta_con.to_csv(os.path.join(sta_con_dir, crop + "_sta_con.csv"), index=False)

    # Save new csv file of yield
    print(len(df.index))
    df.to_csv(os.path.join(output_dir, crop + '.csv'), index=False)
    print("Processed!")
    return



In [13]:
# Pre-process the soybean yield records (2004 to 2021).
# The three paths are absolute locations on the author's machine.
input_dir = "D:/论文-产量趋势利用/数据/产量数据/raw/soybean/"
output_dir = "D:/论文-产量趋势利用/数据/产量数据/processed/soybean/"
sta_con_dir = "D:/论文-产量趋势利用/可视化/QGIS源文件/"
yield_processing(
    input_dir=input_dir,
    output_dir=output_dir,
    crop='soybean',
    sta_con_dir=sta_con_dir,
)

Processing...
27401
Processed!


In [14]:
# Pre-process the maize yield records (2004 to 2021).
# The three paths are absolute locations on the author's machine.
input_dir = "D:/论文-产量趋势利用/数据/产量数据/raw/maize/"
output_dir = "D:/论文-产量趋势利用/数据/产量数据/processed/maize/"
sta_con_dir = "D:/论文-产量趋势利用/可视化/QGIS源文件/"
yield_processing(
    input_dir=input_dir,
    output_dir=output_dir,
    crop='maize',
    sta_con_dir=sta_con_dir,
)

Processing...
31765
Processed!
