
# Import the yield-rate database for 2019


In [None]:
import pandas as pd
import re
%matplotlib inline

# Folder holding the input CSV and receiving the exported CSVs
DATA_FOLDER = '~/data/'

# Full path of the raw yield table
TRADYIELD = f"{DATA_FOLDER}tradyield2019.csv"

# Read the raw table and preview its first rows
tradyield = pd.read_csv(TRADYIELD)
tradyield.head()

TimeoutError: [Errno 60] Operation timed out

# Clean database

## Remove useless columns and rows with unreadable data

In [2]:
# Drop columns that carry no information: all-NaN columns first, then all-zero columns
tradyield = tradyield.dropna(axis=1, how='all')
has_nonzero = (tradyield != 0).any(axis=0)
tradyield = tradyield.loc[:, has_nonzero]

# Drop the descriptive columns that are not used further in this notebook
unused_columns = ['Year','Program','Period', 'Geo Level', 'State ANSI', 'Domain', 'Domain Category']
tradyield = tradyield.drop(columns=unused_columns)

# Report, then remove, the rows whose Value contains "(D)" (with or without surrounding spaces)
is_d_flagged = tradyield['Value'].str.contains(r'\(D\)')
print("Number of rows with (D) value: ", len(tradyield[is_d_flagged]))
tradyield = tradyield[~is_d_flagged]

# Report, then remove, the rows whose Commodity contains MAPLE SYRUP
is_maple_syrup = tradyield['Commodity'].str.contains(r'MAPLE SYRUP')
print("Number of rows with MAPLE SYRUP: ", len(tradyield[is_maple_syrup]))
tradyield = tradyield[~is_maple_syrup]


tradyield.head(7)

Number of rows with (D) value:  109
Number of rows with MAPLE SYRUP:  7


Unnamed: 0,State,Commodity,Data Item,Value
0,ALABAMA,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",147.0
1,ALABAMA,CORN,"CORN, SILAGE - YIELD, MEASURED IN TONS / ACRE",13.0
2,ALABAMA,COTTON,"COTTON - YIELD, MEASURED IN LB / ACRE",928.0
3,ALABAMA,COTTON,"COTTON, UPLAND - YIELD, MEASURED IN LB / ACRE",928.0
4,ALABAMA,HAY,"HAY - YIELD, MEASURED IN TONS / ACRE",2.5
5,ALABAMA,HAY,"HAY, (EXCL ALFALFA) - YIELD, MEASURED IN TONS ...",2.5
6,ALABAMA,PEANUTS,"PEANUTS - YIELD, MEASURED IN LB / ACRE",3350.0


## Unit conversion to short tons/acre

In [3]:
# Pull the measurement unit out of a 'Data Item' label
def extract_clean_unit(data_item):
    """Return the unit that follows "IN" in a data-item label, without spaces.

    The label is searched for "IN" followed by "<LETTERS> / <LETTERS>"
    (e.g. "... MEASURED IN BU / ACRE"); the matched unit is returned with
    its spaces removed ("BU/ACRE"). Returns None when the label contains no
    such pattern.
    """
    found = re.search(r'IN\s+([A-Z]+\s*/\s*[A-Z]+)', data_item)
    return found.group(1).replace(' ', '') if found else None

# Derive the 'Unit' column from each row's 'Data Item' label
tradyield['Unit'] = tradyield['Data Item'].apply(extract_clean_unit)

# Show the updated DataFrame. A bare expression is only rendered when it is the
# last statement of a cell, so the preview is displayed explicitly here.
display(tradyield.head(7))

# List the distinct units that were found
print(tradyield['Unit'].unique())


['BU/ACRE' 'TONS/ACRE' 'LB/ACRE' 'CWT/ACRE' 'BOXES/ACRE' 'BARRELS/ACRE']


In [4]:
# Select the rows measured in BARRELS/ACRE once, then print them and their count
barrel_rows = tradyield[tradyield['Unit'] == 'BARRELS/ACRE']
print(barrel_rows)
print(len(barrel_rows))

              State    Commodity  \
450   MASSACHUSETTS  CRANBERRIES   
640      NEW JERSEY  CRANBERRIES   
792          OREGON  CRANBERRIES   
1174      WISCONSIN  CRANBERRIES   

                                            Data Item  Value          Unit  
450   CRANBERRIES - YIELD, MEASURED IN BARRELS / ACRE  175.6  BARRELS/ACRE  
640   CRANBERRIES - YIELD, MEASURED IN BARRELS / ACRE  185.1  BARRELS/ACRE  
792   CRANBERRIES - YIELD, MEASURED IN BARRELS / ACRE  206.8  BARRELS/ACRE  
1174  CRANBERRIES - YIELD, MEASURED IN BARRELS / ACRE  224.4  BARRELS/ACRE  
4


In [5]:
# Select the rows measured in BOXES/ACRE once, then print them and their count
box_rows = tradyield[tradyield['Unit'] == 'BOXES/ACRE']
print(box_rows)
print(len(box_rows))

          State   Commodity  \
26      ARIZONA      LEMONS   
89   CALIFORNIA  GRAPEFRUIT   
104  CALIFORNIA      LEMONS   
115  CALIFORNIA     ORANGES   
116  CALIFORNIA     ORANGES   
117  CALIFORNIA     ORANGES   
144  CALIFORNIA  TANGERINES   
204     FLORIDA  GRAPEFRUIT   
205     FLORIDA  GRAPEFRUIT   
206     FLORIDA  GRAPEFRUIT   
211     FLORIDA     ORANGES   
212     FLORIDA     ORANGES   
213     FLORIDA     ORANGES   
223     FLORIDA  TANGERINES   
971       TEXAS  GRAPEFRUIT   
984       TEXAS     ORANGES   
985       TEXAS     ORANGES   
986       TEXAS     ORANGES   

                                             Data Item Value        Unit  
26            LEMONS - YIELD, MEASURED IN BOXES / ACRE   218  BOXES/ACRE  
89        GRAPEFRUIT - YIELD, MEASURED IN BOXES / ACRE   456  BOXES/ACRE  
104           LEMONS - YIELD, MEASURED IN BOXES / ACRE   484  BOXES/ACRE  
115          ORANGES - YIELD, MEASURED IN BOXES / ACRE   370  BOXES/ACRE  
116  ORANGES, MID & NAVEL - YIELD, 

In [6]:
# Strip the thousands separator (",") and parse 'Value' as numbers in one step;
# anything that still cannot be parsed becomes NaN
tradyield['Value'] = pd.to_numeric(
    tradyield['Value'].str.replace(',', '', regex=False),
    errors='coerce',
)

# Conversion ratios from each source unit to short tons per acre.
# The reference unit is the short ton (not the metric ton): 1 short ton = 2,000 lb,
# so 1 lb = 0.0005 short ton and every ratio is (pounds per source unit) * 0.0005.
# Source for the bushel, barrel and box weights (tables 5/6):
# https://ers.usda.gov/sites/default/files/_laserfiche/publications/41880/33132_ah697_002.pdf?v=91996
# NOTE(review): the citrus box weights below (25-43 lb) are worth re-checking against
# the source tables -- confirm they were read from the pounds column and not the
# kilograms column.
conversion_ratios = {
    'TONS/ACRE': 1,  # reference unit, no conversion
    'LB/ACRE': 0.0005,
    'CWT/ACRE': 0.05,  # 1 CWT = 100 lb
    'BU/ACRE (CORN/SORGHUM/RYE/FLAXSEED)': 0.028,  # i.e. 56 lb per bushel
    'BU/ACRE (BARLEY)': 0.024,  # i.e. 48 lb per bushel
    'BU/ACRE (WHEAT/SOYBEANS)': 0.03,  # i.e. 60 lb per bushel
    'BU/ACRE (OATS)': 32*0.0005,  # 32 lb per bushel
    'BARRELS/ACRE': 100*0.0005,  # 1 barrel of cranberries = 100 lb of fruit
    'BOXES/ACRE (LEMONS)': 38*0.0005,  # 1 box of lemons = 38 lb
    'BOXES/ACRE (ORANGES,FL)': 43*0.0005,  # 1 box of Florida oranges = 43 lb
    'BOXES/ACRE (ORANGES,TX)': 42*0.0005,  # 1 box of Texas oranges = 42 lb
    'BOXES/ACRE (ORANGE, AZ/CA)': 38*0.0005,  # 1 box of Arizona/California oranges = 38 lb
    'BOXES/ACRE (GRAPEFRUIT, FL/TX)': 40*0.0005,  # 1 box of Florida/Texas grapefruit = 40 lb
    'BOXES/ACRE (GRAPEFRUIT, AZ/CA)': 34*0.0005,  # 1 box of Arizona/California grapefruit = 34 lb
    'BOXES/ACRE (TANGERINES, AZ/CA)': 25*0.0005,  # 1 box of Arizona/California tangerines = 25 lb
    'BOXES/ACRE (TANGERINES, FL)': 43*0.0005,  # 1 box of Florida tangerines = 43 lb
}


# Function to convert values to tons/acre
def convert_to_tons_acre(row):
    """Convert one row's yield to short tons per acre.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Value' (numeric yield) and 'Unit' (e.g. 'BU/ACRE').
        'Commodity' is read for BU/ACRE and BOXES/ACRE rows, and 'State' for
        BOXES/ACRE rows of oranges, grapefruit and tangerines, because the
        weight of a bushel or box depends on the crop (and, for citrus, on
        the state).

    Returns
    -------
    float or None
        The yield multiplied by the matching entry of `conversion_ratios`.
        None when the value or the unit is missing, or when no ratio is
        defined for the unit / commodity / state combination.
    """
    value = row['Value']
    unit = row['Unit']

    if pd.isna(value):  # Nothing to convert
        return None
    if pd.isna(unit):  # No unit could be extracted: treat as unrecognized
        return None

    if unit == 'TONS/ACRE':
        return value  # Already in the reference unit

    # Units whose ratio does not depend on the commodity
    if unit in ('LB/ACRE', 'CWT/ACRE', 'BARRELS/ACRE'):
        return value * conversion_ratios[unit]

    if 'BU/ACRE' in unit:
        commodity = row['Commodity']
        # Each crop name is tested against the commodity individually; a bare
        # string literal inside an `or` chain is always truthy and would send
        # every bushel crop through the corn ratio.
        if any(crop in commodity for crop in ('CORN', 'SORGHUM', 'RYE', 'FLAXSEED')):
            return value * conversion_ratios['BU/ACRE (CORN/SORGHUM/RYE/FLAXSEED)']
        if 'BARLEY' in commodity:
            return value * conversion_ratios['BU/ACRE (BARLEY)']
        if 'WHEAT' in commodity or 'SOYBEANS' in commodity:
            return value * conversion_ratios['BU/ACRE (WHEAT/SOYBEANS)']
        if 'OATS' in commodity:
            return value * conversion_ratios['BU/ACRE (OATS)']
        return None  # Bushel crop without a known bushel weight

    if unit == 'BOXES/ACRE':
        commodity = row['Commodity']
        if 'LEMONS' in commodity:
            return value * conversion_ratios['BOXES/ACRE (LEMONS)']

        state = row['State']
        in_az_or_ca = 'ARIZONA' in state or 'CALIFORNIA' in state
        if 'ORANGE' in commodity:
            if 'FLORIDA' in state:
                return value * conversion_ratios['BOXES/ACRE (ORANGES,FL)']
            if 'TEXAS' in state:
                return value * conversion_ratios['BOXES/ACRE (ORANGES,TX)']
            if in_az_or_ca:
                return value * conversion_ratios['BOXES/ACRE (ORANGE, AZ/CA)']
        elif 'GRAPEFRUIT' in commodity:
            if 'FLORIDA' in state or 'TEXAS' in state:
                return value * conversion_ratios['BOXES/ACRE (GRAPEFRUIT, FL/TX)']
            if in_az_or_ca:
                return value * conversion_ratios['BOXES/ACRE (GRAPEFRUIT, AZ/CA)']
        elif 'TANGERINES' in commodity:
            if 'FLORIDA' in state:
                return value * conversion_ratios['BOXES/ACRE (TANGERINES, FL)']
            if in_az_or_ca:
                return value * conversion_ratios['BOXES/ACRE (TANGERINES, AZ/CA)']

    return None  # Return None for unrecognized units


# Compute the converted yield for every row, swap it in for the original
# 'Value'/'Unit' pair, and label the result with its single common unit
tradyield = (
    tradyield
    .assign(new_values=tradyield.apply(convert_to_tons_acre, axis=1))
    .drop(columns=['Value', 'Unit'])
    .rename(columns={'new_values': 'yield_value'})
    .assign(Unit='TON/ACRE')
)


tradyield.head(7)

Unnamed: 0,State,Commodity,Data Item,yield_value,Unit
0,ALABAMA,CORN,"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",4.116,TON/ACRE
1,ALABAMA,CORN,"CORN, SILAGE - YIELD, MEASURED IN TONS / ACRE",13.0,TON/ACRE
2,ALABAMA,COTTON,"COTTON - YIELD, MEASURED IN LB / ACRE",0.464,TON/ACRE
3,ALABAMA,COTTON,"COTTON, UPLAND - YIELD, MEASURED IN LB / ACRE",0.464,TON/ACRE
4,ALABAMA,HAY,"HAY - YIELD, MEASURED IN TONS / ACRE",2.5,TON/ACRE
5,ALABAMA,HAY,"HAY, (EXCL ALFALFA) - YIELD, MEASURED IN TONS ...",2.5,TON/ACRE
6,ALABAMA,PEANUTS,"PEANUTS - YIELD, MEASURED IN LB / ACRE",1.675,TON/ACRE


## Prepare data for the pivot table

In [7]:
# Function to extract the commodity name from the Data Item
def extract_commodity(data_item):
    """Return the commodity name that precedes the dash in a data-item label.

    Labels look like "CORN, GRAIN - YIELD, MEASURED IN BU / ACRE". The name is
    everything before the first dash that is surrounded by whitespace, so a
    hyphen inside a word (e.g. "NON-IRRIGATED") stays part of the name. When
    the label has no space-delimited dash, the text before the first dash of
    any kind is returned instead.

    Returns the stripped name, or None when the label contains no dash.
    """
    # Prefer the " - " separator; fall back to any dash for labels without one
    match = (
        re.search(r'^(.*?)\s+-\s+', data_item)
        or re.search(r'^(.*?)\s*-\s*', data_item)
    )
    if match:
        return match.group(1).strip()  # Return the part before the dash
    return None

# Replace the coarse Commodity with the detailed name parsed from 'Data Item'
tradyield['Commodity'] = tradyield['Data Item'].map(extract_commodity)

tradyield.head(200)

Unnamed: 0,State,Commodity,Data Item,yield_value,Unit
0,ALABAMA,"CORN, GRAIN","CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",4.1160,TON/ACRE
1,ALABAMA,"CORN, SILAGE","CORN, SILAGE - YIELD, MEASURED IN TONS / ACRE",13.0000,TON/ACRE
2,ALABAMA,COTTON,"COTTON - YIELD, MEASURED IN LB / ACRE",0.4640,TON/ACRE
3,ALABAMA,"COTTON, UPLAND","COTTON, UPLAND - YIELD, MEASURED IN LB / ACRE",0.4640,TON/ACRE
4,ALABAMA,HAY,"HAY - YIELD, MEASURED IN TONS / ACRE",2.5000,TON/ACRE
...,...,...,...,...,...
208,FLORIDA,"HAY, (EXCL ALFALFA)","HAY, (EXCL ALFALFA) - YIELD, MEASURED IN TONS ...",2.9000,TON/ACRE
209,FLORIDA,"MELONS, CANTALOUP","MELONS, CANTALOUP - YIELD, MEASURED IN CWT / ACRE",10.5000,TON/ACRE
210,FLORIDA,"MELONS, WATERMELON","MELONS, WATERMELON - YIELD, MEASURED IN CWT / ...",17.5000,TON/ACRE
211,FLORIDA,ORANGES,"ORANGES - YIELD, MEASURED IN BOXES / ACRE",4.3645,TON/ACRE


In [8]:
# Qualifier words that are stripped from the 'Commodity' names
qualifiers = ['UTILIZED', 'IN SHELL', 'IMPROVED']

# Report how many rows carry each qualifier (counted before anything is removed)
for word in qualifiers:
    print(f"Number of rows with {word} in Commodity: ", len(tradyield[tradyield['Commodity'].str.contains(word)]))

# Remove each qualifier together with the whitespace around it, then drop any
# trailing comma or extra spaces left behind
for word in qualifiers:
    tradyield['Commodity'] = tradyield['Commodity'].str.replace(rf'\s*{word}\s*', '', regex=True).str.strip()
    tradyield['Commodity'] = tradyield['Commodity'].str.replace(r',\s*$', '', regex=True).str.strip()

# Check whether some rows have a NaN yield_value and print them, so that a unit
# (or commodity) that is not handled by the conversion can be spotted
is_missing = tradyield['yield_value'].isnull()
missing_values = len(tradyield[is_missing])
print("Number of rows with NaN yield_value: ", missing_values)
if missing_values > 0:
    print(tradyield[is_missing])

# Check whether some rows have a negative yield_value and print how many
is_negative = tradyield['yield_value'] < 0
negative_values = len(tradyield[is_negative])
print("Number of rows with negative yield_value: ", negative_values)
# Remove rows with negative yield_value. The result is assigned back to
# `tradyield` so the filter actually takes effect on the frame used by the
# following cells; rows with NaN yield_value are kept (they are reported above).
tradyield = tradyield[~is_negative]


Number of rows with UTILIZED in Commodity:  10
Number of rows with IN SHELL in Commodity:  6
Number of rows with IMPROVED in Commodity:  0
Number of rows with NaN yield_value:  0
Number of rows with negative yield_value:  0


In [9]:
# Count every row whose (Commodity, State) pair occurs more than once.
# Only the pair is compared here; yield_value itself is not inspected.
is_repeated = tradyield.duplicated(subset=['Commodity', 'State'], keep=False)
duplicates = int(is_repeated.sum())
print("Number of rows with the same Commodity and State but different yield_value: ", duplicates)

# Keep only the first row of each (State, Commodity) pair
tradyield = tradyield.drop_duplicates(subset=['State', 'Commodity'])

# Recount on the de-duplicated frame to confirm nothing is left
still_repeated = tradyield.duplicated(subset=['Commodity', 'State'], keep=False)
duplicates_2 = int(still_repeated.sum())
print("Number of rows with the same Commodity and State but different yield_value: ", duplicates_2)


Number of rows with the same Commodity and State but different yield_value:  18
Number of rows with the same Commodity and State but different yield_value:  0


## Pivot table 
#### (for better, more coherent extraction across different tables later)

In [10]:
# Reshape to one row per State and one column per Commodity, then turn the
# 'State' index back into a regular column
tradyield_pivot = (
    tradyield
    .pivot(index='State', columns='Commodity', values='yield_value')
    .reset_index()
)

# Export cleaned and pivoted databases

In [11]:
# Write the pivoted table and the long-format table to DATA_FOLDER, without the index column
for frame, file_name in (
    (tradyield_pivot, 'tradyield_pivotcleaned.csv'),
    (tradyield, 'tradyield_cleaned.csv'),
):
    frame.to_csv(DATA_FOLDER + file_name, index=False)