In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import pyarrow.parquet as pq
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName('Python Spark SQL basic example') \
    .config('spark.some.config.option', 'some-value') \
    .getOrCreate()

pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 5000)


In [None]:
#df_2002 = spark.read.parquet('census_02.parquet')
#df_2007 = spark.read.load('census_07.parquet')
#df_2012 = spark.read.load('census_12.parquet')

In [17]:
def read_and_clean_USDA_census_parquet(input_parquet):
    """
    takes a parquet of one years USDA census of Agriculture data
    selects crops (ignores animals, economics/sales, etc)
    removes redundant, semi-redundant and just plain confusing data
    for more info see the README file at https://github.com/Rafael-SV/ag_census_analysis
    """
    dataframe_new = spark.read.parquet(input_parquet)
    dataframe_new.createOrReplaceTempView('dataframe_new')
    columns_to_select = 'DOMAIN_DESC,SECTOR_DESC,CLASS_DESC,PRODN_PRACTICE_DESC,COMMODITY_DESC,SHORT_DESC,UNIT_DESC,VALUE,YEAR,\
    COUNTY_NAME,COUNTY_CODE,ASD_DESC,STATE_ALPHA,STATISTICCAT_DESC'
    filtered_statisticcat_dataframe = spark.sql("SELECT {} FROM dataframe_new WHERE SECTOR_DESC = 'CROPS' \
        and (COMMODITY_DESC = 'COTTON' and CLASS_DESC='ALL CLASSES') \
        OR ( COMMODITY_DESC != 'COTTON' AND STATISTICCAT_DESC = 'AREA HARVESTED' OR 'STATISTICCAT_DESC' = 'AREA NOT HARVESTED' \
        OR STATISTICCAT_DESC = 'AREA IN PRODUCTION' OR STATISTICCAT_DESC = 'AREA BEARING' \
        OR STATISTICCAT_DESC = 'AREA NON-BEARING' OR STATISTICCAT_DESC = 'AREA GROWN')".format(columns_to_select))
    filtered_statisticcat_dataframe.createOrReplaceTempView("filtered_statisticcat_dataframe")
    #remove the aggregate totals except for the aggregate of the messy hay+haylage categories
    filtered_commodity_dataframe = spark.sql("SELECT {} FROM filtered_statisticcat_dataframe \
        WHERE COMMODITY_DESC NOT LIKE 'HAY' AND COMMODITY_DESC NOT LIKE 'HAYLAGE' AND COMMODITY_DESC NOT LIKE '%TOTALS' \
        AND COMMODITY_DESC NOT LIKE 'GRAIN STORAGE CAPAITY'".format(columns_to_select))
    filtered_commodity_dataframe.createOrReplaceTempView("filtered_commodity_dataframe")
    #remove categories based on irrigation pracitices
    filtered_unit_and_production_dataframe = spark.sql("\
        SELECT * FROM filtered_commodity_dataframe WHERE (UNIT_DESC LIKE 'OPERATIONS' OR UNIT_DESC LIKE 'SQ FT' \
        OR UNIT_DESC LIKE 'ACRES') AND PRODN_PRACTICE_DESC = 'ALL PRODUCTION PRACTICES' AND DOMAIN_DESC = 'TOTAL'")
    filtered_unit_and_production_dataframe.createOrReplaceTempView("filtered_unit_and_production_dataframe")
    return filtered_unit_and_production_dataframe
    

In [19]:
test_df = read_and_clean_USDA_census_parquet('census_02.parquet')
test_df.limit(100).toPandas().head(100)

Unnamed: 0,DOMAIN_DESC,SECTOR_DESC,CLASS_DESC,PRODN_PRACTICE_DESC,COMMODITY_DESC,SHORT_DESC,UNIT_DESC,VALUE,YEAR,COUNTY_NAME,COUNTY_CODE,ASD_DESC,STATE_ALPHA,STATISTICCAT_DESC
0,TOTAL,CROPS,SNAP,ALL PRODUCTION PRACTICES,BEANS,"BEANS, SNAP - ACRES HARVESTED",ACRES,(D),2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
1,TOTAL,CROPS,ALL CLASSES,ALL PRODUCTION PRACTICES,COTTON,COTTON - ACRES HARVESTED,ACRES,24598,2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
2,TOTAL,CROPS,ALL CLASSES,ALL PRODUCTION PRACTICES,HAY & HAYLAGE,HAY & HAYLAGE - ACRES HARVESTED,ACRES,12055,2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
3,TOTAL,CROPS,ALL CLASSES,ALL PRODUCTION PRACTICES,RYE,RYE - ACRES HARVESTED,ACRES,(D),2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
4,TOTAL,CROPS,ALL CLASSES,ALL PRODUCTION PRACTICES,SOYBEANS,SOYBEANS - ACRES HARVESTED,ACRES,3347,2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
5,TOTAL,CROPS,ALL CLASSES,ALL PRODUCTION PRACTICES,SUNFLOWER,SUNFLOWER - ACRES HARVESTED,ACRES,23,2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
6,TOTAL,CROPS,NON-OIL TYPE,ALL PRODUCTION PRACTICES,SUNFLOWER,"SUNFLOWER, NON-OIL TYPE - ACRES HARVESTED",ACRES,(D),2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
7,TOTAL,CROPS,OIL TYPE,ALL PRODUCTION PRACTICES,SUNFLOWER,"SUNFLOWER, OIL TYPE - ACRES HARVESTED",ACRES,(D),2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
8,TOTAL,CROPS,ALL CLASSES,ALL PRODUCTION PRACTICES,WHEAT,WHEAT - ACRES HARVESTED,ACRES,3363,2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
9,TOTAL,CROPS,WINTER,ALL PRODUCTION PRACTICES,WHEAT,"WHEAT, WINTER - ACRES HARVESTED",ACRES,3363,2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED


In [18]:
## test cell to check spark sql queries
columns_to_select = 'DOMAIN_DESC,SECTOR_DESC,CLASS_DESC,PRODN_PRACTICE_DESC,COMMODITY_DESC,SHORT_DESC,UNIT_DESC,VALUE,YEAR,\
    COUNTY_NAME,COUNTY_CODE,ASD_DESC,STATE_ALPHA,STATISTICCAT_DESC'
dataframe_new = spark.read.parquet('census_02.parquet')
dataframe_new.createOrReplaceTempView('dataframe_new')
filtered_dataframe = spark.sql("SELECT {} FROM dataframe_new WHERE SECTOR_DESC = 'CROPS' \
    and (COMMODITY_DESC = 'COTTON' and CLASS_DESC='ALL CLASSES') \
    OR ( COMMODITY_DESC != 'COTTON' AND STATISTICCAT_DESC = 'AREA HARVESTED' OR 'STATISTICCAT_DESC' = 'AREA NOT HARVESTED' \
    OR STATISTICCAT_DESC = 'AREA IN PRODUCTION' OR STATISTICCAT_DESC = 'AREA BEARING' \
    OR STATISTICCAT_DESC = 'AREA NON-BEARING' OR STATISTICCAT_DESC = 'AREA GROWN')".format(columns_to_select))
filtered_dataframe.createOrReplaceTempView("filtered_statisticcat_dataframe")
filtered_dataframe.limit(100).toPandas().head(100)

Unnamed: 0,DOMAIN_DESC,SECTOR_DESC,CLASS_DESC,PRODN_PRACTICE_DESC,COMMODITY_DESC,SHORT_DESC,UNIT_DESC,VALUE,YEAR,COUNTY_NAME,COUNTY_CODE,ASD_DESC,STATE_ALPHA,STATISTICCAT_DESC
0,TOTAL,CROPS,SNAP,ALL PRODUCTION PRACTICES,BEANS,"BEANS, SNAP - ACRES HARVESTED",ACRES,(D),2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
1,TOTAL,CROPS,ALL CLASSES,ALL PRODUCTION PRACTICES,COTTON,COTTON - ACRES HARVESTED,ACRES,24598,2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
2,TOTAL,CROPS,ALL CLASSES,ALL PRODUCTION PRACTICES,HAY & HAYLAGE,HAY & HAYLAGE - ACRES HARVESTED,ACRES,12055,2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
3,TOTAL,CROPS,ALL CLASSES,ALL PRODUCTION PRACTICES,HAY,HAY - ACRES HARVESTED,ACRES,12258,2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
4,TOTAL,CROPS,ALL CLASSES,IRRIGATED,HAY,"HAY, IRRIGATED - ACRES HARVESTED",ACRES,82,2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
5,TOTAL,CROPS,ALFALFA,ALL PRODUCTION PRACTICES,HAY,"HAY, ALFALFA - ACRES HARVESTED",ACRES,365,2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
6,TOTAL,CROPS,ALFALFA,IRRIGATED,HAY,"HAY, ALFALFA, IRRIGATED - ACRES HARVESTED",ACRES,(D),2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
7,TOTAL,CROPS,SMALL GRAIN,ALL PRODUCTION PRACTICES,HAY,"HAY, SMALL GRAIN - ACRES HARVESTED",ACRES,453,2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
8,TOTAL,CROPS,WILD,ALL PRODUCTION PRACTICES,HAY,"HAY, WILD - ACRES HARVESTED",ACRES,1022,2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED
9,TOTAL,CROPS,ALL CLASSES,ALL PRODUCTION PRACTICES,RYE,RYE - ACRES HARVESTED,ACRES,(D),2002.0,COLBERT,33.0,NORTHERN VALLEY,AL,AREA HARVESTED


In [4]:
interesting_column_str = 'GROUP_DESC, COMMODITY_DESC, CLASS_DESC, PRODN_PRACTICE_DESC, \
    UTIL_PRACTICE_DESC, STATISTICCAT_DESC, UNIT_DESC, SHORT_DESC, LOCATION_DESC, DOMAIN_DESC, VALUE'
interesting_column_list = str.split(interesting_column_str)
interesting_columns_fewer_coulumns_str = 'GROUP_DESC, COMMODITY_DESC, STATISTICCAT_DESC, UNIT_DESC, \
    SHORT_DESC, LOCATION_DESC, DOMAIN_DESC, VALUE'
interesting_columns_fewer_coulumns_list = interesting_columns_fewer_coulumns_str.split()
print(interesting_columns_fewer_coulumns_list)

['GROUP_DESC,', 'COMMODITY_DESC,', 'STATISTICCAT_DESC,', 'UNIT_DESC,', 'SHORT_DESC,', 'LOCATION_DESC,', 'DOMAIN_DESC,', 'VALUE']


In [10]:
df_2002 = spark.read.parquet("census_02.parquet")
# Parquet files can also be used to create a temporary view and then used in SQL statements.
df_2002.createOrReplaceTempView("df_2002")

In [3]:
df_2002.limit(100).toPandas().head(100)

Unnamed: 0,SOURCE_DESC,SECTOR_DESC,GROUP_DESC,COMMODITY_DESC,CLASS_DESC,PRODN_PRACTICE_DESC,UTIL_PRACTICE_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,DOMAIN_DESC,DOMAINCAT_DESC,AGG_LEVEL_DESC,STATE_ANSI,STATE_FIPS_CODE,STATE_ALPHA,STATE_NAME,ASD_CODE,ASD_DESC,COUNTY_ANSI,COUNTY_CODE,COUNTY_NAME,REGION_DESC,ZIP_5,WATERSHED_CODE,WATERSHED_DESC,CONGR_DISTRICT_CODE,COUNTRY_CODE,COUNTRY_NAME,LOCATION_DESC,YEAR,FREQ_DESC,BEGIN_CODE,END_CODE,REFERENCE_PERIOD_DESC,WEEK_ENDING,LOAD_TIME,VALUE,CV_%,__index_level_0__
0,CENSUS,CROPS,VEGETABLES,BEANS,SNAP,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"BEANS, SNAP - ACRES HARVESTED",TOTAL,NOT SPECIFIED,COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,(D),,0
1,CENSUS,CROPS,FIELD CROPS,COTTON,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,COTTON - ACRES HARVESTED,TOTAL,NOT SPECIFIED,COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,24598,,1
2,CENSUS,CROPS,FIELD CROPS,COTTON,UPLAND,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"COTTON, UPLAND - ACRES HARVESTED",TOTAL,NOT SPECIFIED,COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,24598,,2
3,CENSUS,CROPS,FIELD CROPS,COTTON,UPLAND,IRRIGATED,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"COTTON, UPLAND, IRRIGATED - ACRES HARVESTED",TOTAL,NOT SPECIFIED,COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,(D),,3
4,CENSUS,CROPS,FIELD CROPS,HAY & HAYLAGE,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,HAY & HAYLAGE - ACRES HARVESTED,TOTAL,NOT SPECIFIED,COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,12055,,4
5,CENSUS,CROPS,FIELD CROPS,HAY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,HAY - ACRES HARVESTED,TOTAL,NOT SPECIFIED,COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,12258,,5
6,CENSUS,CROPS,FIELD CROPS,HAY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,TONS,"HAY - PRODUCTION, MEASURED IN TONS",TOTAL,NOT SPECIFIED,COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,(D),,6
7,CENSUS,CROPS,FIELD CROPS,HAY,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"HAY, IRRIGATED - ACRES HARVESTED",TOTAL,NOT SPECIFIED,COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,82,,7
8,CENSUS,CROPS,FIELD CROPS,HAY,ALFALFA,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"HAY, ALFALFA - ACRES HARVESTED",TOTAL,NOT SPECIFIED,COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,365,,8
9,CENSUS,CROPS,FIELD CROPS,HAY,ALFALFA,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,TONS,"HAY, ALFALFA - PRODUCTION, MEASURED IN TONS",TOTAL,NOT SPECIFIED,COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,860,,9


In [13]:
only_crops_2002_df = spark.sql("SELECT {} FROM df_2002 WHERE SECTOR_DESC='CROPS'".format(interesting_column_str))
only_crops_2002_df.createOrReplaceTempView("only_crops_2002_df")

In [12]:
check_almonds_COMMOD_DESC_2002_df = spark.sql("\
    SELECT {} FROM only_crops_2002_df WHERE COMMODITY_DESC LIKE 'ALMONDS' \
".format(interesting_columns_fewer_coulumns_str))
check_almonds_COMMOD_DESC_2002_df.createOrReplaceTempView("check_almonds_COMMOD_DESC_2002_df")
check_almonds_COMMOD_DESC_2002_df.toPandas().head(10000)

Unnamed: 0,GROUP_DESC,COMMODITY_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,LOCATION_DESC,DOMAIN_DESC,VALUE
0,FRUIT & TREE NUTS,ALMONDS,AREA BEARING,ACRES,ALMONDS - ACRES BEARING,"ARIZONA, NORTHERN, COCONINO",TOTAL,1
1,FRUIT & TREE NUTS,ALMONDS,AREA BEARING,OPERATIONS,ALMONDS - OPERATIONS WITH AREA BEARING,"ARIZONA, NORTHERN, COCONINO",TOTAL,5
2,FRUIT & TREE NUTS,ALMONDS,AREA BEARING & NON-BEARING,ACRES,ALMONDS - ACRES BEARING & NON-BEARING,"ARIZONA, NORTHERN, COCONINO",TOTAL,1
3,FRUIT & TREE NUTS,ALMONDS,AREA BEARING & NON-BEARING,OPERATIONS,ALMONDS - OPERATIONS WITH AREA BEARING & NON-B...,"ARIZONA, NORTHERN, COCONINO",TOTAL,5
4,FRUIT & TREE NUTS,ALMONDS,AREA NON-BEARING,ACRES,ALMONDS - ACRES NON-BEARING,"ARIZONA, NORTHERN, MOHAVE",TOTAL,(D)
5,FRUIT & TREE NUTS,ALMONDS,AREA NON-BEARING,OPERATIONS,ALMONDS - OPERATIONS WITH AREA NON-BEARING,"ARIZONA, NORTHERN, MOHAVE",TOTAL,1
6,FRUIT & TREE NUTS,ALMONDS,AREA BEARING & NON-BEARING,ACRES,ALMONDS - ACRES BEARING & NON-BEARING,"ARIZONA, NORTHERN, MOHAVE",TOTAL,(D)
7,FRUIT & TREE NUTS,ALMONDS,AREA BEARING & NON-BEARING,OPERATIONS,ALMONDS - OPERATIONS WITH AREA BEARING & NON-B...,"ARIZONA, NORTHERN, MOHAVE",TOTAL,1
8,FRUIT & TREE NUTS,ALMONDS,AREA BEARING,ACRES,ALMONDS - ACRES BEARING,"ARIZONA, NORTHERN, YAVAPAI",TOTAL,(D)
9,FRUIT & TREE NUTS,ALMONDS,AREA BEARING,OPERATIONS,ALMONDS - OPERATIONS WITH AREA BEARING,"ARIZONA, NORTHERN, YAVAPAI",TOTAL,1


In [10]:
check_other_COMMOD_DESC_2002_df = spark.sql("\
    SELECT {} FROM only_crops_2002_df WHERE COMMODITY_DESC LIKE '%OTHER%'\
".format(interesting_column_str))
check_other_COMMOD_DESC_2002_df.createOrReplaceTempView("check_other_COMMOD_DESC_2002_df")
check_other_COMMOD_DESC_2002_df.toPandas().tail(10000)

Unnamed: 0,GROUP_DESC,COMMODITY_DESC,CLASS_DESC,PRODN_PRACTICE_DESC,UTIL_PRACTICE_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,LOCATION_DESC,DOMAIN_DESC,VALUE
9767,FIELD CROPS,"FIELD CROPS, OTHER",INCL HAY,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,SALES,$,"FIELD CROPS, OTHER, INCL HAY - SALES, MEASURED...","MONTANA, SOUTH CENTRAL, YELLOWSTONE",TOTAL,10451000
9768,FIELD CROPS,"FIELD CROPS, OTHER",INCL HAY,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,SALES,OPERATIONS,"FIELD CROPS, OTHER, INCL HAY - OPERATIONS WITH...","MONTANA, SOUTH CENTRAL, YELLOWSTONE",TOTAL,331
9769,FIELD CROPS,"FIELD CROPS, OTHER",INCL HAY,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,SALES,$,"FIELD CROPS, OTHER, INCL HAY - SALES, MEASURED...","MONTANA, SOUTHEAST, CARTER",TOTAL,(D)
9770,FIELD CROPS,"FIELD CROPS, OTHER",INCL HAY,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,SALES,OPERATIONS,"FIELD CROPS, OTHER, INCL HAY - OPERATIONS WITH...","MONTANA, SOUTHEAST, CARTER",TOTAL,21
9771,FIELD CROPS,"FIELD CROPS, OTHER",INCL HAY,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,SALES,$,"FIELD CROPS, OTHER, INCL HAY - SALES, MEASURED...","MONTANA, SOUTHEAST, CUSTER",TOTAL,3255000
9772,FIELD CROPS,"FIELD CROPS, OTHER",INCL HAY,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,SALES,OPERATIONS,"FIELD CROPS, OTHER, INCL HAY - OPERATIONS WITH...","MONTANA, SOUTHEAST, CUSTER",TOTAL,84
9773,FIELD CROPS,"FIELD CROPS, OTHER",INCL HAY,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,SALES,$,"FIELD CROPS, OTHER, INCL HAY - SALES, MEASURED...","MONTANA, SOUTHEAST, FALLON",TOTAL,436000
9774,FIELD CROPS,"FIELD CROPS, OTHER",INCL HAY,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,SALES,OPERATIONS,"FIELD CROPS, OTHER, INCL HAY - OPERATIONS WITH...","MONTANA, SOUTHEAST, FALLON",TOTAL,27
9775,FIELD CROPS,"FIELD CROPS, OTHER",INCL HAY,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,SALES,$,"FIELD CROPS, OTHER, INCL HAY - SALES, MEASURED...","MONTANA, SOUTHEAST, POWDER RIVER",TOTAL,583000
9776,FIELD CROPS,"FIELD CROPS, OTHER",INCL HAY,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,SALES,OPERATIONS,"FIELD CROPS, OTHER, INCL HAY - OPERATIONS WITH...","MONTANA, SOUTHEAST, POWDER RIVER",TOTAL,32


In [10]:
check_woody_COMMOD_DESC_2002_df = spark.sql("SELECT {} FROM only_crops_2002_df \
    WHERE COMMODITY_DESC LIKE '%CHRISTMAS TREES%' OR COMMODITY_DESC LIKE '%WOODY%' \
    AND STATISTICCAT_DESC NOT LIKE 'SALES'\
".format(interesting_columns_fewer_coulumns_str))
check_woody_COMMOD_DESC_2002_df.createOrReplaceTempView("check_woody_COMMOD_DESC_2002_df")
check_woody_COMMOD_DESC_2002_df.limit(1000).toPandas().tail(1000)

Unnamed: 0,GROUP_DESC,COMMODITY_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,LOCATION_DESC,DOMAIN_DESC,VALUE
0,HORTICULTURE,CUT CHRISTMAS TREES & SHORT TERM WOODY CROPS,SALES,$,CUT CHRISTMAS TREES & SHORT TERM WOODY CROPS -...,"ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,(D)
1,HORTICULTURE,CUT CHRISTMAS TREES & SHORT TERM WOODY CROPS,SALES,OPERATIONS,CUT CHRISTMAS TREES & SHORT TERM WOODY CROPS -...,"ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,2
2,HORTICULTURE,SHORT TERM WOODY CROPS,AREA HARVESTED,ACRES,SHORT TERM WOODY CROPS - ACRES HARVESTED,"ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,(D)
3,HORTICULTURE,SHORT TERM WOODY CROPS,AREA HARVESTED,OPERATIONS,SHORT TERM WOODY CROPS - OPERATIONS WITH AREA ...,"ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,2
4,HORTICULTURE,SHORT TERM WOODY CROPS,AREA IN PRODUCTION,ACRES,SHORT TERM WOODY CROPS - ACRES IN PRODUCTION,"ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,60
5,HORTICULTURE,SHORT TERM WOODY CROPS,AREA IN PRODUCTION,OPERATIONS,SHORT TERM WOODY CROPS - OPERATIONS WITH AREA ...,"ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,3
6,HORTICULTURE,CUT CHRISTMAS TREES,AREA IN PRODUCTION,ACRES,CUT CHRISTMAS TREES - ACRES IN PRODUCTION,"ALABAMA, NORTHERN VALLEY, FRANKLIN",TOTAL,(D)
7,HORTICULTURE,CUT CHRISTMAS TREES,AREA IN PRODUCTION,OPERATIONS,CUT CHRISTMAS TREES - OPERATIONS WITH AREA IN ...,"ALABAMA, NORTHERN VALLEY, FRANKLIN",TOTAL,1
8,HORTICULTURE,CUT CHRISTMAS TREES & SHORT TERM WOODY CROPS,SALES,$,CUT CHRISTMAS TREES & SHORT TERM WOODY CROPS -...,"ALABAMA, NORTHERN VALLEY, FRANKLIN",TOTAL,(D)
9,HORTICULTURE,CUT CHRISTMAS TREES & SHORT TERM WOODY CROPS,SALES,OPERATIONS,CUT CHRISTMAS TREES & SHORT TERM WOODY CROPS -...,"ALABAMA, NORTHERN VALLEY, FRANKLIN",TOTAL,1


In [14]:
check_hay_COMMOD_DESC_2002_df = spark.sql("SELECT * FROM only_crops_2002_df WHERE COMMODITY_DESC LIKE '%HAY%'")
check_hay_COMMOD_DESC_2002_df.createOrReplaceTempView("check_hay_COMMOD_DESC_2002_df")
check_hay_COMMOD_DESC_2002_df.limit(100).toPandas().head(100)

Unnamed: 0,GROUP_DESC,COMMODITY_DESC,CLASS_DESC,PRODN_PRACTICE_DESC,UTIL_PRACTICE_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,LOCATION_DESC,DOMAIN_DESC,VALUE
0,FIELD CROPS,HAY & HAYLAGE,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,HAY & HAYLAGE - ACRES HARVESTED,"ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,12055
1,FIELD CROPS,HAY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,HAY - ACRES HARVESTED,"ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,12258
2,FIELD CROPS,HAY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,TONS,"HAY - PRODUCTION, MEASURED IN TONS","ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,(D)
3,FIELD CROPS,HAY,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"HAY, IRRIGATED - ACRES HARVESTED","ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,82
4,FIELD CROPS,HAY,ALFALFA,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"HAY, ALFALFA - ACRES HARVESTED","ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,365
5,FIELD CROPS,HAY,ALFALFA,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,TONS,"HAY, ALFALFA - PRODUCTION, MEASURED IN TONS","ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,860
6,FIELD CROPS,HAY,ALFALFA,IRRIGATED,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"HAY, ALFALFA, IRRIGATED - ACRES HARVESTED","ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,(D)
7,FIELD CROPS,HAY,SMALL GRAIN,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"HAY, SMALL GRAIN - ACRES HARVESTED","ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,453
8,FIELD CROPS,HAY,SMALL GRAIN,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,TONS,"HAY, SMALL GRAIN - PRODUCTION, MEASURED IN TONS","ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,(D)
9,FIELD CROPS,HAY,WILD,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"HAY, WILD - ACRES HARVESTED","ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,1022


In [5]:
check_hay_again_COMMOD_DESC_2002_df = spark.sql("\
        SELECT {} FROM df_2002 WHERE COMMODITY_DESC LIKE 'HAY & HAYLAGE' AND (UNIT_DESC LIKE 'OPERATIONS' \
        OR UNIT_DESC LIKE 'SQ FT' OR UNIT_DESC LIKE 'ACRES') AND PRODN_PRACTICE_DESC = 'ALL PRODUCTION PRACTICES' \
".format(interesting_column_str))
check_hay_again_COMMOD_DESC_2002_df.createOrReplaceTempView("check_hay_again_COMMOD_DESC_2002_df")
check_hay_again_COMMOD_DESC_2002_df.limit(6).toPandas().head(10000)

Unnamed: 0,GROUP_DESC,COMMODITY_DESC,CLASS_DESC,PRODN_PRACTICE_DESC,UTIL_PRACTICE_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,LOCATION_DESC,DOMAIN_DESC,VALUE
0,FIELD CROPS,HAY & HAYLAGE,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,HAY & HAYLAGE - ACRES HARVESTED,"ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,12055
1,FIELD CROPS,HAY & HAYLAGE,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,OPERATIONS,HAY & HAYLAGE - OPERATIONS WITH AREA HARVESTED,"ALABAMA, NORTHERN VALLEY, COLBERT",TOTAL,287
2,FIELD CROPS,HAY & HAYLAGE,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,OPERATIONS,HAY & HAYLAGE - OPERATIONS WITH AREA HARVESTED,"ALABAMA, NORTHERN VALLEY, COLBERT",AREA HARVESTED,122
3,FIELD CROPS,HAY & HAYLAGE,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,OPERATIONS,HAY & HAYLAGE - OPERATIONS WITH AREA HARVESTED,"ALABAMA, NORTHERN VALLEY, COLBERT",AREA HARVESTED,26
4,FIELD CROPS,HAY & HAYLAGE,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,OPERATIONS,HAY & HAYLAGE - OPERATIONS WITH AREA HARVESTED,"ALABAMA, NORTHERN VALLEY, COLBERT",AREA HARVESTED,137
5,FIELD CROPS,HAY & HAYLAGE,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,OPERATIONS,HAY & HAYLAGE - OPERATIONS WITH AREA HARVESTED,"ALABAMA, NORTHERN VALLEY, COLBERT",AREA HARVESTED,2


In [56]:
check_cotton_COMMOD_DESC_2002_df = spark.sql("SELECT * FROM df_2002 WHERE COMMODITY_DESC LIKE '%COTTON%'")
check_cotton_COMMOD_DESC_2002_df.createOrReplaceTempView("check_cotton_COMMOD_DESC_2002_df")
check_cotton_COMMOD_DESC_2002_df.limit(6).toPandas().head(6)

Unnamed: 0,SOURCE_DESC,SECTOR_DESC,GROUP_DESC,COMMODITY_DESC,CLASS_DESC,PRODN_PRACTICE_DESC,UTIL_PRACTICE_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,DOMAIN_DESC,DOMAINCAT_DESC,AGG_LEVEL_DESC,STATE_ANSI,STATE_FIPS_CODE,STATE_ALPHA,STATE_NAME,ASD_CODE,ASD_DESC,COUNTY_ANSI,COUNTY_CODE,COUNTY_NAME,REGION_DESC,ZIP_5,WATERSHED_CODE,WATERSHED_DESC,CONGR_DISTRICT_CODE,COUNTRY_CODE,COUNTRY_NAME,LOCATION_DESC,YEAR,FREQ_DESC,BEGIN_CODE,END_CODE,REFERENCE_PERIOD_DESC,WEEK_ENDING,LOAD_TIME,VALUE,CV_%,__index_level_0__
0,CENSUS,CROPS,FIELD CROPS,COTTON,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,COTTON - ACRES HARVESTED,TOTAL,NOT SPECIFIED,COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,24598,,1
1,CENSUS,CROPS,FIELD CROPS,COTTON,UPLAND,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"COTTON, UPLAND - ACRES HARVESTED",TOTAL,NOT SPECIFIED,COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,24598,,2
2,CENSUS,CROPS,FIELD CROPS,COTTON,UPLAND,IRRIGATED,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"COTTON, UPLAND, IRRIGATED - ACRES HARVESTED",TOTAL,NOT SPECIFIED,COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,(D),,3
3,CENSUS,CROPS,FIELD CROPS,COTTON,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,OPERATIONS,COTTON - OPERATIONS WITH AREA HARVESTED,TOTAL,NOT SPECIFIED,COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,22,,137
4,CENSUS,CROPS,FIELD CROPS,COTTON,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,OPERATIONS,COTTON - OPERATIONS WITH AREA HARVESTED,AREA HARVESTED,AREA HARVESTED: (1.0 TO 24.9 ACRES),COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,3,,138
5,CENSUS,CROPS,FIELD CROPS,COTTON,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,OPERATIONS,COTTON - OPERATIONS WITH AREA HARVESTED,AREA HARVESTED,AREA HARVESTED: (100 TO 249 ACRES),COUNTY,1.0,1.0,AL,ALABAMA,10.0,NORTHERN VALLEY,33.0,33.0,COLBERT,,,0.0,,,9000.0,UNITED STATES,"ALABAMA, NORTHERN VALLEY, COLBERT",2002.0,ANNUAL,0.0,0.0,YEAR,,2012-01-01 00:00:00,2,,139


In [11]:
check_tubers_COMMOD_DESC_2002_df = spark.sql("\
    SELECT {} FROM only_crops_2002_df WHERE COMMODITY_DESC LIKE '%TUBERS%' OR COMMODITY_DESC LIKE 'TARO' \
".format(interesting_columns_fewer_coulumns_str))
check_tubers_COMMOD_DESC_2002_df.createOrReplaceTempView("check_tubers_COMMOD_DESC_2002_df")
check_tubers_COMMOD_DESC_2002_df.toPandas().head(10000)

Unnamed: 0,GROUP_DESC,COMMODITY_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,LOCATION_DESC,DOMAIN_DESC,VALUE
0,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,OPERATIONS,"BULBS & CORMS & RHIZOMES & TUBERS, DRY - OPERA...","ALABAMA, MOUNTAINS & EASTERN VALLEY, BLOUNT",TOTAL,1
1,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,ACRES,"BULBS & CORMS & RHIZOMES & TUBERS, DRY, IN THE...","ALABAMA, MOUNTAINS & EASTERN VALLEY, BLOUNT",TOTAL,(D)
2,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,OPERATIONS,"BULBS & CORMS & RHIZOMES & TUBERS, DRY - OPERA...","ALABAMA, MOUNTAINS & EASTERN VALLEY, CALHOUN",TOTAL,2
3,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,ACRES,"BULBS & CORMS & RHIZOMES & TUBERS, DRY, IN THE...","ALABAMA, MOUNTAINS & EASTERN VALLEY, CALHOUN",TOTAL,(D)
4,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,OPERATIONS,"BULBS & CORMS & RHIZOMES & TUBERS, DRY - OPERA...","ALABAMA, MOUNTAINS & EASTERN VALLEY, CHEROKEE",TOTAL,1
5,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,SQ FT,"BULBS & CORMS & RHIZOMES & TUBERS, DRY, UNDER ...","ALABAMA, MOUNTAINS & EASTERN VALLEY, CHEROKEE",TOTAL,(D)
6,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,OPERATIONS,"BULBS & CORMS & RHIZOMES & TUBERS, DRY - OPERA...","ALABAMA, MOUNTAINS & EASTERN VALLEY, CULLMAN",TOTAL,1
7,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,SQ FT,"BULBS & CORMS & RHIZOMES & TUBERS, DRY, UNDER ...","ALABAMA, MOUNTAINS & EASTERN VALLEY, CULLMAN",TOTAL,(D)
8,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,OPERATIONS,"BULBS & CORMS & RHIZOMES & TUBERS, DRY - OPERA...","ALABAMA, MOUNTAINS & EASTERN VALLEY, JACKSON",TOTAL,1
9,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,ACRES,"BULBS & CORMS & RHIZOMES & TUBERS, DRY, IN THE...","ALABAMA, MOUNTAINS & EASTERN VALLEY, JACKSON",TOTAL,(D)


In [17]:
check_ginger_corms_COMMOD_DESC_2002_df = spark.sql("\
    SELECT {} FROM only_crops_2002_df WHERE COMMODITY_DESC LIKE '%TUBERS%' OR COMMODITY_DESC LIKE 'GINGER ROOT' \
".format(interesting_columns_fewer_coulumns_str))
check_ginger_corms_COMMOD_DESC_2002_df.createOrReplaceTempView("check_ginger_corms_COMMOD_DESC_2002_df")
check_ginger_corms_COMMOD_DESC_2002_df.toPandas().head(10000)

Unnamed: 0,GROUP_DESC,COMMODITY_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,LOCATION_DESC,DOMAIN_DESC,VALUE
0,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,OPERATIONS,"BULBS & CORMS & RHIZOMES & TUBERS, DRY - OPERA...","ALABAMA, MOUNTAINS & EASTERN VALLEY, BLOUNT",TOTAL,1
1,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,ACRES,"BULBS & CORMS & RHIZOMES & TUBERS, DRY, IN THE...","ALABAMA, MOUNTAINS & EASTERN VALLEY, BLOUNT",TOTAL,(D)
2,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,OPERATIONS,"BULBS & CORMS & RHIZOMES & TUBERS, DRY - OPERA...","ALABAMA, MOUNTAINS & EASTERN VALLEY, CALHOUN",TOTAL,2
3,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,ACRES,"BULBS & CORMS & RHIZOMES & TUBERS, DRY, IN THE...","ALABAMA, MOUNTAINS & EASTERN VALLEY, CALHOUN",TOTAL,(D)
4,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,OPERATIONS,"BULBS & CORMS & RHIZOMES & TUBERS, DRY - OPERA...","ALABAMA, MOUNTAINS & EASTERN VALLEY, CHEROKEE",TOTAL,1
5,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,SQ FT,"BULBS & CORMS & RHIZOMES & TUBERS, DRY, UNDER ...","ALABAMA, MOUNTAINS & EASTERN VALLEY, CHEROKEE",TOTAL,(D)
6,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,OPERATIONS,"BULBS & CORMS & RHIZOMES & TUBERS, DRY - OPERA...","ALABAMA, MOUNTAINS & EASTERN VALLEY, CULLMAN",TOTAL,1
7,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,SQ FT,"BULBS & CORMS & RHIZOMES & TUBERS, DRY, UNDER ...","ALABAMA, MOUNTAINS & EASTERN VALLEY, CULLMAN",TOTAL,(D)
8,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,OPERATIONS,"BULBS & CORMS & RHIZOMES & TUBERS, DRY - OPERA...","ALABAMA, MOUNTAINS & EASTERN VALLEY, JACKSON",TOTAL,1
9,HORTICULTURE,BULBS & CORMS & RHIZOMES & TUBERS,AREA IN PRODUCTION,ACRES,"BULBS & CORMS & RHIZOMES & TUBERS, DRY, IN THE...","ALABAMA, MOUNTAINS & EASTERN VALLEY, JACKSON",TOTAL,(D)


In [14]:
check_ginger_COMMOD_DESC_2002_df = spark.sql("\
    SELECT {} FROM only_crops_2002_df WHERE COMMODITY_DESC LIKE 'GINGER ROOT' \
".format(interesting_columns_fewer_coulumns_str))
check_ginger_COMMOD_DESC_2002_df.createOrReplaceTempView("check_ginger_COMMOD_DESC_2002_df")
check_ginger_COMMOD_DESC_2002_df.toPandas().head(10000)

Unnamed: 0,GROUP_DESC,COMMODITY_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,LOCATION_DESC,DOMAIN_DESC,VALUE
0,VEGETABLES,GINGER ROOT,AREA HARVESTED,ACRES,GINGER ROOT - ACRES HARVESTED,"HAWAII, HAWAII ISLAND, HAWAII",TOTAL,165
1,VEGETABLES,GINGER ROOT,PRODUCTION,LB,"GINGER ROOT - PRODUCTION, MEASURED IN LB","HAWAII, HAWAII ISLAND, HAWAII",TOTAL,4848145
2,VEGETABLES,GINGER ROOT,AREA HARVESTED,OPERATIONS,GINGER ROOT - OPERATIONS WITH AREA HARVESTED,"HAWAII, HAWAII ISLAND, HAWAII",TOTAL,59
3,VEGETABLES,GINGER ROOT,AREA HARVESTED,ACRES,"GINGER ROOT, IRRIGATED - ACRES HARVESTED","HAWAII, HAWAII ISLAND, HAWAII",TOTAL,19
4,VEGETABLES,GINGER ROOT,AREA HARVESTED,OPERATIONS,"GINGER ROOT, IRRIGATED - OPERATIONS WITH AREA ...","HAWAII, HAWAII ISLAND, HAWAII",TOTAL,5
5,VEGETABLES,GINGER ROOT,AREA HARVESTED,ACRES,GINGER ROOT - ACRES HARVESTED,"HAWAII, KAUAI AND NIIHAU ISLANDS, KAUAI",TOTAL,11
6,VEGETABLES,GINGER ROOT,PRODUCTION,LB,"GINGER ROOT - PRODUCTION, MEASURED IN LB","HAWAII, KAUAI AND NIIHAU ISLANDS, KAUAI",TOTAL,(D)
7,VEGETABLES,GINGER ROOT,AREA HARVESTED,OPERATIONS,GINGER ROOT - OPERATIONS WITH AREA HARVESTED,"HAWAII, KAUAI AND NIIHAU ISLANDS, KAUAI",TOTAL,8
8,VEGETABLES,GINGER ROOT,AREA HARVESTED,ACRES,"GINGER ROOT, IRRIGATED - ACRES HARVESTED","HAWAII, KAUAI AND NIIHAU ISLANDS, KAUAI",TOTAL,10
9,VEGETABLES,GINGER ROOT,AREA HARVESTED,OPERATIONS,"GINGER ROOT, IRRIGATED - OPERATIONS WITH AREA ...","HAWAII, KAUAI AND NIIHAU ISLANDS, KAUAI",TOTAL,7


In [10]:
check_ginger2_COMMOD_DESC_2002_df = spark.sql("\
    SELECT {} DOMAIN_DESC FROM only_crops_2002_df WHERE COMMODITY_DESC LIKE 'GINGER ROOT' \
".format(interesting_column_str))
check_ginger2_COMMOD_DESC_2002_df.createOrReplaceTempView("check_ginger2_COMMOD_DESC_2002_df")
check_ginger2_COMMOD_DESC_2002_df.toPandas().head(10000)

Unnamed: 0,GROUP_DESC,COMMODITY_DESC,CLASS_DESC,PRODN_PRACTICE_DESC,UTIL_PRACTICE_DESC,STATISTICCAT_DESC,UNIT_DESC,SHORT_DESC,LOCATION_DESC,DOMAIN_DESC,DOMAIN_DESC.1
0,VEGETABLES,GINGER ROOT,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,GINGER ROOT - ACRES HARVESTED,"HAWAII, HAWAII ISLAND, HAWAII",TOTAL,165
1,VEGETABLES,GINGER ROOT,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,LB,"GINGER ROOT - PRODUCTION, MEASURED IN LB","HAWAII, HAWAII ISLAND, HAWAII",TOTAL,4848145
2,VEGETABLES,GINGER ROOT,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,OPERATIONS,GINGER ROOT - OPERATIONS WITH AREA HARVESTED,"HAWAII, HAWAII ISLAND, HAWAII",TOTAL,59
3,VEGETABLES,GINGER ROOT,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"GINGER ROOT, IRRIGATED - ACRES HARVESTED","HAWAII, HAWAII ISLAND, HAWAII",TOTAL,19
4,VEGETABLES,GINGER ROOT,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA HARVESTED,OPERATIONS,"GINGER ROOT, IRRIGATED - OPERATIONS WITH AREA ...","HAWAII, HAWAII ISLAND, HAWAII",TOTAL,5
5,VEGETABLES,GINGER ROOT,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,GINGER ROOT - ACRES HARVESTED,"HAWAII, KAUAI AND NIIHAU ISLANDS, KAUAI",TOTAL,11
6,VEGETABLES,GINGER ROOT,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,LB,"GINGER ROOT - PRODUCTION, MEASURED IN LB","HAWAII, KAUAI AND NIIHAU ISLANDS, KAUAI",TOTAL,(D)
7,VEGETABLES,GINGER ROOT,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,OPERATIONS,GINGER ROOT - OPERATIONS WITH AREA HARVESTED,"HAWAII, KAUAI AND NIIHAU ISLANDS, KAUAI",TOTAL,8
8,VEGETABLES,GINGER ROOT,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"GINGER ROOT, IRRIGATED - ACRES HARVESTED","HAWAII, KAUAI AND NIIHAU ISLANDS, KAUAI",TOTAL,10
9,VEGETABLES,GINGER ROOT,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA HARVESTED,OPERATIONS,"GINGER ROOT, IRRIGATED - OPERATIONS WITH AREA ...","HAWAII, KAUAI AND NIIHAU ISLANDS, KAUAI",TOTAL,7


In [None]:
to_check_list =  ['FIELD CROPS, OTHER','GRASSES & LEGUMES, OTHER', 'FRUIT & TREE NUTS, OTHER', 'SHORT TERM WOODY CROPS', 'CUT CHRISTMAS TREES & SHORT TERM WOODY CROPS', ]

In [8]:
distinct_production_df = spark.sql('SELECT DISTINCT PRODN_PRACTICE_DESC from only_crops_2002_df')
distinct_production_df.show(250, False)

+-------------------------------------------------------+
|PRODN_PRACTICE_DESC                                    |
+-------------------------------------------------------+
|IRRIGATED, ENTIRE CROP                                 |
|IRRIGATED                                              |
|IN THE OPEN, IRRIGATED, PART OF CROP, IRRIGATED PORTION|
|UNDER PROTECTION                                       |
|IN THE OPEN                                            |
|IRRIGATED, PART OF CROP, IRRIGATED PORTION             |
|IRRIGATED, PART OF CROP                                |
|IN THE OPEN, IRRIGATED                                 |
|IN THE OPEN, IRRIGATED, PART OF CROP                   |
|PRODUCTION CONTRACT                                    |
|ALL PRODUCTION PRACTICES                               |
|IRRIGATED, NONE OF CROP                                |
|IN THE OPEN, IRRIGATED, NONE OF CROP                   |
|IN THE OPEN, IRRIGATED, ENTIRE CROP                    |
+-------------

In [7]:
distinct_statisticcat_df = spark.sql('select distinct STATISTICCAT_DESC from only_crops_2002_df')
distinct_statisticcat_df.show(250, False)

+--------------------------+
|STATISTICCAT_DESC         |
+--------------------------+
|PRODUCTION                |
|TAPS                      |
|AREA BEARING              |
|SALES                     |
|AREA NON-BEARING          |
|AREA IN PRODUCTION        |
|AREA GROWN                |
|CAPACITY                  |
|YIELD                     |
|AREA HARVESTED            |
|AREA BEARING & NON-BEARING|
|AREA NOT HARVESTED        |
+--------------------------+



In [15]:
distinct_commodity_df = spark.sql("SELECT DISTINCT COMMODITY_DESC FROM only_crops_2002_df")
#distinct_commodity_df.take(50)
distinct_commodity_df.show(250, False)                                 

+--------------------------------------------+
|COMMODITY_DESC                              |
+--------------------------------------------+
|TANGELOS                                    |
|OATS                                        |
|TEMPLES                                     |
|PEAS                                        |
|PASSION FRUIT                               |
|CRAMBE                                      |
|SORGHUM                                     |
|RYE                                         |
|AMARANTH                                    |
|FIELD CROPS, OTHER                          |
|TARO                                        |
|TANGERINES                                  |
|ARTICHOKES                                  |
|PAPAYAS                                     |
|CUCUMBERS                                   |
|BOYSENBERRIES                               |
|POMEGRANATES                                |
|PEACHES                                     |
|CARROTS     

In [9]:
distinct_units_df = spark.sql('select distinct UNIT_DESC from only_crops_2002_df')
distinct_units_df.show(250, False)

+-----------------+
|UNIT_DESC        |
+-----------------+
|CWT              |
|TONS / ACRE      |
|PCT OF FARM SALES|
|CWT / ACRE       |
|BU               |
|GALLONS          |
|OPERATIONS       |
|LB               |
|BU / OPERATION   |
|TREES            |
|TONS             |
|BALES / ACRE     |
|SQ FT            |
|LB / ACRE        |
|TONS, DRY BASIS  |
|BALES            |
|BU / ACRE        |
|ACRES            |
|NUMBER           |
|$                |
+-----------------+

