## Timothy Miller
## GTECH 73100, Dr. Sun

# [Assignment Two](https://github.com/TangoYankee/gtech_731-geocomp-hw/tree/main/assignment-two)
Further basics

### Import modules

In [4]:
import json
import io
import time

### Task 1
Read in a data file of all counties in the US.  
Make a list of unique county names

In [5]:
# Data Source https://eric.clst.org/assets/wiki/uploads/Stuff/gz_2010_us_050_00_5m.json
# address utf-8 error https://stackoverflow.com/questions/30996289/utf8-codec-cant-decode-byte-0xf3
# Use "with ... as ..." to better handle exceptions
with io.open('data/gz_2010_us_050_00_5m.json', encoding='latin-1') as f:
  data = json.load(f)

print('Number of Counties in US: {}'.format(len(data['features'])))

print(data['features'][1]['properties']['NAME'])

features = data['features']

unique_county_names = set()
for feature in features:
  properties = feature['properties']
  county_name = properties["NAME"]
  unique_county_names.add(county_name)

unique_county_names = list(unique_county_names)
print(f"unique {type(unique_county_names)} of county names: {unique_county_names}")


Number of Counties in US: 3221
Wade Hampton
unique <class 'list'> of county names: ['Dinwiddie', 'Warren', 'Yell', 'Kittson', 'Seneca', 'Idaho', 'Sumner', 'Camas', 'Bronx', 'Morrow', 'Harding', 'Hardy', 'Tulare', 'Atkinson', 'Salinas', 'Dimmit', 'Leflore', 'McCracken', 'Pottawattamie', 'Billings', "Queen Anne's", 'St. James', 'Atlantic', 'Bryan', 'Pitkin', 'Wallace', 'Sauk', 'Chowan', 'Durham', 'LaSalle', 'Garden', 'Allamakee', 'Tulsa', 'Sanpete', 'Florence', 'Waukesha', 'Worth', 'Bossier', 'McCone', 'Mellette', 'Ionia', 'Chester', 'Blount', 'Titus', 'Outagamie', 'Niobrara', 'McDonald', 'Alcona', 'Lexington', 'Carolina', 'Klickitat', 'Cullman', 'Hopkins', 'Hanson', 'Wichita', 'Red River', 'Santa Rosa', 'Hardee', 'Cheatham', 'Cook', 'Codington', 'Caswell', 'Story', 'Kosciusko', 'Kenton', 'Beaverhead', 'Schenectady', 'Socorro', 'Dawson', 'Divide', 'Kiowa', 'Pasquotank', 'Contra Costa', 'Foster', 'Beltrami', 'Lassen', 'De Soto', 'Cimarron', 'Glacier', 'Bell', 'Nance', 'Clallam', 'Republic

### Task 2

Derive the numbers of counties that use these three names, respectively. For each of them, list their county name and state code.

Hint: Think about what data structure would be best for this task. Once again, do not use special functions or packages for this task. Just use basic Python data structures and loops.

In [6]:
def get_county_states(features):
  """Format the counties to easily list their states

  Arguments:
  features list[dict] -- county data

  Returns:
  dict[str, list[str]] -- County name as the key and the list of state codes as the value
  """
  county_states = dict()
  for feature in features:
    properties = feature['properties']
    county_name = properties['NAME']
    state_code = properties['STATE']
    county_states.setdefault(county_name, []).append(state_code)

  return county_states


def test_get_county_states():
  mock_features = [{
    'type': 'Feature',
    "properties": {
      'GEO_ID': '0500000US01087',
      'STATE': '01',
      'COUNTY': '087',
      'NAME': 'Macon',
      'LSAD': 'County',
      'CENSUSAREA': 608.885
    },
    "geometry": None,
  }, {
    'type': 'Feature',
    "properties": {
      'GEO_ID': '0500000US02275',
      'STATE': '02',
      'COUNTY': '275',
      'NAME': 'Wrangell',
      'LSAD': 'Cty&Bor',
      'CENSUSAREA': 2541.483
    },
    "geometry": None,
  }, {
    'type': 'Feature',
      'properties': {
      'GEO_ID': '0500000US02270',
      'STATE': '02',
      'COUNTY': '270',
      'NAME': 'Wade Hampton',
      'LSAD': 'CA',
      'CENSUSAREA': 17081.433
    },
    "geometry": None,
  }, {
    'type': 'Feature',
      'properties': {
      'GEO_ID': '',
      'STATE': '03',
      'COUNTY': '',
      'NAME': 'Wade Hampton',
      'LSAD': '',
      'CENSUSAREA': 0
    },
    "geometry": None,
  }]

  expected = {
    "Macon": ["01"],
    "Wrangell": ["02"],
    "Wade Hampton": ["02", "03"]
  }

  obs = get_county_states(mock_features)
  assert(obs == expected)

test_get_county_states()

county_states = get_county_states(features)
def get_county_totals(county_states):
  """Reformat the counties and state list objects into a tuple of counties and their totals

  Arguments:
  county_states dict[str, list[str]] -- County name as the key and the list of state codes as the value

  Returns:
  tuple[str, int] -- The counties and the total number of states that use them.
  """
  county_totals = [(None, None)] * len(county_states) 
  i = 0
  for county, states in county_states.items():
    county_total = (county, len(states))
    county_totals[i] = county_total
    i+=1

  return county_totals


def test_get_county_totals():
  mock_county_states = {
    "A": ['0'],
    "B": ['0', '1'],
    "C": ['0', '1', '2']
  }
  expected_count_totals = [('A', 1), ('B', 2), ('C', 3)]
  assert(get_county_totals(mock_county_states) == expected_count_totals)

test_get_county_totals()

def top_k_sort_k(totals, k):
  """Find the top k values in a tuple of objects and their counts.
  
  Arguments:
  totals tuple[str, int]-- Item Name and total 
  k int -- the number of items to rank

  Returns:
  tuple[str, int] -- Top k items and their counts
  
  Note:
  This function iterates through the list of items, only sorting the list of rankings
  """
  top_k = [(None, 0)]*k
  for total in totals:
    total_val = total[1]
    bottom_k_val = top_k[0][1]
    if total_val > bottom_k_val:
      top_k[0] = total
      top_k.sort(key=lambda a: a[1])

  return top_k

def test_top_k_sort_k():
  mock_county_totals = [("A", 0), ("B", 1), ("C", 2), ("D", 3), ("E", 4), ("F", 4)]
  expected_top_k = [("D", 3), ("F", 4), ("E", 4)]
  
  assert(top_k_sort_k(mock_county_totals, 3) == expected_top_k)

test_top_k_sort_k()

def top_k_sort_all(totals, k):
  """Find the top k values in a tuple of objects and their counts.
    
    Arguments:
    totals tuple[str, int]-- Item Name and total 
    k int -- the number of items to rank

    Returns:
    tuple[str, int] -- Top k items and their counts
    
    Note:
    This function sorts the whole list and then takes the top k results
  """
  totals.sort(key = lambda a: a[1])
  return totals[-k:]

def test_top_k_sort_all():
  mock_county_totals = [("A", 0), ("B", 1), ("C", 2), ("D", 3), ("E", 4), ("F", 4)]
  expected_top_k = [("D", 3), ("E", 4), ("F", 4)]
  assert(top_k_sort_all(mock_county_totals, 3) == expected_top_k)

test_top_k_sort_all()

county_totals = get_county_totals(county_states)
"""
Compare the two algorithims to determine which has better performance.

Findings: sorting the rankings list is generally faster when k is less than 25.
Sorting the whole list of counties and taking the top is generally faster when k is more than 25.
"""
k = 3 
start_time = time.perf_counter()
top_counties = top_k_sort_k(county_totals, k)
stop_time = time.perf_counter()
print(f"top counties: {top_counties}")
print(f"top counter time: {stop_time - start_time}")

start_time = time.perf_counter()
top_counties_alt = top_k_sort_all(county_totals, k)
stop_time = time.perf_counter()
print(f"top counties alt: {top_counties_alt}")
print(f"top counter alt time: {stop_time - start_time}")


"""
Display the top counties and their states
"""
for top_county in top_counties:
  name = top_county[0]
  print(f"{name} county appears in state codes: {county_states[name]}")

top counties: [('Jefferson', 26), ('Franklin', 26), ('Washington', 31)]
top counter time: 0.0001097899985325057
top counties alt: [('Franklin', 26), ('Jefferson', 26), ('Washington', 31)]
top counter alt time: 0.00012915000115754083
Jefferson county appears in state codes: ['17', '13', '19', '41', '08', '05', '12', '20', '21', '01', '22', '28', '16', '18', '29', '47', '48', '36', '40', '31', '30', '42', '39', '54', '55', '53']
Franklin county appears in state codes: ['13', '01', '18', '16', '21', '19', '28', '36', '48', '05', '12', '22', '23', '25', '17', '20', '29', '47', '31', '37', '42', '39', '51', '51', '53', '50']
Washington county appears in state codes: ['08', '12', '05', '17', '19', '23', '24', '29', '47', '40', '41', '55', '49', '50', '20', '18', '21', '27', '13', '01', '22', '16', '36', '37', '31', '28', '42', '39', '44', '51', '48']


### Task 3
Basic statistics by state


### Task 3, part one
The number of counties in each state

In [8]:
def get_state_counties_total(features):
  """Find the total number of counties in each state

  Arguments:
  features list[dict] -- each county has a set of properties, state code is most relevant

  Returns:
  dict -- key is state code and value is the total number of counties 
  """
  totals = {}
  for feature in features:
    properties = feature['properties']
    state_code = properties['STATE']
    totals[state_code] = totals.get(state_code, 0) + 1

  return totals


def test_get_state_counties_total():
  mock_features = [{
    'type': 'Feature',
    "properties": {
      'GEO_ID': '0500000US01087',
      'STATE': '01',
      'COUNTY': '087',
      'NAME': 'Macon',
      'LSAD': 'County',
      'CENSUSAREA': 608.885
    },
    "geometry": None,
  }, {
    'type': 'Feature',
    "properties": {
      'GEO_ID': '0500000US02275',
      'STATE': '02',
      'COUNTY': '275',
      'NAME': 'Wrangell',
      'LSAD': 'Cty&Bor',
      'CENSUSAREA': 2541.483
    },
    "geometry": None,
  }, {
    'type': 'Feature',
      'properties': {
      'GEO_ID': '0500000US02270',
      'STATE': '02',
      'COUNTY': '270',
      'NAME': 'Wade Hampton',
      'LSAD': 'CA',
      'CENSUSAREA': 17081.433
    },
    "geometry": None,
  }] 

  expected_totals = {
    '01': 1,
    '02': 2
  }

  assert(get_state_counties_total(mock_features) == expected_totals)

test_get_state_counties_total()
state_counties_totals = get_state_counties_total(features)
print(f"List of {len(state_counties_totals)} states' county totals: {state_counties_totals}")


List of 52 states' county totals: {'02': 29, '04': 15, '05': 75, '08': 64, '09': 8, '12': 67, '13': 159, '06': 58, '01': 67, '17': 102, '18': 92, '19': 99, '15': 5, '16': 44, '20': 105, '21': 120, '22': 64, '26': 83, '27': 87, '28': 82, '23': 16, '24': 24, '25': 14, '30': 56, '31': 93, '32': 17, '33': 10, '34': 21, '35': 33, '29': 115, '37': 100, '38': 53, '39': 88, '40': 77, '36': 62, '45': 46, '46': 66, '47': 95, '41': 36, '42': 67, '48': 254, '49': 29, '53': 39, '54': 55, '55': 72, '56': 23, '72': 78, '50': 14, '51': 134, '10': 3, '11': 1, '44': 5}


### Task 3, part two
Name and size of the biggest and smallest county in each state, by area

In [9]:
def get_state_county_min_max_area(features):
  """Find the name and size of the biggest and smallest county in each state

  Arguments:
  features list[dict] -- county object with type, properties, and geometry

  Returns:
  dict[str, dict] -- county data for largest and smallest counties
  """
  state_county_min_max_area = {}

  for feature in features:
    properties = feature['properties']
    state_code = properties['STATE']
    county_name = properties['NAME']
    county_area = properties['CENSUSAREA']

    county = {
      "name": county_name,
      "area": county_area
    }

    if state_code in state_county_min_max_area:
      state = state_county_min_max_area[state_code]

      largest_county_area = state['largest_county']['area']
      if county_area > largest_county_area:
        state['largest_county'] = county

      smallest_county_area = state['smallest_county']['area']
      if county_area < smallest_county_area:
        state['smallest_county'] = county
    else:
      state = {
        "largest_county": county,
        "smallest_county": county
      }

      state_county_min_max_area[state_code] = state

  return state_county_min_max_area

def test_get_state_county_min_max_area():
  mock_features = [{
    'type': 'Feature',
    "properties": {
      'GEO_ID': '0500000US01087',
      'STATE': '01',
      'COUNTY': '087',
      'NAME': 'Macon',
      'LSAD': 'County',
      'CENSUSAREA': 608.885
    },
    "geometry": None,
  }, {
    'type': 'Feature',
    "properties": {
      'GEO_ID': '0500000US02275',
      'STATE': '02',
      'COUNTY': '275',
      'NAME': 'Wrangell',
      'LSAD': 'Cty&Bor',
      'CENSUSAREA': 2541.483
    },
    "geometry": None,
  }, {
    'type': 'Feature',
      'properties': {
      'GEO_ID': '0500000US02270',
      'STATE': '02',
      'COUNTY': '270',
      'NAME': 'Wade Hampton',
      'LSAD': 'CA',
      'CENSUSAREA': 17081.433
    },
    "geometry": None,
  }]

  expected = {
    '01': {
      'largest_county': {
        'name': "Macon",
        "area": 608.885
      },
      "smallest_county": {
        'name': "Macon",
        "area": 608.885
      }
    },
    '02': {
      "largest_county": {
        "name": "Wade Hampton",
        "area": 17081.433
      },
      "smallest_county": {
        "name": "Wrangell",
        "area": 2541.483
      }
    }
  }

  assert(get_state_county_min_max_area(mock_features) == expected)

test_get_state_county_min_max_area()
state_county_min_max_area = get_state_county_min_max_area(features)
print(f"min and max counties by area in each state: {state_county_min_max_area}")

min and max counties by area in each state: {'02': {'largest_county': {'name': 'Yukon-Koyukuk', 'area': 145504.789}, 'smallest_county': {'name': 'Skagway', 'area': 452.325}}, '04': {'largest_county': {'name': 'Coconino', 'area': 18618.885}, 'smallest_county': {'name': 'Santa Cruz', 'area': 1236.916}}, '05': {'largest_county': {'name': 'Union', 'area': 1039.214}, 'smallest_county': {'name': 'Lafayette', 'area': 528.268}}, '08': {'largest_county': {'name': 'Las Animas', 'area': 4772.672}, 'smallest_county': {'name': 'Broomfield', 'area': 33.034}}, '09': {'largest_county': {'name': 'Litchfield', 'area': 920.56}, 'smallest_county': {'name': 'Middlesex', 'area': 369.301}}, '12': {'largest_county': {'name': 'Collier', 'area': 1998.324}, 'smallest_county': {'name': 'Union', 'area': 243.556}}, '13': {'largest_county': {'name': 'Ware', 'area': 892.461}, 'smallest_county': {'name': 'Clarke', 'area': 119.2}}, '06': {'largest_county': {'name': 'San Bernardino', 'area': 20056.938}, 'smallest_county

### Task 3, part three
The total and average area of counties in each state

In [10]:
def get_state_total_avg_area_county(features):
  """ The total and average area of counties in each state

  Arguments:
  features list[dict] -- county data

  Returns:
  dict[str, dict] -- total and avg area for counties in the state
  """
  state_total_avg_area_county = {}
  for feature in features:
    properties = feature["properties"]
    state_code = properties['STATE']
    county_area = properties['CENSUSAREA']

    if state_code in state_total_avg_area_county:
      state = state_total_avg_area_county[state_code]
      state['county_total_area'] = state['county_total_area'] + county_area
      state['county_count'] = state['county_count'] + 1
      state['county_avg_area'] = state['county_total_area'] / state['county_count']
    else:
      state = {
        "county_total_area": county_area,
        "county_avg_area": county_area,
        "county_count": 1
      }
      state_total_avg_area_county[state_code] = state

  return state_total_avg_area_county

def test_get_state_total_avg_area_county():
  mock_features = [{
    'type': 'Feature',
    "properties": {
      'GEO_ID': '0500000US01087',
      'STATE': '01',
      'COUNTY': '087',
      'NAME': 'Macon',
      'LSAD': 'County',
      'CENSUSAREA': 608.885
    },
    "geometry": None,
  }, {
    'type': 'Feature',
    "properties": {
      'GEO_ID': '0500000US02275',
      'STATE': '02',
      'COUNTY': '275',
      'NAME': 'Wrangell',
      'LSAD': 'Cty&Bor',
      'CENSUSAREA': 2541.483
    },
    "geometry": None,
  }, {
    'type': 'Feature',
      'properties': {
      'GEO_ID': '0500000US02270',
      'STATE': '02',
      'COUNTY': '270',
      'NAME': 'Wade Hampton',
      'LSAD': 'CA',
      'CENSUSAREA': 17081.433
    },
    "geometry": None,
  }]

  expected = {
    '01': {
      "county_total_area": 608.885,
      "county_avg_area": 608.885,
      "county_count": 1
    }, 
    '02': {
      "county_total_area": 19622.916,
      "county_avg_area": 9811.458,
      "county_count": 2
    }
  }

  assert(get_state_total_avg_area_county(mock_features) == expected)

test_get_state_total_avg_area_county()
state_total_avg_area_county = get_state_total_avg_area_county(features)
print(f"County areas for states: {state_total_avg_area_county}")

County areas for states: {'02': {'county_total_area': 570640.9510000001, 'county_avg_area': 19677.274172413796, 'county_count': 29}, '04': {'county_total_area': 113594.085, 'county_avg_area': 7572.939, 'county_count': 15}, '05': {'county_total_area': 52035.477999999974, 'county_avg_area': 693.806373333333, 'county_count': 75}, '08': {'county_total_area': 103641.88800000002, 'county_avg_area': 1619.4045000000003, 'county_count': 64}, '09': {'county_total_area': 4842.356, 'county_avg_area': 605.2945, 'county_count': 8}, '12': {'county_total_area': 53624.759, 'county_avg_area': 800.3695373134328, 'county_count': 67}, '13': {'county_total_area': 57513.48900000002, 'county_avg_area': 361.72005660377374, 'county_count': 159}, '06': {'county_total_area': 155779.21800000002, 'county_avg_area': 2685.848586206897, 'county_count': 58}, '01': {'county_total_area': 50645.325000000004, 'county_avg_area': 755.9003731343284, 'county_count': 67}, '17': {'county_total_area': 55518.92600000002, 'county_a