## Timothy Miller
## GTECH 73100, Dr. Sun

# [Assignment Eight](https://github.com/TangoYankee/gtech_731-geocomp-hw/tree/main/assignment-eight)
States and counties in geopandas

### Import modules

In [277]:
import pandas as pd
import geopandas as gpd
import json
import io

### Translate encoding of json file
The original "data/gz_2010_us_050_00_20m.json" file appears to be encoded in 'latin-1', which causes errors when attempting to read it directly into geopandas.
This is resolved by first reading the file in as JSON with that encoding and then re-exporting it as JSON, changing the file extension to .geojson.

In [278]:
# Decode the county file explicitly as latin-1 before parsing it as JSON.
with open("data/gz_2010_us_050_00_20m.json", encoding="latin-1") as latin1_source:
    data = json.loads(latin1_source.read())

# Serialise the parsed content again under a .geojson name for later reading.
with open("data/gz_2010_us_050_00_20m.geojson", "w") as geojson_target:
    geojson_target.write(json.dumps(data))

### Read files 

In [279]:
# County boundaries, read from the .geojson copy of the county file.
county_data = gpd.read_file("data/gz_2010_us_050_00_20m.geojson")

# Lookup tables stored as JSON objects. JSON text is UTF-8 by specification, so
# the encoding is stated explicitly instead of relying on the platform default.
with open("data/fipsToState.json", encoding="utf-8") as f:
    fips_to_state = json.load(f)

with open("data/stateCodeToFips.json", encoding="utf-8") as f:
    state_code_to_fips = json.load(f)

In [280]:
def _lookup_to_frame(lookup, key_column, value_column):
    """Return a two-column DataFrame holding the keys and values of *lookup*."""
    return pd.DataFrame({key_column: lookup.keys(), value_column: lookup.values()})


# The key column is called "STATE" so that it shares a column name with the
# county data and can be merged against it.
fips_to_state_df = _lookup_to_frame(fips_to_state, "STATE", "STATE_NAME")

state_code_to_fips_df = _lookup_to_frame(state_code_to_fips, "state_code", "fips")

### Task 1
Find the top n most common county names

In [281]:
def get_most_common_county_names(n, counties=None):
    """Return the ``n`` most frequent values of the "NAME" column with counts.

    Parameters
    ----------
    n : int
        Number of names to return.
    counties : DataFrame, optional
        Table with a "NAME" column. Defaults to the module-level
        ``county_data`` so existing one-argument calls keep working.

    Returns
    -------
    pandas.Series
        Occurrence counts indexed by name, largest count first.
    """
    if counties is None:
        counties = county_data
    return counties.groupby("NAME").size().nlargest(n)

In [282]:
# Show the five most frequent county names together with their counts.
five_most_common_names = get_most_common_county_names(5)
print(five_most_common_names)

NAME
Washington    31
Franklin      26
Jefferson     26
Jackson       24
Lincoln       24
dtype: int64


### Task 3
Join fips code to get full names of states using merge  
(Doing before task two, in order to format task two with state names)

In [283]:
# Attach the full state name to every county. The join key and cardinality are
# spelled out so the merge cannot silently pick up another shared column, and
# it fails loudly if the lookup ever holds a repeated "STATE" value (which
# would duplicate counties).
state_county_data = county_data.merge(
    fips_to_state_df, on="STATE", how="inner", validate="many_to_one"
)
state_county_data.head()

Unnamed: 0,GEO_ID,STATE,COUNTY,NAME,LSAD,CENSUSAREA,geometry,STATE_NAME
0,0500000US01001,1,1,Autauga,County,594.436,"POLYGON ((-86.49677 32.34444, -86.71790 32.402...",Alabama
1,0500000US01009,1,9,Blount,County,644.776,"POLYGON ((-86.57780 33.76532, -86.75914 33.840...",Alabama
2,0500000US01017,1,17,Chambers,County,596.531,"POLYGON ((-85.18413 32.87053, -85.12342 32.772...",Alabama
3,0500000US01021,1,21,Chilton,County,692.854,"POLYGON ((-86.51734 33.02057, -86.51596 32.929...",Alabama
4,0500000US01033,1,33,Colbert,County,592.619,"POLYGON ((-88.13999 34.58170, -88.13925 34.587...",Alabama


### Task 2
Summary statistics for states
- number of counties
- min and max area of counties within state

In [284]:
# Group the counties by full state name; reused by the summaries below.
state_groups = state_county_data.groupby(by="STATE_NAME")

#### Task 2, Part A
Number of counties (output limited to five largest counts)

In [285]:
# Count the rows in each state group, then keep the five largest counts.
counties_per_state = state_groups["NAME"].size()
counties_per_state.nlargest(5)

STATE_NAME
Texas       254
Geogia      159
Virginia    134
Kentucky    120
Missouri    115
Name: NAME, dtype: int64

#### Task 2, Part B
Area of largest county (output limited to states with the top 5 largest counties)

Method 1:  
Sort the data by census area in descending order, then keep only the first (largest) row for each state
(Advantage is that it provides all other fields)

In [286]:
# Largest areas first, so the first row seen for each state is its largest
# county; drop_duplicates keeps that first row and discards the rest.
by_area_descending = state_county_data.sort_values(by="CENSUSAREA", ascending=False)
largest_county_per_state = by_area_descending.drop_duplicates(subset=["STATE_NAME"])
largest_county_per_state.head(5)

Unnamed: 0,GEO_ID,STATE,COUNTY,NAME,LSAD,CENSUSAREA,geometry,STATE_NAME
94,0500000US02290,2,290,Yukon-Koyukuk,CA,145504.789,"POLYGON ((-153.00134 62.72744, -153.00126 62.2...",Alaska
220,0500000US06071,6,71,San Bernardino,County,20056.938,"POLYGON ((-115.64803 35.80963, -115.64768 35.8...",California
529,0500000US04005,4,5,Coconino,County,18618.885,"POLYGON ((-112.53859 37.00067, -112.53454 37.0...",Arizona
1750,0500000US32023,32,23,Nye,County,18181.924,"POLYGON ((-115.84580 36.12024, -115.84608 35.9...",Nevada
3047,0500000US56037,56,37,Sweetwater,County,10426.649,"POLYGON ((-110.04800 41.57802, -110.05371 42.2...",Wyoming


Method 2:  
Group the states together and then find the max CENSUSAREA
(Advantage is that it seems more 'pythonic')

In [287]:
# Maximum county area per state, reduced to the five largest values and turned
# back into a two-column table.
max_area_per_state = state_groups["CENSUSAREA"].max()
max_area_per_state.nlargest(5).reset_index()

Unnamed: 0,STATE_NAME,CENSUSAREA
0,Alaska,145504.789
1,California,20056.938
2,Arizona,18618.885
3,Nevada,18181.924
4,Wyoming,10426.649


(Getting the index instead of the value could allow for looking up the full entry later)

In [288]:
# Row label of the largest-area county in each state (first five states shown).
largest_county_labels = state_groups["CENSUSAREA"].idxmax()
largest_county_labels.head(5)

STATE_NAME
Alabama        39
Alaska         94
Arizona       529
Arkansas      154
California    220
Name: CENSUSAREA, dtype: int64

#### Task 2, Part C
Area of smallest county (output limited to 5 smallest)  
Using Method 1, as it preserves the most data

In [289]:
# Smallest areas first, so the first row kept for each state is its smallest
# county.
by_area_ascending = state_county_data.sort_values(by="CENSUSAREA")
smallest_county_per_state = by_area_ascending.drop_duplicates(subset=["STATE_NAME"])
smallest_county_per_state.head(5)

Unnamed: 0,GEO_ID,STATE,COUNTY,NAME,LSAD,CENSUSAREA,geometry,STATE_NAME
2879,0500000US51610,51,610,Falls Church,city,1.999,"POLYGON ((-77.15029 38.87619, -77.15497 38.872...",Virginia
545,0500000US15005,15,5,Kalawao,County,11.991,"POLYGON ((-157.01455 21.18550, -156.99911 21.1...",Hawaii
2184,0500000US36061,36,61,New York,County,22.829,"MULTIPOLYGON (((-74.04086 40.70012, -74.04002 ...",New York
3138,0500000US44001,44,1,Bristol,County,24.164,"POLYGON ((-71.22480 41.71050, -71.22787 41.705...",Rhode Island
231,0500000US08014,8,14,Broomfield,County,33.034,"POLYGON ((-105.14734 39.91389, -105.14734 39.9...",Colorado


### Task 4
Map the top five counties with the most common names

In [300]:
# Occurrences of each county name as a NAME / COUNT table.
name_counts = state_county_data.groupby("NAME").size().rename("COUNT").reset_index()
# Rank by count and keep the names of the first five rows.
ranked_name_counts = name_counts.sort_values("COUNT", ascending=False)
top_counties = ranked_name_counts[:5]["NAME"]
type(top_counties)

pandas.core.series.Series

In [337]:
# Same result via nlargest; reset_index gives the names a fresh 0..4 index.
name_frequencies = state_county_data.groupby("NAME").size()
top_counties = name_frequencies.nlargest(5).reset_index()["NAME"]

In [344]:
# One outline colour per name, in rank order. The first three are the colours
# used previously; two more are added so all five names get a layer.
name_colors = ["#bebada", "#ffffb3", "#8dd3c7", "#fb8072", "#80b1d3"]

# Every layer is drawn onto the same map: the first explore() call creates it
# (m=None) and each later call adds to the map it is handed.
top_counties_map = None
# Iterating the Series walks its values in order, so this does not depend on
# the index labels being 0..4.
for county_name, outline_color in zip(top_counties, name_colors):
    name_matches = state_county_data.loc[state_county_data["NAME"] == county_name]
    top_counties_map = gpd.GeoSeries(name_matches["geometry"]).explore(
        m=top_counties_map, style_kwds={"color": outline_color}
    )

top_counties_map

In [290]:
# Occurrences of each county name as a NAME / COUNT table.
county_name_counts = (
    state_county_data.groupby("NAME").size().rename("COUNT").reset_index()
)
# Right-merge on the shared NAME column so every county row gains its COUNT.
counties_with_counts = county_name_counts.merge(state_county_data, how="right")
# Rank by count, keep a single row per name, then take the first five rows.
ranked_counties = counties_with_counts.sort_values("COUNT", ascending=False)
top_counties_geog = ranked_counties.drop_duplicates(["NAME"])[:5]

In [322]:
# Wrap the selected geometries in a GeoSeries and show them on an interactive map.
selected_geometries = top_counties_geog["geometry"]
top_counties_s = gpd.GeoSeries(selected_geometries)
top_counties_s.explore()