In [152]:
# Path to the text file with the FIPS code listing; it is read in full further down
fips_file = './fips.txt'

In [153]:
import pandas as pd
import geopandas
import json

In [154]:
# Read the whole FIPS listing into a single string
with open(fips_file) as fips_handle:
    text = fips_handle.read()

In [155]:
# Split once on the 'state-level' marker and reuse the pieces:
# the first piece is the preamble, the last piece holds both tables
state_level_parts = text.split('state-level')
text_top = state_level_parts[0]

# Within that last piece, 'county-level' separates the state table from the county table
county_level_parts = state_level_parts[-1].split('county-level')
states_text = county_level_parts[0]
counties_text = county_level_parts[-1]

# more cleaning: keep only what follows the dashed rules
for rule in ('-----------', ' -------'):
    states_text = states_text.split(rule)[-1]
counties_text = counties_text.split('--------------')[-1]

In [156]:
#print(counties_text)

## US States

In [157]:
# inspect the raw state table text (nothing is cleaned in this cell; it only prints)
print(states_text)


       01        ALABAMA
       02        ALASKA
       04        ARIZONA
       05        ARKANSAS
       06        CALIFORNIA
       08        COLORADO
       09        CONNECTICUT
       10        DELAWARE
       11        DISTRICT OF COLUMBIA
       12        FLORIDA
       13        GEORGIA
       15        HAWAII
       16        IDAHO
       17        ILLINOIS
       18        INDIANA
       19        IOWA
       20        KANSAS
       21        KENTUCKY
       22        LOUISIANA
       23        MAINE
       24        MARYLAND
       25        MASSACHUSETTS
       26        MICHIGAN
       27        MINNESOTA
       28        MISSISSIPPI
       29        MISSOURI
       30        MONTANA
       31        NEBRASKA
       32        NEVADA
       33        NEW HAMPSHIRE
       34        NEW JERSEY
       35        NEW MEXICO
       36        NEW YORK
       37        NORTH CAROLINA
       38        NORTH DAKOTA
       39        OHIO
       40        OKLAHOMA
       41        OR

In [158]:
# Collect state ids and names into parallel lists, ready to become a DataFrame
states = {'id': [], 'state': []}

for row in states_text.split('\n'):
    # columns are separated by runs of six spaces; strip each chunk
    tokens = [chunk.strip() for chunk in row.split('      ')]
    # empty chunks (leading padding, blank lines) are skipped
    for token in filter(None, tokens):
        # all-digit tokens are ids, everything else is a state name
        column = 'id' if token.isdigit() else 'state'
        states[column].append(token)


In [159]:
# Build a two-column frame ('id', 'state') from the parsed lists
states_df = pd.DataFrame(states)

In [160]:
states_df

Unnamed: 0,id,state
0,1,ALABAMA
1,2,ALASKA
2,4,ARIZONA
3,5,ARKANSAS
4,6,CALIFORNIA
5,8,COLORADO
6,9,CONNECTICUT
7,10,DELAWARE
8,11,DISTRICT OF COLUMBIA
9,12,FLORIDA


In [161]:
# save dataframe (index=False keeps the row index out of the CSV)
# NOTE(review): the ids were parsed as strings such as '01' -- confirm that whoever
# reads this CSV back loads the column as text if the leading zero matters
states_df.to_csv('./us_state_codes.csv',index=False)

### Putting things together with geospatial info

In [162]:
from vega_datasets import data

In [163]:
# Read the 'states' layer from the us_10m dataset URL exposed by vega_datasets
states_geo_in = geopandas.read_file(data.us_10m.url,layer='states')

In [164]:
# Attach a readable name to every state geometry by looking its id up in states_df.
# Ids that are in the geometry layer but not in states_df are named by hand here.
territory_names = {72: 'Puerto Rico', 78: 'U.S. Virgin Islands'}

state_names = []
for i in states_geo_in['id'].values:
    # ids in states_df are strings such as '01', so pad to two characters before comparing
    d = states_df[states_df['id']==str(i).zfill(2)]
    if len(d) == 1:
        n = d['state'].values[0]
    elif int(i) in territory_names:
        n = territory_names[int(i)]
    else:
        print('issue with not having the right name! ID=',i)
        # Record a sentinel anyway: exactly one name per geometry is required,
        # otherwise the column assignment below fails on a length mismatch.
        n = 'NaN'
    state_names.append(n)

states_geo = states_geo_in.copy()
states_geo['State'] = state_names

# subset: drop geometries that could not be named (same convention as the
# county and country cells, which also use the 'NaN' string as a sentinel)
states_geo = states_geo[states_geo['State']!='NaN']


In [165]:
#states_geo

In [166]:
# save to file
with open('./us_states_geo.geojson','w') as f:
    print(states_geo.to_json(),file=f)

In [None]:
#states_geo_in = geopandas.read_file('./us_states_geo.geojson')

In [167]:
states_geo

Unnamed: 0,id,geometry,State
0,2,"MULTIPOLYGON (((179.24009 51.34855, 178.94217 ...",ALASKA
1,15,"MULTIPOLYGON (((-155.68054 18.91039, -155.7953...",HAWAII
2,72,"MULTIPOLYGON (((-65.53182 18.07995, -65.57849 ...",Puerto Rico
3,1,"MULTIPOLYGON (((-86.83429 34.99115, -86.78404 ...",ALABAMA
4,5,"POLYGON ((-94.07748 36.4984, -93.86571 36.4984...",ARKANSAS
5,4,"POLYGON ((-109.99959 36.99742, -109.04484 36.9...",ARIZONA
6,6,"MULTIPOLYGON (((-118.42725 32.80225, -118.4990...",CALIFORNIA
7,8,"POLYGON ((-106.32056 40.99868, -106.19135 40.9...",COLORADO
8,9,"MULTIPOLYGON (((-73.05499 42.03968, -73.00832 ...",CONNECTICUT
9,11,"MULTIPOLYGON (((-77.01757 38.80924, -77.09653 ...",DISTRICT OF COLUMBIA


## US Counties

In [168]:
# Parse the county table into parallel id / name lists.
# Tokens made only of digits are ids; any other non-empty token is a name,
# with anything from the first '(' onwards (a note) removed.
counties = {'id':[], 'county':[]}

for line in counties_text.split('\n'):
    # columns are separated by runs of four spaces
    for chunk in line.split('    '):
        token = chunk.strip()
        if not token:
            continue
        if token.isdigit():
            counties['id'].append(token)
            continue
        # take out any notes; a token that is only a note leaves nothing to keep
        c = token.split('(')[0].strip()
        if c:
            counties['county'].append(c)
    # check: after every line the two lists must be the same length, otherwise
    # ids and names would be paired up wrongly from this point on
    if len(counties['id']) != len(counties['county']):
        raise ValueError(
            'mis match length after line {!r}: {} ids vs {} names'.format(
                line, len(counties['id']), len(counties['county'])))


In [169]:
#counties['id'][-5:], counties['county'][-5:]

In [170]:
# Build a two-column frame ('id', 'county') from the parsed lists
counties_df = pd.DataFrame(counties)

In [171]:
counties_df

Unnamed: 0,id,county
0,01000,Alabama
1,01001,Autauga County
2,01003,Baldwin County
3,01005,Barbour County
4,01007,Bibb County
...,...,...
3190,56037,Sweetwater County
3191,56039,Teton County
3192,56041,Uinta County
3193,56043,Washakie County


In [172]:
# save dataframe (index=False keeps the row index out of the CSV)
# NOTE(review): the ids were parsed as strings such as '01000' -- confirm that whoever
# reads this CSV back loads the column as text if the leading zero matters
counties_df.to_csv('./us_county_codes.csv',index=False)

In [173]:
# also with geospatial stuff: read the 'counties' layer from the same us_10m
# dataset URL that the states layer was read from
counties_geo_in = geopandas.read_file(data.us_10m.url,layer='counties')

In [174]:
counties_geo_in['id']

0       22051
1       53000
2       53073
3       30105
4       30029
        ...  
3226    72037
3227    72069
3228    72147
3229    78010
3230    72051
Name: id, Length: 3231, dtype: object

In [175]:
counties_df['id']

0       01000
1       01001
2       01003
3       01005
4       01007
        ...  
3190    56037
3191    56039
3192    56041
3193    56043
3194    56045
Name: id, Length: 3195, dtype: object

In [176]:
# Name every county geometry by matching its id against counties_df.
# Both sides are compared as integers because the geometry ids carry no
# leading zeros while the ids in counties_df do.
# Ids that are in the geometry layer but not in counties_df are named by hand here;
# anything else without exactly one match (the Puerto Rico 72xxx ids, among
# others) is reported and dropped.
manual_county_names = {
    8014: 'Broomfield County',
    12086: 'Miami-Dade County',
}

# Convert once instead of once per geometry: the conversion does not depend on the loop
county_ids = counties_df['id'].astype('int')

county_names = []
for i in counties_geo_in['id'].values:
    fips = int(i)
    d = counties_df[county_ids==fips]
    if len(d) == 1:
        n = d['county'].values[0]
    elif fips in manual_county_names:
        n = manual_county_names[fips]
    else:
        print('issue with not having the right name! ID=',i)
        # ignore: mark with a sentinel so the row can be filtered out below
        n = 'NaN'
    county_names.append(n)

counties_geo = counties_geo_in.copy()
counties_geo['County'] = county_names

# subset: keep only the geometries that received a name
counties_geo = counties_geo[counties_geo['County']!='NaN']

issue with not having the right name! ID= 72125
issue with not having the right name! ID= 72003
issue with not having the right name! ID= 72097
issue with not having the right name! ID= 72065
issue with not having the right name! ID= 72055
issue with not having the right name! ID= 72083
issue with not having the right name! ID= 72025
issue with not having the right name! ID= 72045
issue with not having the right name! ID= 72133
issue with not having the right name! ID= 72121
issue with not having the right name! ID= 72027
issue with not having the right name! ID= 72001
issue with not having the right name! ID= 72111
issue with not having the right name! ID= 72047
issue with not having the right name! ID= 72091
issue with not having the right name! ID= 72013
issue with not having the right name! ID= 72145
issue with not having the right name! ID= 72031
issue with not having the right name! ID= 72061
issue with not having the right name! ID= 72129
issue with not having the right name! ID

In [177]:
counties_geo

Unnamed: 0,id,geometry,County
0,22051,MULTIPOLYGON EMPTY,Jefferson Parish
1,53000,"POLYGON ((-122.65544 48.41032, -122.65544 48.4...",Washington
2,53073,"MULTIPOLYGON (((-120.85361 49.00011, -120.7674...",Whatcom County
3,30105,"POLYGON ((-106.11238 48.99904, -106.15187 48.8...",Valley County
4,30029,"POLYGON ((-114.06985 48.99904, -114.05908 48.8...",Flathead County
...,...,...,...
3149,2201,"POLYGON ((-133.52741 55.6952, -133.69611 55.78...",Prince of Wales-Outer Ketchikan Census Area
3150,2201,"POLYGON ((-133.25822 55.77416, -133.34436 55.8...",Prince of Wales-Outer Ketchikan Census Area
3151,2201,"POLYGON ((-133.72123 55.90146, -133.81814 55.9...",Prince of Wales-Outer Ketchikan Census Area
3152,2201,"POLYGON ((-132.928 56.0148, -132.928 56.01749,...",Prince of Wales-Outer Ketchikan Census Area


In [178]:
# Leftover debugging: `d` is the lookup result left behind by the last iteration
# of the county-matching loop, so this cell only works after that loop has run.
d

Unnamed: 0,id,county


In [179]:
counties_geo

Unnamed: 0,id,geometry,County
0,22051,MULTIPOLYGON EMPTY,Jefferson Parish
1,53000,"POLYGON ((-122.65544 48.41032, -122.65544 48.4...",Washington
2,53073,"MULTIPOLYGON (((-120.85361 49.00011, -120.7674...",Whatcom County
3,30105,"POLYGON ((-106.11238 48.99904, -106.15187 48.8...",Valley County
4,30029,"POLYGON ((-114.06985 48.99904, -114.05908 48.8...",Flathead County
...,...,...,...
3149,2201,"POLYGON ((-133.52741 55.6952, -133.69611 55.78...",Prince of Wales-Outer Ketchikan Census Area
3150,2201,"POLYGON ((-133.25822 55.77416, -133.34436 55.8...",Prince of Wales-Outer Ketchikan Census Area
3151,2201,"POLYGON ((-133.72123 55.90146, -133.81814 55.9...",Prince of Wales-Outer Ketchikan Census Area
3152,2201,"POLYGON ((-132.928 56.0148, -132.928 56.01749,...",Prince of Wales-Outer Ketchikan Census Area


In [180]:
# save to file
with open('./us_counties_geo.geojson','w') as f:
    print(counties_geo.to_json(),file=f)

## World Countries

In [181]:
# Read the 'countries' layer from the world_110m dataset URL exposed by vega_datasets
countries_geo_in = geopandas.read_file(data.world_110m.url,layer='countries')

In [182]:
# Load the country lookup table from a local JSON file; the 'code', 'id' and 'name'
# columns are the ones used below
countries_df = pd.read_json('./world-110m-country-codes.json')

In [183]:
countries_df.head()

Unnamed: 0,code,id,name
0,AF,4,Afghanistan
1,AL,8,Albania
2,DZ,12,Algeria
3,AO,24,Angola
4,AQ,10,Antarctica


In [184]:
countries_geo_in.head()

Unnamed: 0,id,geometry
0,4,"POLYGON ((61.20961 35.64925, 62.23202 35.27011..."
1,24,"MULTIPOLYGON (((23.91324 -10.92658, 24.01764 -..."
2,8,"POLYGON ((20.59041 41.85586, 20.4644 41.51565,..."
3,784,"POLYGON ((51.57952 24.24479, 51.75592 24.29387..."
4,32,"MULTIPOLYGON (((-66.95887 -54.89756, -67.56368..."


In [185]:
# Attach a name and a code to every country geometry by matching its id
# against countries_df. Geometries without exactly one match are reported
# and dropped.

# Convert once instead of once per geometry: the conversion does not depend on the loop
country_ids = countries_df['id'].astype('int')

country_names = []
code = []
for i in countries_geo_in['id'].values:
    d = countries_df[country_ids==int(i)]
    if len(d) == 1:
        country_names.append(d['name'].values[0])
        code.append(d['code'].values[0])
    else:
        print('issue with not having the right name! ID=',i )
        # ignore: mark both columns with a sentinel so the row can be filtered out below
        country_names.append('NaN')
        code.append('NaN')

countries_geo = countries_geo_in.copy()
countries_geo['Country'] = country_names
countries_geo['code'] = code

# subset: keep only the geometries that received a name
countries_geo = countries_geo[countries_geo['Country']!='NaN']

issue with not having the right name! ID= -99
issue with not having the right name! ID= -99
issue with not having the right name! ID= 275
issue with not having the right name! ID= -99


In [186]:
# save to file
with open('./world_countries_geo.geojson','w') as f:
    print(countries_geo.to_json(),file=f)

In [187]:
countries_geo

Unnamed: 0,id,geometry,Country,code
0,4,"POLYGON ((61.20961 35.64925, 62.23202 35.27011...",Afghanistan,AF
1,24,"MULTIPOLYGON (((23.91324 -10.92658, 24.01764 -...",Angola,AO
2,8,"POLYGON ((20.59041 41.85586, 20.4644 41.51565,...",Albania,AL
3,784,"POLYGON ((51.57952 24.24479, 51.75592 24.29387...",United Arab Emirates,AE
4,32,"MULTIPOLYGON (((-66.95887 -54.89756, -67.56368...",Argentina,AR
...,...,...,...,...
172,548,"MULTIPOLYGON (((167.51508 -16.59835, 167.18027...",Vanuatu,VU
173,887,"POLYGON ((52.38592 16.38285, 52.19152 15.93771...",Yemen,YE
174,710,"POLYGON ((28.21888 -32.77244, 27.46287 -33.227...",South Africa,ZA
175,894,"POLYGON ((32.75853 -9.23064, 33.23013 -9.67747...",Zambia,ZM
