In [1]:
import requests
import pandas as pd
import hvplot.pandas
from dotenv import load_dotenv
import os
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import locale
import time

load_dotenv()

True

In [2]:
api_key = os.getenv('CENSUS_API_KEY')

In [3]:
# Declare varibles used to build a request uri
base_uri = 'https://api.census.gov/data/timeseries/intltrade/imports/porths'
fields = 'PORT,PORT_NAME,GEN_VAL_YR'

# Declare a list that will be used to store dfs for each year
list_of_data_by_year = []

In [4]:
# Iterate through range of years and request US import port data for each year
for yr in range(2012, 2023):
    
    try:
        r = requests.get(f'{base_uri}?get={fields}&time={yr}-12&key={api_key}')
        data = r.json()
        df = pd.DataFrame(data)
        list_of_data_by_year.append(df)
        
    except: continue
    

us_import_port_df = pd.concat(list_of_data_by_year)
us_import_port_df.head()

Unnamed: 0,0,1,2,3
0,PORT,PORT_NAME,GEN_VAL_YR,time
1,-,TOTAL FOR ALL PORTS,2276267147199,2012-12
2,0104,"JACKMAN, ME",387677439,2012-12
3,0101,"PORTLAND, ME",2605248644,2012-12
4,0102,"BANGOR, ME",2394534076,2012-12


In [5]:
# Use first row to set columns
us_import_port_df.columns = us_import_port_df.iloc[0]

# Slice out headers and summary rows and reset index
us_import_port_df = us_import_port_df[2:].reset_index(drop=True)

display(us_import_port_df.head())
display(us_import_port_df.tail())

Unnamed: 0,PORT,PORT_NAME,GEN_VAL_YR,time
0,104,"JACKMAN, ME",387677439,2012-12
1,101,"PORTLAND, ME",2605248644,2012-12
2,102,"BANGOR, ME",2394534076,2012-12
3,103,"EASTPORT, ME",4922822,2012-12
4,105,"VANCEBORO, ME",257695889,2012-12


Unnamed: 0,PORT,PORT_NAME,GEN_VAL_YR,time
4149,5583,"FORT WORTH ALLIANCE AIRPORT, TX",1064365,2022-12
4150,5584,"ADDISON AIRPORT, DALLAS, TX",474587,2022-12
4151,5588,"DALLAS LOVE FIELD, DALLAS, TX",893856,2022-12
4152,6000,VESSELS UNDER OWN POWER,9080867,2022-12
4153,7070,LOW VALUE,21669585007,2022-12


In [6]:
# Remove totals
us_import_totals_df = us_import_port_df[us_import_port_df['PORT'] == '-']
us_import_port_df = us_import_port_df[us_import_port_df['PORT'] != '-']

In [7]:
# Cast GEN_VAL_YR (i.e. Year-to-date total value of general goods) to float
us_import_port_df['GEN_VAL_YR'] = pd.to_numeric(us_import_port_df['GEN_VAL_YR'], errors='coerce')
us_import_port_df['GEN_VAL_YR'] = us_import_port_df['GEN_VAL_YR'].astype(float)

In [8]:
# Create series datetime
us_import_port_df['datetime'] = pd.to_datetime(us_import_port_df['time'], format='%Y-%m', errors='coerce')
us_import_port_df.head()

Unnamed: 0,PORT,PORT_NAME,GEN_VAL_YR,time,datetime
0,104,"JACKMAN, ME",387677400.0,2012-12,2012-12-01
1,101,"PORTLAND, ME",2605249000.0,2012-12,2012-12-01
2,102,"BANGOR, ME",2394534000.0,2012-12,2012-12-01
3,103,"EASTPORT, ME",4922822.0,2012-12,2012-12-01
4,105,"VANCEBORO, ME",257695900.0,2012-12,2012-12-01


In [9]:
# Aggregate us port data grouping port and port name by date
aggregated_df = us_import_port_df[(us_import_port_df['datetime'] >= '2012-12') & (us_import_port_df['datetime'] <= '2022-12')]\
    .groupby(['PORT', 'PORT_NAME', pd.Grouper(key='datetime', freq='M')])['GEN_VAL_YR']\
    .sum()\
    .reset_index()

aggregated_df.head()

Unnamed: 0,PORT,PORT_NAME,datetime,GEN_VAL_YR
0,101,"PORTLAND, ME",2012-12-31,2605249000.0
1,101,"PORTLAND, ME",2013-12-31,3297602000.0
2,101,"PORTLAND, ME",2014-12-31,3068745000.0
3,101,"PORTLAND, ME",2015-12-31,2575698000.0
4,101,"PORTLAND, ME",2016-12-31,1586563000.0


In [10]:
# Grouping by Port to calculate the total summed value for each port
port_totals = aggregated_df.groupby(['PORT', 'PORT_NAME'])['GEN_VAL_YR'].sum().reset_index()

# Sorting the ports based on their total summed values in descending order
sorted_ports = port_totals.sort_values('GEN_VAL_YR', ascending=False)

In [11]:
# Selecting the top 10 ports with the highest summed values
top_10_ports = sorted_ports.head(10)
top_10_ports

Unnamed: 0,PORT,PORT_NAME,GEN_VAL_YR
182,2704,"LOS ANGELES, CA",2757943000000.0
62,1003,"NEWARK, NJ",1883582000000.0
352,3901,"CHICAGO, IL",1770119000000.0
153,2304,"LAREDO, TX",1366580000000.0
64,1012,"JFK INTERNATIONAL AIRPORT, NY",1186959000000.0
406,5301,"HOUSTON, TX",774004500000.0
332,3801,"DETROIT, MI",766237500000.0
102,1703,"SAVANNAH, GA",763486900000.0
184,2709,"LONG BEACH, CA",750466000000.0
128,2002,"NEW ORLEANS, LA",688972900000.0


In [12]:
# Plot the top ports
top_10_ports_plot = top_10_ports.hvplot(
    title='Top US Ports by Total Import Value 2012-2022',
    yformatter="$%.0f",
    kind='bar',
    x='PORT_NAME', 
    xlabel='Port Name',
    y='GEN_VAL_YR',
    ylabel='Total Value USD',
    hover_color='yellow',
    width=1200,
    height=700,
    rot=20
)
top_10_ports_plot

![Top Us Ports](./plots/top_us_ports.png)

In [13]:
# Selecting the top 3 ports with the highest summed values
top_3_ports = sorted_ports.head(3)
top_3_ports

Unnamed: 0,PORT,PORT_NAME,GEN_VAL_YR
182,2704,"LOS ANGELES, CA",2757943000000.0
62,1003,"NEWARK, NJ",1883582000000.0
352,3901,"CHICAGO, IL",1770119000000.0


In [14]:
# For the top 3 US ports, fetch data for imports by country

port_dict = {}

def fetch_port_data(port_id):
    
    # Create empty list to store dfs for port
    list_of_dfs = []
    
    for yr in range(2012, 2023):
        try:
            uri = f'{base_uri}?get=GEN_VAL_YR,CTY_NAME,CTY_CODE,PORT_NAME&PORT={port_id}&time={yr}-12&key={api_key}';
            r = requests.get(uri)
            data = r.json()
            df = pd.DataFrame(data)
            list_of_dfs.append(df)
        except Exception as e:
            print(f"Error fetching data for {port_id} and year {yr}: {e}")
            continue
        
    return list_of_dfs

        
for i in range(3):
    
    # Declare var and assign row
    port = top_3_ports.iloc[i]
    
    # Add port as key to list
    if port.PORT not in port_dict:
        port_dict[port.PORT] = None
     
    dfs = fetch_port_data(port.PORT)
    
    port_dict[port.PORT] = pd.concat(dfs)

    
for key, value in port_dict.items():
    print(key)
    print(value.head()) 

2704
              0                      1         2                3     4  \
0    GEN_VAL_YR               CTY_NAME  CTY_CODE        PORT_NAME  PORT   
1    3348141572                   OPEC      0001  LOS ANGELES, CA  2704   
2    7626630305         EUROPEAN UNION      0003  LOS ANGELES, CA  2704   
3  200994607366  PACIFIC RIM COUNTRIES      0014  LOS ANGELES, CA  2704   
4    1163354229               CAFTA-DR      0017  LOS ANGELES, CA  2704   

         5  
0     time  
1  2012-12  
2  2012-12  
3  2012-12  
4  2012-12  
1003
             0                      1         2           3     4        5
0   GEN_VAL_YR               CTY_NAME  CTY_CODE   PORT_NAME  PORT     time
1   6427845435                   OPEC      0001  NEWARK, NJ  1003  2012-12
2  51818956291         EUROPEAN UNION      0003  NEWARK, NJ  1003  2012-12
3  42719361340  PACIFIC RIM COUNTRIES      0014  NEWARK, NJ  1003  2012-12
4    683841232               CAFTA-DR      0017  NEWARK, NJ  1003  2012-12
3901
      

In [15]:
# Create a df with the top US Port
us_port_1_df = port_dict[top_3_ports.iloc[0].PORT]
us_port_1_df.head()

Unnamed: 0,0,1,2,3,4,5
0,GEN_VAL_YR,CTY_NAME,CTY_CODE,PORT_NAME,PORT,time
1,3348141572,OPEC,0001,"LOS ANGELES, CA",2704,2012-12
2,7626630305,EUROPEAN UNION,0003,"LOS ANGELES, CA",2704,2012-12
3,200994607366,PACIFIC RIM COUNTRIES,0014,"LOS ANGELES, CA",2704,2012-12
4,1163354229,CAFTA-DR,0017,"LOS ANGELES, CA",2704,2012-12


In [16]:
# Assign header row to columns attributes
us_port_1_df.columns = us_port_1_df.iloc[0]
us_port_1_df.head()

Unnamed: 0,GEN_VAL_YR,CTY_NAME,CTY_CODE,PORT_NAME,PORT,time
0,GEN_VAL_YR,CTY_NAME,CTY_CODE,PORT_NAME,PORT,time
1,3348141572,OPEC,0001,"LOS ANGELES, CA",2704,2012-12
2,7626630305,EUROPEAN UNION,0003,"LOS ANGELES, CA",2704,2012-12
3,200994607366,PACIFIC RIM COUNTRIES,0014,"LOS ANGELES, CA",2704,2012-12
4,1163354229,CAFTA-DR,0017,"LOS ANGELES, CA",2704,2012-12


In [17]:
# Reset index and drop header row
us_port_1_df = us_port_1_df[1:].reset_index(drop=True)
us_port_1_df.head()

Unnamed: 0,GEN_VAL_YR,CTY_NAME,CTY_CODE,PORT_NAME,PORT,time
0,3348141572,OPEC,1,"LOS ANGELES, CA",2704,2012-12
1,7626630305,EUROPEAN UNION,3,"LOS ANGELES, CA",2704,2012-12
2,200994607366,PACIFIC RIM COUNTRIES,14,"LOS ANGELES, CA",2704,2012-12
3,1163354229,CAFTA-DR,17,"LOS ANGELES, CA",2704,2012-12
4,1961022962,NAFTA,20,"LOS ANGELES, CA",2704,2012-12


In [18]:
# Cast GEN_VAL_YR (i.e. Year-to-date total value of general goods) to float
us_port_1_df['GEN_VAL_YR'] = pd.to_numeric(us_port_1_df['GEN_VAL_YR'], errors='coerce')
us_port_1_df['GEN_VAL_YR'] = us_port_1_df['GEN_VAL_YR'].astype(float)

In [19]:
# Create series datetime
us_port_1_df['datetime'] = pd.to_datetime(us_port_1_df['time'], format='%Y-%m', errors='coerce')
us_port_1_df.head()

Unnamed: 0,GEN_VAL_YR,CTY_NAME,CTY_CODE,PORT_NAME,PORT,time,datetime
0,3348142000.0,OPEC,1,"LOS ANGELES, CA",2704,2012-12,2012-12-01
1,7626630000.0,EUROPEAN UNION,3,"LOS ANGELES, CA",2704,2012-12,2012-12-01
2,200994600000.0,PACIFIC RIM COUNTRIES,14,"LOS ANGELES, CA",2704,2012-12,2012-12-01
3,1163354000.0,CAFTA-DR,17,"LOS ANGELES, CA",2704,2012-12,2012-12-01
4,1961023000.0,NAFTA,20,"LOS ANGELES, CA",2704,2012-12,2012-12-01


In [20]:
# Aggregate us port data grouping port and port name by date
aggregated_df_10 = us_port_1_df\
    .groupby(['CTY_CODE', 'CTY_NAME', pd.Grouper(key='datetime', freq='M')])['GEN_VAL_YR']\
    .sum()\
    .reset_index()

aggregated_df_10.head()

Unnamed: 0,CTY_CODE,CTY_NAME,datetime,GEN_VAL_YR
0,-,TOTAL FOR ALL COUNTRIES,2012-12-31,241349500000.0
1,-,TOTAL FOR ALL COUNTRIES,2013-12-31,245097000000.0
2,-,TOTAL FOR ALL COUNTRIES,2014-12-31,251547900000.0
3,-,TOTAL FOR ALL COUNTRIES,2015-12-31,238873900000.0
4,-,TOTAL FOR ALL COUNTRIES,2016-12-31,240614900000.0


In [21]:
# Grouping by Port to calculate the total summed value for each port
country_totals = aggregated_df_10.groupby(['CTY_CODE', 'CTY_NAME'])['GEN_VAL_YR'].sum().reset_index()

# Sorting the ports based on their total summed values in descending order
sorted_countries = country_totals.sort_values('GEN_VAL_YR', ascending=False)[1:]
sorted_countries

Unnamed: 0,CTY_CODE,CTY_NAME,GEN_VAL_YR
163,5XXX,ASIA,2.548433e+12
11,0026,APEC,2.503245e+12
3,0014,PACIFIC RIM COUNTRIES,2.190527e+12
157,5700,CHINA,1.387064e+12
7,0022,OECD,6.182910e+11
...,...,...,...
41,2489,GRENADA,1.048000e+04
39,2487,ST LUCIA,5.846000e+03
66,4031,"SVALBARD, JAN MAYEN ISLAND",5.624000e+03
16,1610,ST PIERRE AND MIQUELON,5.529000e+03


In [22]:
# Get top 10 countries for top US Port
top_10_countries = sorted_countries.head(10)
top_10_countries

Unnamed: 0,CTY_CODE,CTY_NAME,GEN_VAL_YR
163,5XXX,ASIA,2548433000000.0
11,0026,APEC,2503245000000.0
3,0014,PACIFIC RIM COUNTRIES,2190527000000.0
157,5700,CHINA,1387064000000.0
7,0022,OECD,618291000000.0
12,0027,ASEAN,465751900000.0
162,5880,JAPAN,354936800000.0
145,5520,VIETNAM,189520200000.0
161,5830,TAIWAN,136310100000.0
159,5800,"KOREA, SOUTH",126887000000.0


In [23]:
# Plot the top 10 countries imported into top US Port
top_10_countries_plot = top_10_countries.hvplot(
    title='Top Countries into Top US Port (Los Angeles) by Total Import Value 2012-2022',
    yformatter="$%.0f",
    kind='bar',
    x='CTY_NAME', 
    xlabel='Country Name',
    y='GEN_VAL_YR',
    ylabel='Total Value USD',
    hover_color='yellow',
    width=1200,
    height=700,
    rot=20
)
top_10_countries_plot

![Top Countries Top Port](./plots/top_countries_top_port.png)

In [24]:
# Create a df with the top country of the top US Port flattened with distinct date rows
top_country_top_port = us_port_1_df[us_port_1_df['CTY_CODE'] == top_10_countries.iloc[0].CTY_CODE]
top_country_top_port

Unnamed: 0,GEN_VAL_YR,CTY_NAME,CTY_CODE,PORT_NAME,PORT,time,datetime
137,222242000000.0,ASIA,5XXX,"LOS ANGELES, CA",2704,2012-12,2012-12-01
335,222742400000.0,ASIA,5XXX,"LOS ANGELES, CA",2704,2013-12,2013-12-01
553,228764400000.0,ASIA,5XXX,"LOS ANGELES, CA",2704,2014-12,2014-12-01
765,219310400000.0,ASIA,5XXX,"LOS ANGELES, CA",2704,2015-12,2015-12-01
989,224279500000.0,ASIA,5XXX,"LOS ANGELES, CA",2704,2016-12,2016-12-01
1207,233127200000.0,ASIA,5XXX,"LOS ANGELES, CA",2704,2017-12,2017-12-01
1423,244265200000.0,ASIA,5XXX,"LOS ANGELES, CA",2704,2018-12,2018-12-01
1643,226716300000.0,ASIA,5XXX,"LOS ANGELES, CA",2704,2019-12,2019-12-01
1859,214673800000.0,ASIA,5XXX,"LOS ANGELES, CA",2704,2020-12,2020-12-01
2070,249628100000.0,ASIA,5XXX,"LOS ANGELES, CA",2704,2021-12,2021-12-01


In [25]:
# Calculate key metrics
mean = top_country_top_port['GEN_VAL_YR'].mean()
std_dev = top_country_top_port['GEN_VAL_YR'].std()
median = top_country_top_port['GEN_VAL_YR'].median()
max_val = top_country_top_port['GEN_VAL_YR'].max()
min_val = top_country_top_port['GEN_VAL_YR'].min()


In [26]:
top_country_top_port_plot = top_country_top_port.hvplot(
    title='Top Country Top US Port: Total Import Value by Year 2012-2022',
    yformatter="$%.0f",
    kind='bar',
    x='time', 
    xlabel='Year',
    y='GEN_VAL_YR',
    ylabel='Total Value USD',
    hover_color='yellow',
    width=1200,
    height=700,
)
top_country_top_port_plot

![Top Country Top Port](./plots/top_country_top_port.png)

In [28]:
# Add geo data to US ports
sorted_ports.head()

Unnamed: 0,PORT,PORT_NAME,GEN_VAL_YR
182,2704,"LOS ANGELES, CA",2757943000000.0
62,1003,"NEWARK, NJ",1883582000000.0
352,3901,"CHICAGO, IL",1770119000000.0
153,2304,"LAREDO, TX",1366580000000.0
64,1012,"JFK INTERNATIONAL AIRPORT, NY",1186959000000.0


In [29]:
# Add series for lat and lon
sorted_ports['LAT'] = None
sorted_ports['LON'] = None

Unnamed: 0,PORT,PORT_NAME,GEN_VAL_YR,LAT,LON
182,2704,"LOS ANGELES, CA",2757943000000.0,,
62,1003,"NEWARK, NJ",1883582000000.0,,
352,3901,"CHICAGO, IL",1770119000000.0,,
153,2304,"LAREDO, TX",1366580000000.0,,
64,1012,"JFK INTERNATIONAL AIRPORT, NY",1186959000000.0,,


In [35]:
geolocator = Nominatim(user_agent="student_northwestern_university")

for i in range(len(sorted_ports)):
    
    port = sorted_ports.iloc[i].PORT_NAME
    location = None
    try: location = geolocator.geocode(port)
    except GeocoderTimedOut: continue
    
    if (location and location.latitude):
        sorted_ports.at[i, 'LAT'] = float(location.latitude)
    if (location and location.longitude):
        sorted_ports.at[i, 'LON'] = float(location.longitude)

In [36]:
sorted_ports.dropna(inplace=True)

sorted_ports.isna().any()

sorted_ports.shape

(395, 5)

In [40]:
ports_plot = sorted_ports.hvplot.points(
    'LON', 
    'LAT',
    tiles='OSM',
    geo=True,
    size='GEN_VAL_YR',
    scale=0.000015,
    frame_width=800,
    frame_height=400,
    title='Total USD Value of Monthly Imports by US Port (2012 - 2022)',
    hover_cols=['PORT_NAME', 'GEN_VAL_MO'],
)
ports_plot

![Top US Ports Geo](./plots/top_ports_geo.png)