In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd

# Display every column of wide DataFrames instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

In [2]:
import immigration
import demographics
import temperatures
import database
from queries import insert_into

In [21]:
# Reset the project database before the load/ETL cells below are run.
# NOTE(review): presumably destructive (drops and recreates tables) — confirm in database.py before re-running.
database.reset_database()

In [22]:
# Run the project's I94 loader.
# NOTE(review): judging by the name, this loads the raw I94 data into the database — confirm in immigration.py.
immigration.load_i94_data_to_db()

In [5]:
# Run the immigration ETL step of the pipeline.
# NOTE(review): presumably depends on the I94 data having been loaded first — confirm in immigration.py.
immigration.immigration_etl()

In [17]:
# Load the raw immigration records into a DataFrame and preview the first rows.
# NOTE(review): `immigration_full` is not imported in the import cells of this notebook,
# so this cell only works with leftover kernel state — confirm the intended module.
# NOTE(review): this assignment rebinds the name `immigration`, which the import cell
# bound to the `immigration` module; later `immigration.<function>()` calls would then
# hit the DataFrame instead. Consider a distinct name such as `immigration_raw`.
immigration = immigration_full.load_immigration()
immigration.head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,6.0,2016.0,4.0,692.0,692.0,XXX,20573.0,,,,37.0,2.0,1.0,,,,T,,U,,1979.0,10282016,,,,1897628000.0,,B2
1,7.0,2016.0,4.0,254.0,276.0,ATL,20551.0,1.0,AL,,25.0,3.0,1.0,20130811.0,SEO,,G,,Y,,1991.0,D/S,M,,,3736796000.0,296.0,F1
2,15.0,2016.0,4.0,101.0,101.0,WAS,20545.0,1.0,MI,20691.0,55.0,2.0,1.0,20160401.0,,,T,O,,M,1961.0,09302016,M,,OS,666643200.0,93.0,B2
3,16.0,2016.0,4.0,101.0,101.0,NYC,20545.0,1.0,MA,20567.0,28.0,2.0,1.0,20160401.0,,,O,O,,M,1988.0,09302016,,,AA,92468460000.0,199.0,B2
4,17.0,2016.0,4.0,101.0,101.0,NYC,20545.0,1.0,MA,20567.0,4.0,2.0,1.0,20160401.0,,,O,O,,M,2012.0,09302016,,,AA,92468460000.0,199.0,B2


## Immigration data cleaning

Based on part of the data (approx. 3 million records) the following observations were made:

### Dropped columns

The following columns are not used:

- biryear (redundant because of i94bir)
- entdepu (high number of nulls, unknown meaning)
- entdepa (unknown meaning)
- entdepd (unknown meaning)
- count (no information)
- matflag (meaning unknown)
- occup (high number of nulls)
- visapost (high number of nulls, no relevant information for the case)
- insnum (high number of nulls, no information)
- dtadfile (no relevant information for the case)
- dtaddto (no relevant information for the case)
- visatype (no relevant information for the case)
- fltno (no relevant information for the case)
- admnum (no information)
- i94cit (a lot of codes - approx. 15 % - are not found in the regions list of the i94 description; in addition the exact meaning of the column is not known)

### NULL values

- i94bir: <0.1% of values are NaN. Nulls cause problems with aggregation, therefore the field is imputed with the median
- i94mode: <0.01% of values are NaN; these are replaced with 9, which has the meaning "not reported"
- i94addr: approx. 5% of values are NaN. Although a value of '99' has the meaning "others", it is not used here because the reason for the high number of missing values is not known
- gender: approx 15% of values are NaN, leave as is
- airline: < 1%, leave as is
- depdate: approx. 5%, leave as is; the probable meaning is that the immigrant has not departed yet

i94addr, gender, airline, depdate are allowed to be nullable

In [18]:
# Clean the loaded immigration DataFrame and preview the result.
# NOTE(review): `immigration_functions` is not imported in the import cells of this
# notebook — confirm the intended module so the cell runs on a fresh kernel.
# NOTE(review): presumably implements the column drops and NULL handling described in
# the "Immigration data cleaning" section — confirm against clean_immigration().
immigration_clean = immigration_functions.clean_immigration(immigration)
immigration_clean.head()

Unnamed: 0,i94yr,i94mon,i94res,i94port,i94mode,i94addr,arrdate,depdate,i94bir,i94visa,gender,airline
0,2016.0,4.0,692.0,XXX,9.0,,2016-04-29,,37.0,2.0,,
1,2016.0,4.0,276.0,ATL,1.0,AL,2016-04-07,,25.0,3.0,M,
2,2016.0,4.0,101.0,WAS,1.0,MI,2016-04-01,2016-08-25,55.0,2.0,M,OS
3,2016.0,4.0,101.0,NYC,1.0,MA,2016-04-01,2016-04-23,28.0,2.0,,AA
4,2016.0,4.0,101.0,NYC,1.0,MA,2016-04-01,2016-04-23,4.0,2.0,,AA


In [6]:
# Run the demographics ETL step of the pipeline.
# NOTE(review): presumably writes the demographics tables to the database — confirm in demographics.py.
demographics.demographics_etl()

In [11]:
# Load the demographics data and inspect the rows for a single city
# (Columbia, Missouri) to see how one city is represented in the raw file.
demographics_df = demographics.load_demographics()
is_columbia = demographics_df.City == 'Columbia'
is_missouri = demographics_df.State == 'Missouri'
demographics_df[is_columbia & is_missouri]

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
874,Columbia,Missouri,26.8,56544.0,62554.0,119098,4548.0,10729.0,2.37,MO,Asian,8673
1384,Columbia,Missouri,26.8,56544.0,62554.0,119098,4548.0,10729.0,2.37,MO,Black or African-American,15489
1514,Columbia,Missouri,26.8,56544.0,62554.0,119098,4548.0,10729.0,2.37,MO,White,96067
1617,Columbia,Missouri,26.8,56544.0,62554.0,119098,4548.0,10729.0,2.37,MO,Hispanic or Latino,4956
1618,Columbia,Missouri,26.8,56544.0,62554.0,119098,4548.0,10729.0,2.37,MO,American Indian and Alaska Native,1713


In [7]:
# Run the temperatures ETL step of the pipeline.
# NOTE(review): presumably writes the temperature tables to the database — confirm in temperatures.py.
temperatures.temperatures_etl()

In [75]:
# Build the I94 city table and left-join the city demographics onto it.
states = immigration_functions.get_i94_description("states")
state_codes = {entry[0] for entry in states}

cities = pd.DataFrame(immigration_functions.get_i94_cities(state_codes))
cities.columns = ('city_name_short', 'city_name', 'us_state_name_short', 'country')

# Join case-insensitively on (city name, state code) by passing lower-cased Series
# as join keys. Merging on Series rather than column names adds synthetic key
# columns to the result; `key_0` is dropped again below.
# NOTE(review): a `key_1` column for the second key presumably remains — confirm.
left_keys = [
    cities['city_name'].str.lower(),
    cities['us_state_name_short'].str.lower(),
]
right_keys = [
    demographics_complete['city'].str.lower(),
    demographics_complete['state_code'].str.lower(),
]
cities = cities.merge(
    demographics_complete,
    how='left',
    left_on=left_keys,
    right_on=right_keys,
).drop(columns='key_0')

# Count rows per (city, state) pair; a count above 1 means the pair occurs more than once.
cities[['city_name', 'us_state_name_short']].value_counts()

city_name            us_state_name_short
YUMA                 AZ                     2
AKRON                OH                     2
KETCHIKAN            AK                     2
CANNON CORNERS       NY                     2
Collapsed into INT   MN                     2
                                           ..
FORTUNA              ND                     1
FORT WORTH ALLIANCE  TX                     1
FORT PIERCE          FL                     1
FORT MYERS           FL                     1
LIMESTONE            ME                     1
Length: 499, dtype: int64

## Improvements

- Identify weather in origin country/city, for this also the origin city needs to be determined, e.g. with the flight plans of the airlines
- enrich the i94_ports dimension table
- enrich the i94_states dimension table
  - e.g. with state demographics data
- identify the nearest big city (or cities) for each port; more data is needed for that, as the airports data alone is not sufficient

In [9]:
# Load the airport codes CSV and display it (pandas truncates the display of long frames).
airports = pd.read_csv('data/airport-codes_csv.csv')
airports

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"
...,...,...,...,...,...,...,...,...,...,...,...,...
55070,ZYYK,medium_airport,Yingkou Lanqi Airport,0.0,AS,CN,CN-21,Yingkou,ZYYK,YKH,,"122.3586, 40.542524"
55071,ZYYY,medium_airport,Shenyang Dongta Airport,,AS,CN,CN-21,Shenyang,ZYYY,,,"123.49600219726562, 41.784400939941406"
55072,ZZ-0001,heliport,Sealand Helipad,40.0,EU,GB,GB-ENG,Sealand,,,,"1.4825, 51.894444"
55073,ZZ-0002,small_airport,Glorioso Islands Airstrip,11.0,AF,TF,TF-U-A,Grande Glorieuse,,,,"47.296388888900005, -11.584277777799999"


In [27]:
# Show every airport whose name contains "blaine" (case-insensitive).
# Each value is passed through str() first, so missing names become 'nan' and never match.
name_mentions_blaine = airports['name'].map(lambda name: 'blaine' in str(name).lower())
airports[name_mentions_blaine]

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
5680,4W6,closed,Blaine Municipal Airport,75.0,,US,US-WA,Blaine,,,,"-122.732002, 48.990101"
26083,KANE,small_airport,Anoka County-Blaine (Janes Field) Airport,912.0,,US,US-MN,Minneapolis,KANE,,ANE,"-93.211401, 45.145"
52347,WT34,heliport,Blaine Sectors Hqs Heliport,43.0,,US,US-WA,Blaine,WT34,,WT34,"-122.721389, 48.975556"


In [70]:
from config import config

# Build a state dimension table: I94 state codes/names enriched with state-level
# demographics, with normalised (lower snake_case) column names.
# NOTE(review): an earlier cell rebinds `immigration` to a DataFrame; this call needs
# the imported `immigration` module — confirm the execution order / naming.
states = immigration.get_i94_description("states")

# Expand abbreviations and fix a misspelling in the I94 state labels so that they
# can be matched against the "Location" column of the demographics CSV.
states_clean = []
for state_code, state in states:
    state = (
        state.title()
        .replace('N.', 'North')
        .replace('S.', 'South')
        .replace('W.', 'West')
        .replace('Dist. Of', 'District of')
        .replace('Wisconson', 'Wisconsin')
    )
    states_clean.append((state_code, state))

df_states = pd.DataFrame(states_clean, columns=["state_code", "state"])

# Fixed: the frame was previously assigned to the misspelled name `state_deographics`
# while the merge below reads `state_demographics`, which raises NameError on a
# fresh kernel.
# The first two lines of the CSV are skipped and missing values are filled with 0.
state_demographics = (
    pd.read_csv("data/us_demographics_by_state.csv", skiprows=2)
    .drop("Footnotes", axis=1)
    .fillna(0)
)

# Left join keeps every I94 state even if it has no demographics row.
df_states = (
    df_states
    .merge(state_demographics, how="left", left_on="state", right_on="Location")
    .drop("Location", axis=1)
)
df_states.columns = [col.lower().replace('/', '_').replace(' ', '_') for col in df_states.columns]
df_states.columns

Index(['state_code', 'state', 'white', 'black', 'hispanic', 'asian',
       'american_indian_alaska_native',
       'native_hawaiian_other_pacific_islander', 'multiple_races', 'total'],
      dtype='object')