In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

import immigration_functions
import database
from queries import insert_into

In [3]:
pd.set_option("display.max_columns", None)

In [81]:
database.reset_database()

In [82]:
immigration_functions.load_i94_data_to_db()

In [6]:
demographics = pd.read_csv("data/us-cities-demographics.csv", sep=";")
temperatures = pd.read_csv("data/GlobalLandTemperaturesByCity.csv")
# immigration_full = pd.read_sas("data/immigration_full/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat", encoding="ISO-8859-1")
immigration_full = pd.read_parquet('data/immigration_full/i94_apr16_sub.parquet')

In [58]:
immigration_full.head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype,test
0,6.0,2016.0,4.0,692.0,692.0,XXX,20573.0,,,,37.0,2.0,1.0,,,,T,,U,,1979.0,10282016,,,,1897628000.0,,B2,
1,7.0,2016.0,4.0,254.0,276.0,ATL,20551.0,1.0,AL,,25.0,3.0,1.0,20130811.0,SEO,,G,,Y,,1991.0,D/S,M,,,3736796000.0,296.0,F1,AL
2,15.0,2016.0,4.0,101.0,101.0,WAS,20545.0,1.0,MI,20691.0,55.0,2.0,1.0,20160401.0,,,T,O,,M,1961.0,09302016,M,,OS,666643200.0,93.0,B2,MI
3,16.0,2016.0,4.0,101.0,101.0,NYC,20545.0,1.0,MA,20567.0,28.0,2.0,1.0,20160401.0,,,O,O,,M,1988.0,09302016,,,AA,92468460000.0,199.0,B2,MA
4,17.0,2016.0,4.0,101.0,101.0,NYC,20545.0,1.0,MA,20567.0,4.0,2.0,1.0,20160401.0,,,O,O,,M,2012.0,09302016,,,AA,92468460000.0,199.0,B2,MA


## Immigration data cleaning

Based on part of the data (approx. 3 million records) the following observations were made:

### Dropped columns

The following columns are not used:

- biryear (redundant because of i94bir)
- entdepu (high number of nulls, unknown meaning)
- entdepa (unknown meaning)
- entdepd (unknown meaning)
- count (no information)
- matflag (meaning unknown)
- occup (high number of nulls)
- visapost (high number of nulls, no relevant information for the case)
- insnum (high number of nulls, no information)
- dtadfile (no relevant information for the case)
- draddto (no relevant information for the case)
- visatype (no relevant information for the case)
- fltno (no relevant information for the case)
- admnum (no information)
- i94cit (a lot of codes - approx. 15 % - are not found in the regions list of the i94 description; in addition the exact meaning of the column is not known)

### NULL values

- i94bir: <0.1% of values are NaN. Nulls cause problems with aggregation, therefore the field is imputed with the median
- i94mode: <0.01% of i94mode values are NaN, a value of 9 has the meaning "not reported"
- i94addr: approx 5% of values are NaN. While a value of '99' has the meaning "others", it cannot be used here as the reason why the high number of NAs are present is not known
- gender: approx 15% of values are NaN, leave as is
- airline: < 1%, leave as is
- depdate: approx 5%, leave as is, the probable meaning is, that the immigrant has not departed yet

i94addr, gender, airline, depdate are allowed to be nullable

In [53]:
immigration_clean = immigration_functions.clean_immigration(immigration_full)
immigration_clean.head()

Unnamed: 0,i94yr,i94mon,i94res,i94port,i94mode,i94addr,arrdate,depdate,i94bir,i94visa,gender,airline
0,2016.0,4.0,692.0,XXX,9.0,,2016-04-29,,37.0,2.0,,
1,2016.0,4.0,276.0,ATL,1.0,AL,2016-04-07,,25.0,3.0,M,
2,2016.0,4.0,101.0,WAS,1.0,MI,2016-04-01,2016-08-25,55.0,2.0,M,OS
3,2016.0,4.0,101.0,NYC,1.0,MA,2016-04-01,2016-04-23,28.0,2.0,,AA
4,2016.0,4.0,101.0,NYC,1.0,MA,2016-04-01,2016-04-23,4.0,2.0,,AA


In [117]:
demographics.head()

# dropped: State (redundant); Number of Veterans, Male Population and Female Population (not relevant in this context)
demographics_clean = demographics[["City", "State Code", "Median Age", "Total Population", "Foreign-born", "Average Household Size"]].drop_duplicates()
demographics_races = pd.pivot_table(demographics, values='Count', index=['City', 'State Code'], columns=['Race'], fill_value=0).reset_index()
demographics_races.columns.name = None
print(demographics_races)

demographics_complete = demographics_clean.merge(demographics_races, on=['City', 'State Code'])
demographics_complete

            City State Code  American Indian and Alaska Native  Asian  \
0        Abilene         TX                               1813   2929   
1          Akron         OH                               1845   9033   
2        Alafaya         FL                                  0  10336   
3        Alameda         CA                               1329  27984   
4         Albany         GA                                445    650   
..           ...        ...                                ...    ...   
591      Yonkers         NY                               1112  13981   
592  Yorba Linda         CA                                211  17616   
593       Youngs         OH                                875    247   
594    Yuba City         CA                               2163  15065   
595         Yuma         AZ                               1228   1180   

     Black or African-American  Hispanic or Latino   White  
0                        14449               33222   95487  
1

Unnamed: 0,City,State Code,Median Age,Total Population,Foreign-born,Average Household Size,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
0,Silver Spring,MD,33.8,82463,30908.0,2.60,1084,8841,21330,25924,37756
1,Quincy,MA,41.0,93629,32935.0,2.39,351,30473,3917,2566,58723
2,Hoover,AL,38.5,84839,8229.0,2.58,0,4759,18191,3430,61869
3,Rancho Cucamonga,CA,34.5,175232,33878.0,3.18,2789,24519,24437,65823,111832
4,Newark,NJ,34.6,281913,86253.0,2.73,2268,7349,144961,100432,76402
...,...,...,...,...,...,...,...,...,...,...,...
591,Mobile,AL,38.0,194305,7234.0,2.40,2816,5518,96397,5229,93755
592,League City,TX,35.9,98350,8361.0,2.72,0,8163,4166,14868,86012
593,Lafayette,IN,33.5,71170,5697.0,2.19,0,1167,6752,9151,63157
594,Guaynabo,PR,42.2,70492,,,589,0,0,69936,0


In [55]:
temperatures_clean = temperatures[['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'City', 'Country']].copy()
temperatures_clean = temperatures_clean[temperatures_clean.City == 'Berlin']

temperatures_clean['dt'] = pd.to_datetime(temperatures_clean['dt'])
temperatures_clean['month'] = temperatures_clean['dt'].dt.month

temperatures_agg = temperatures_clean.sort_values(['City', 'Country', 'dt']).groupby(['City', 'Country', 'month']).tail(3)
temperatures_agg = temperatures_agg.groupby(['City', 'Country', 'month']).AverageTemperature.agg(['mean', 'min', 'max']).reset_index()
temperatures_agg

Unnamed: 0,City,Country,month,mean,min,max
0,Berlin,Germany,1,1.299,0.113,2.17
1,Berlin,Germany,2,-0.498667,-1.944,0.314
2,Berlin,Germany,3,4.254333,-0.391,7.79
3,Berlin,Germany,4,10.410333,9.088,12.669
4,Berlin,Germany,5,15.001667,14.139,15.657
5,Berlin,Germany,6,17.535,16.427,18.705
6,Berlin,Germany,7,19.329333,17.931,20.901
7,Berlin,Germany,8,19.250333,19.008,19.408
8,Berlin,Germany,9,15.7015,15.165,16.238
9,Berlin,Germany,10,9.391667,8.339,10.351
