In [1]:
import pandas as pd
import yaml

In [2]:
# Keyword arguments shared by the pd.read_csv calls that unpack this dict:
# pipe-delimited, gzip-compressed, UTF-8 text with MissionDate parsed as a date.
# NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0 - confirm
# the target pandas version before upgrading.
read_csv_opts = dict(
    sep='|',
    quotechar='"',
    compression='gzip',
    encoding='utf-8',
    parse_dates=['MissionDate'],
    infer_datetime_format=True,
)

In [3]:
def _load_dtypes(path):
    """Parse the YAML file at `path` and return the resulting object.

    The result is handed to `pd.read_csv(dtype=...)` by later cells.
    """
    with open(path, 'r') as yamlfile:
        # yaml.load() without an explicit Loader is deprecated since PyYAML 5.1
        # and raises TypeError in PyYAML 6; safe_load also refuses to build
        # arbitrary Python objects from the file.
        # NOTE(review): assumes the dtype specs are plain YAML mappings with no
        # python-specific tags - confirm against the files.
        return yaml.safe_load(yamlfile)


dtypes_1 = _load_dtypes('../input/passenger-dtypes-1.yaml')
dtypes_2 = _load_dtypes('../input/passenger-dtypes-2.yaml')
dtypes_3 = _load_dtypes('../input/mission-dtypes.yaml')

In [4]:
# Passenger file #1, read with the shared CSV options and its own dtype spec.
df_1 = pd.read_csv(
    '../input/ice-air-passengers-1.csv.gz',
    dtype=dtypes_1,
    **read_csv_opts
)

In [5]:
# Passenger file #2, read with the shared CSV options and its own dtype spec.
df_2 = pd.read_csv(
    '../input/ice-air-passengers-2.csv.gz',
    dtype=dtypes_2,
    **read_csv_opts
)

In [6]:
# Missions file, read with the shared CSV options and the mission dtype spec.
missions = pd.read_csv(
    '../input/ice-air-missions.csv.gz',
    dtype=dtypes_3,
    **read_csv_opts
)

In [15]:
# Summary statistics of the MissionDate column in passenger file #1.
df_1.loc[:, 'MissionDate'].describe()

count                 1732625
unique                   2409
top       2012-04-19 00:00:00
freq                     1524
first     2010-10-01 00:00:00
last      2018-12-05 00:00:00
Name: MissionDate, dtype: object

In [16]:
# Summary statistics of the MissionDate column in passenger file #2.
df_2.loc[:, 'MissionDate'].describe()

count                 1826400
unique                   2530
top       2019-04-12 00:00:00
freq                     1532
first     2010-10-01 00:00:00
last      2019-05-04 00:00:00
Name: MissionDate, dtype: object

In [31]:
# Index both passenger frames by MissionDate so they can be resampled by date.
# The column check makes this cell safe to re-run: once MissionDate has been
# moved into the index it is no longer a column, and an unguarded set_index
# would raise KeyError on the second execution.
if 'MissionDate' in df_1.columns:
    df_1.set_index('MissionDate', inplace=True)
if 'MissionDate' in df_2.columns:
    df_2.set_index('MissionDate', inplace=True)

In [41]:
# Non-null AlienMasterID counts per 12-month bin starting each October
# ('AS-OCT'), computed for each passenger frame.
annual_total_1, annual_total_2 = (
    frame['AlienMasterID'].resample('AS-OCT').count()
    for frame in (df_1, df_2)
)

In [42]:
# Per-bin change in record count between file #2 and file #1.
annual_total_2.sub(annual_total_1)

MissionDate
2010-10-01        0
2011-10-01        0
2012-10-01        0
2013-10-01        0
2014-10-01        0
2015-10-01        0
2016-10-01        0
2017-10-01        0
2018-10-01    93775
Freq: AS-OCT, Name: AlienMasterID, dtype: int64

In [44]:
# Non-null AlienMasterID counts per MissionID, for each passenger frame.
g_1, g_2 = (
    frame.groupby('MissionID')['AlienMasterID'].count()
    for frame in (df_1, df_2)
)

In [58]:
# Per-mission change in record count; MissionIDs present in only one of the
# two series come out as NaN because the subtraction aligns on the index.
g_2.sub(g_1)

MissionID
105      0.0
106      0.0
107      0.0
108      0.0
110      0.0
111      0.0
112      0.0
113      0.0
114      0.0
115      0.0
116      0.0
117      0.0
118      0.0
119      0.0
121      0.0
122      0.0
123      0.0
124      0.0
125      0.0
126      0.0
127      0.0
128      0.0
130      0.0
131      0.0
132      0.0
133      0.0
137      0.0
138      0.0
140      0.0
141      0.0
        ... 
48304    NaN
48305    NaN
48306    NaN
48307    NaN
48308    NaN
48309    NaN
48310    NaN
48311    NaN
48312    NaN
48313    NaN
48314    NaN
48315    NaN
48316    NaN
48317    NaN
48318    NaN
48319    NaN
48320    NaN
48321    NaN
48322    NaN
48323    NaN
48324    NaN
48325    NaN
48326    NaN
48328    NaN
48329    NaN
48330    NaN
48331    NaN
48332    NaN
48334    NaN
48335    NaN
Name: AlienMasterID, Length: 15735, dtype: float64

In [66]:
# Values that appear in file #2 but not in file #1, per location column.
new_puloc = set(df_2['PULOC']) - set(df_1['PULOC'])
new_droploc = set(df_2['DropLoc']) - set(df_1['DropLoc'])

In [70]:
# All location codes new to file #2, whether seen as PULOC or DropLoc.
# NOTE: `new_airports` is rebound to a different set further down the notebook.
new_airports = new_puloc | new_droploc
print(new_airports)

{'KPVD', 'FVRG', 'FEFF', 'GOBD', 'FNLU', 'DBBB'}


In [75]:
# CountryOfCitizenship values present in file #2 but absent from file #1.
new_country_of_citizenship = (
    set(df_2['CountryOfCitizenship']) - set(df_1['CountryOfCitizenship'])
)
print(new_country_of_citizenship)

{'GUATEMALAN', 'USC', 'ANGUILLA'}


In [74]:
# Status values present in file #2 but absent from file #1.
new_status = set(df_2['Status']) - set(df_1['Status'])
print(new_status)

{'HI', '169'}


In [76]:
# R-T values present in file #2 but absent from file #1.
new_rt = set(df_2['R-T']) - set(df_1['R-T'])
print(new_rt)

set()


In [77]:
# Sex values present in file #2 but absent from file #1.
new_sex = set(df_2['Sex']) - set(df_1['Sex'])
print(new_sex)

set()


In [78]:
# Criminality values present in file #2 but absent from file #1.
new_criminality = set(df_2['Criminality']) - set(df_1['Criminality'])
print(new_criminality)

set()


In [87]:
# Pipe-delimited airport reference table; later cells use it for ICAO lookups.
airport_db = pd.read_csv(
    '../input/GlobalAirportDatabase.csv.gz',
    sep='|'
)

In [9]:
# Column names in `missions` that hold airport codes: the start, eight
# numbered intermediate stops, and the end.
stops = [
    'MsnStart',
    *('MsnStp' + ordinal for ordinal in
      ('One', 'Two', 'Three', 'Four', 'Five', 'Six', 'Seven', 'Eight')),
    'MsnEnd',
]

In [48]:
# Every distinct code used as PULOC or DropLoc in passenger file #2.
passenger_airports = set(df_2['PULOC']) | set(df_2['DropLoc'])

In [53]:
# Collect every distinct value found in any of the stop columns of `missions`.
mission_airports = set()
for s in stops:
    mission_airports.update(missions[s])

In [58]:
# Comma-separated airport table; its ICAOCode column is used below to filter
# out codes it already contains.
airports_to_merge = pd.read_csv(
    '../../../share/hand/airports_to_merge.csv',
    sep=','
)

In [80]:
# Every code seen in either the passenger data or the missions data.
all_arts_airports = passenger_airports | mission_airports

In [85]:
# Codes in the data that are not listed in airports_to_merge['ICAOCode'].
# NOTE: this rebinds `new_airports`, which an earlier cell used for a
# different set.
new_airports = all_arts_airports - set(airports_to_merge['ICAOCode'])

In [90]:
# Codes that are also absent from the airport reference table.
new_airports - set(airport_db['ICAOCode'])

{'*Add ELP stop*',
 '*Times Updated*',
 '16',
 '69',
 '72',
 'CYOX',
 'DEN',
 'DGGA',
 'ELP',
 'EPMO',
 'FZZA',
 'HKMM',
 'KAFW',
 'KAX',
 'KCMI',
 'KCSH',
 'KGPT',
 'KGSO',
 'KHNL',
 'KKBL',
 'KMGM',
 'KMGT',
 'KSDF',
 'KVPC',
 'LBTA',
 'LTFJ',
 'MIA',
 'RKSI',
 'TBPD',
 'TPBP',
 'UDYZ',
 'WIHH',
 nan}

In [107]:
# Print, as CSV text, the code and decimal coordinates of every reference-table
# row whose ICAOCode is in `new_airports`.
print(
    airport_db
    .loc[airport_db['ICAOCode'].isin(list(new_airports)),
         ['ICAOCode', 'LatitudeDecimalDegrees', 'LongitudeDecimalDegrees']]
    .to_csv()
)

,ICAOCode,LatitudeDecimalDegrees,LongitudeDecimalDegrees
56,BIKF,63.985,-22.605999999999998
221,CYQX,48.937,-54.568000000000005
308,CYYT,47.619,-52.751999999999995
502,DNMN,9.652000000000001,6.462000000000001
1008,EFHK,60.32,24.956
1150,EGKB,51.331,0.033
1153,EGKK,51.148,-0.19
1160,EGLF,51.276,-0.7759999999999999
1330,EHAM,52.309,4.763999999999999
1385,EINN,52.702,-8.925
2005,FAMU,-27.625999999999998,32.044000000000004
2030,FAOR,0.0,0.0
2489,FOGR,-0.7040000000000001,10.245999999999999
2576,FVFA,-18.096,25.839000000000002
2937,GCTS,28.044,-16.572
3319,HTKJ,-3.429,37.074
3415,KBGR,44.806999999999995,-68.828
3430,KBTR,30.533,-91.149
3437,KCAE,33.939,-81.119
3510,KFLL,26.072,-80.153
3537,KGRB,44.485,-88.12899999999999
3561,KHOU,29.645,-95.279
3596,KJFK,40.64,-73.779
3633,KMFE,26.176,-98.23899999999999
3648,KMOB,30.691,-88.243
3656,KMSY,29.993000000000002,-90.258
3711,KOKC,35.393,-97.601
3724,KPBI,26.683000000000003,-80.096
3729,KPHX,33.434,-112.008
3740,KPQI,46.68899999999999,-68.045
3751,

In [98]:
# Distinct PULOC values on rows where air_AirportID is 72.
df_2.loc[df_2['air_AirportID'] == 72, 'PULOC'].unique()

[KJAX]
Categories (1, object): [KJAX]

In [99]:
# Distinct PULOC values on rows where air_AirportID is 69.
df_2.loc[df_2['air_AirportID'] == 69, 'PULOC'].unique()

[KIAH]
Categories (1, object): [KIAH]

In [101]:
# Distinct DropLoc values on rows where air2_AirportID is 16.
df_2.loc[df_2['air2_AirportID'] == 16, 'DropLoc'].unique()

[], Categories (0, object): []