In [1]:
import numpy as np
import pandas as pd
import yaml
import matplotlib.pyplot as plt

In [2]:
# Load the column -> dtype mapping that is handed to pandas below.
# yaml.safe_load replaces the bare yaml.load(yamlfile): calling yaml.load
# without an explicit Loader is deprecated since PyYAML 5.1 and raises
# TypeError on PyYAML >= 6. The safe loader also never instantiates arbitrary
# Python objects from the file.
# NOTE(review): assumes dtypes.yaml holds only plain YAML scalars/mappings
# (no python-specific tags) - confirm.
with open('../input/dtypes.yaml', 'r') as yamlfile:
    column_types = yaml.safe_load(yamlfile)

# Parser options for the gzip-compressed, pipe-delimited input file.
read_csv_opts = {'sep': '|',
                 'quotechar': '"',
                 'compression': 'gzip',
                 'encoding': 'utf-8',
                 'dtype': column_types,
                 # 'MissionDate' is parsed to datetime at load time.
                 'parse_dates': ['MissionDate'],
                 # Kept for older pandas; pandas >= 2.0 deprecates this option.
                 'infer_datetime_format': True}

df = pd.read_csv('../input/ice-air.csv.gz', **read_csv_opts)

In [31]:
# Add an 'FY' column holding an independent copy of the mission dates
# (this mutates df in place).
df['FY'] = df.loc[:, 'MissionDate'].copy()

In [32]:
# Groups rows on the 'FY' column into annual bins; 'AS-OCT' is the pandas
# year-start frequency anchored on October, so each bin starts on 1 October.
fiscal_year_grouper = pd.Grouper(
    key='FY',
    freq='AS-OCT',
)

In [57]:
# Count distinct AlienMasterID values for every combination of fiscal-year
# bin, mission date, mission identifiers, 'R-T' flag and pickup/drop-off
# location, then flatten the grouping keys back into ordinary columns.
g1 = (
    df.groupby([
        fiscal_year_grouper,
        'MissionDate',
        'MissionID',
        'MissionNumber',
        'R-T',
        'PULOC',
        'DropLoc',
    ])['AlienMasterID']
    .nunique()
    .reset_index()
)
print(g1.shape)
g1.head(10)

(43449, 8)


Unnamed: 0,FY,MissionDate,MissionID,MissionNumber,R-T,PULOC,DropLoc,AlienMasterID
0,2010-10-01,2010-10-01,105,110005,T,KMDT,KAEX,122
1,2010-10-01,2010-10-01,106,110006,R,KHRL,MGGT,67
2,2010-10-01,2010-10-01,106,110006,R,KSAT,MGGT,37
3,2010-10-01,2010-10-01,107,110007,R,KAEX,MHLM,97
4,2010-10-01,2010-10-01,108,110008,R,KMCI,KHRL,35
5,2010-10-01,2010-10-01,108,110008,R,KORD,KHRL,63
6,2010-10-01,2010-10-01,108,110008,T,KMCI,KAEX,10
7,2010-10-01,2010-10-01,108,110008,T,KORD,KAEX,25
8,2010-10-01,2010-10-01,131,110002,R,KBFL,KSAN,13
9,2010-10-01,2010-10-01,131,110002,R,KOAK,KSAN,43


In [58]:
# Distinct AlienMasterID counts per fiscal-year bin and country of
# citizenship, pivoted so each pickup location ('PULOC') becomes a column.
# Only the first 10 rows of the pivot are kept in g2.
g2 = (
    df.groupby([fiscal_year_grouper, 'CountryOfCitizenship', 'PULOC'])['AlienMasterID']
    .nunique()
    .unstack()
    .head(10)
)
print(g2.shape)
# g2 already holds just 10 rows, so this shows the whole stored table.
g2.head(10)

(10, 81)


Unnamed: 0_level_0,PULOC,KAEX,KATL,KBFI,KBFL,KCSG,KDAL,KDEN,KELP,KHRL,KIAH,...,KSDM,KVCV,KCHS,KABQ,KYKM,KMEM,KFTW,KPSM,KYNG,KALB
FY,CountryOfCitizenship,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2010-10-01,ALGERIA,1.0,,,1.0,,,,,,,...,,,,,,,,,,
2010-10-01,ARGENTINA,1.0,,1.0,2.0,,,,,,,...,,,,,,,,,,
2010-10-01,ARMENIA,,,,11.0,,,,3.0,,,...,,,,,,,,,,
2010-10-01,BANGLADESH,3.0,,,,,,,3.0,,,...,,,,,,,,,,
2010-10-01,BARBADOS,1.0,,,,,,,1.0,,,...,,,,,,,,,,
2010-10-01,BOLIVIA,18.0,,,1.0,,,,,,,...,,,,,,,,,,
2010-10-01,BOSNIA,1.0,,,1.0,,,,,2.0,,...,,,,,,,,,,
2010-10-01,BRAZIL,73.0,,,2.0,,,,,,,...,,,,,,,,,,
2010-10-01,BRITISH VIRGIN ISLANDS,1.0,,,,,,,,,,...,,,,,,,,,,
2010-10-01,BULGARIA,1.0,,1.0,1.0,,,,1.0,,,...,,,,,,,,,,


In [59]:
# Distinct AlienMasterID counts per fiscal-year bin and country of
# citizenship, pivoted so each drop-off location ('DropLoc') becomes a column.
# Only the first 10 rows of the pivot are kept in g3.
g3 = (
    df.groupby([fiscal_year_grouper, 'CountryOfCitizenship', 'DropLoc'])['AlienMasterID']
    .nunique()
    .unstack()
    .head(10)
)
print(g3.shape)
# g3 already holds just 10 rows, so this shows the whole stored table.
g3.head(10)

(10, 205)


Unnamed: 0_level_0,DropLoc,KAEX,KBFI,KBFL,KELP,KHRL,KIAH,KIWA,KLAS,KLRD,KMCI,...,OMDW,VTBS,GMMN,KPDX,KALB,FCBB,FYWH,HBBA,SBCF,SBBR
FY,CountryOfCitizenship,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2010-10-01,ALGERIA,,,,,1.0,,1.0,,,,...,,,,,,,,,,
2010-10-01,ARGENTINA,1.0,,,2.0,,,10.0,,,,...,,,,,,,,,,
2010-10-01,ARMENIA,,1.0,1.0,2.0,,1.0,13.0,1.0,,,...,,,,,,,,,,
2010-10-01,BANGLADESH,7.0,,,3.0,1.0,1.0,4.0,,,,...,,,,,,,,,,
2010-10-01,BARBADOS,3.0,,,1.0,4.0,,,,,,...,,,,,,,,,,
2010-10-01,BOLIVIA,16.0,,,12.0,3.0,1.0,1.0,,,,...,,,,,,,,,,
2010-10-01,BOSNIA,2.0,1.0,,,3.0,,4.0,1.0,,,...,,,,,,,,,,
2010-10-01,BRAZIL,49.0,1.0,,13.0,4.0,3.0,13.0,1.0,,,...,,,,,,,,,,
2010-10-01,BRITISH VIRGIN ISLANDS,,,,,,,,,,,...,,,,,,,,,,
2010-10-01,BULGARIA,5.0,1.0,1.0,,,,3.0,,,,...,,,,,,,,,,


In [105]:
# Distinct AlienMasterID counts per mission date, 'R-T' flag and
# pickup/drop-off location pair, with the grouping keys restored as columns.
g4 = (
    df.groupby(['MissionDate', 'R-T', 'PULOC', 'DropLoc'])['AlienMasterID']
    .nunique()
    .reset_index()
)

In [106]:
# Disabled: g4 is not grouped by fiscal year, so it has no 'FY' column to convert.
# g4['FY'] = g4['FY'].apply(lambda x: x.year)

In [107]:
# Write the per-flight counts to disk. to_csv's default index=True applies,
# so the frame's index is emitted as an unnamed first column.
g4.to_csv(path_or_buf='../output/flights.csv')

In [96]:
# Unique pickup-location records: code plus the 'air_' name/longitude/latitude
# columns.
pulocs = df.loc[:, [
    'PULOC',
    'air_AirportName',
    'air_LongitudeDecimalDegrees',
    'air_LatitudeDecimalDegrees',
]].drop_duplicates()

In [97]:
# Unique drop-off-location records: code plus the 'air2_' name/longitude/
# latitude columns.
droplocs = df.loc[:, [
    'DropLoc',
    'air2_AirportName',
    'air2_LongitudeDecimalDegrees',
    'air2_LatitudeDecimalDegrees',
]].drop_duplicates()

In [98]:
# Common column names applied to both the pickup and drop-off airport tables.
cols = [
    'AirportCode',
    'AirportName',
    'Longitude',
    'Latitude',
]

In [99]:
# Give both airport tables the same column names so they can be stacked.
pulocs.columns = droplocs.columns = cols

In [102]:
# Stack pickup and drop-off airports, then drop rows that appear in both.
airports = pd.concat([pulocs, droplocs])
airports = airports.drop_duplicates()

In [104]:
# Write the combined airport list to disk. to_csv's default index=True
# applies, so the frame's index is emitted as an unnamed first column.
airports.to_csv(path_or_buf='../output/airports.csv')