In [1]:
import pandas as pd
import numpy as np
import yaml

In [2]:
with open('../input/dtypes.yaml', 'r') as yamlfile:
    column_types = yaml.load(yamlfile)

In [3]:
read_csv_opts = {'sep': '|',
                 'quotechar': '"',
                 'compression': 'gzip',
                 'encoding': 'utf-8',
                 'dtype': column_types,
                 'parse_dates': ['MissionDate'],
                 'infer_datetime_format': True}

In [4]:
arts_fy11 = pd.read_csv('../input/ARTS_Passenger_Data_FY11.csv.gz', **read_csv_opts)
arts_fy12 = pd.read_csv('../input/ARTS_Passenger_Data_FY12.csv.gz', **read_csv_opts)
arts_fy13 = pd.read_csv('../input/ARTS_Passenger_Data_FY13.csv.gz', **read_csv_opts)
arts_fy14 = pd.read_csv('../input/ARTS_Passenger_Data_FY14.csv.gz', **read_csv_opts)
arts_fy15 = pd.read_csv('../input/ARTS_Passenger_Data_FY15.csv.gz', **read_csv_opts)
arts_fy16 = pd.read_csv('../input/ARTS_Passenger_Data_FY16.csv.gz', **read_csv_opts)
arts_fy17 = pd.read_csv('../input/ARTS_Passenger_Data_FY17.csv.gz', **read_csv_opts)
arts_fy18 = pd.read_csv('../input/ARTS_Passenger_Data_FY17.csv.gz', **read_csv_opts)
arts_fy19 = pd.read_csv('../input/ARTS_Passenger_Data_FY17.csv.gz', **read_csv_opts)

In [5]:
files = [arts_fy11,
         arts_fy12,
         arts_fy13,
         arts_fy14,
         arts_fy15,
         arts_fy16,
         arts_fy17,
         arts_fy18,
         arts_fy19]

In [6]:
for file in files:
    print(file.columns)

Index(['Status', 'Sex', 'Convictions', 'GangMember', 'ClassLvl', 'Age',
       'MissionDate', 'MissionNumber', 'PULOC', 'DropLoc', 'StrikeFromList',
       'ReasonStruck', 'R-T', 'Code', 'CountryOfCitizenship', 'Juvenile',
       'MissionWeek', 'MissionQuarter', 'MissionYear', 'MissionMonth',
       'Criminality', 'FamilyUnitFlag', 'UnaccompaniedFlag', 'AlienMasterID',
       'MissionID', 'air_AirportID', 'air_AirportName', 'air_City',
       'st_StateID', 'st_StateAbbr', 'AOR_AORID', 'AOR_AOR', 'AOR_AORName',
       'air_Country', 'air2_AirportID', 'air2_AirportName', 'air2_City',
       'st2_StateID', 'st2_StateAbbr', 'aor2_AORID', 'aor2_AOR',
       'aor2_AORName', 'air2_Country'],
      dtype='object')
Index(['Status', 'Sex', 'Convictions', 'GangMember', 'ClassLvl', 'Age',
       'MissionDate', 'MissionNumber', 'PULOC', 'DropLoc', 'StrikeFromList',
       'ReasonStruck', 'R-T', 'Code', 'CountryOfCitizenship', 'Juvenile',
       'MissionWeek', 'MissionQuarter', 'MissionYear', 'Missi

In [7]:
# Concatenating DataFrames converts 'category' columns back to 'object' when new values introduced
df = pd.concat(files)

In [8]:
df_obj = df.select_dtypes(include=['object']).copy()

In [9]:
converted_obj = pd.DataFrame()

for col in df_obj.columns:
    num_unique_values = len(df_obj[col].unique())
    num_total_values = len(df_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = df_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = df_obj[col]

In [10]:
df[converted_obj.columns] = converted_obj

In [11]:
# We can convert columns back to 'category' type but resulting DataFrame is bigger than sum of FY DataFrames.
# May be better to work on each FY separately?
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1894881 entries, 0 to 180383
Data columns (total 43 columns):
Status                  category
Sex                     category
Convictions             category
GangMember              category
ClassLvl                float32
Age                     float32
MissionDate             datetime64[ns]
MissionNumber           uint32
PULOC                   category
DropLoc                 category
StrikeFromList          float32
ReasonStruck            category
R-T                     category
Code                    category
CountryOfCitizenship    category
Juvenile                category
MissionWeek             uint8
MissionQuarter          uint8
MissionYear             uint16
MissionMonth            uint8
Criminality             category
FamilyUnitFlag          float32
UnaccompaniedFlag       float32
AlienMasterID           uint32
MissionID               uint16
air_AirportID           float32
air_AirportName         category
air_City      

In [12]:
len(set(df['MissionID']))

13037

In [13]:
len(set(df['AlienMasterID']))

1504659

In [14]:
df['MissionDate'].min()

Timestamp('2010-10-01 00:00:00')

In [15]:
df['MissionDate'].max()

Timestamp('2017-09-29 00:00:00')

In [16]:
df['air_AirportName'].value_counts().head(10)

Alexandria International Airport                        269981
Brownsville South Padre Island International Airport    215152
Phoenix-Mesa-Gateway Airport                            203974
Valley International Airport                            154138
El Paso International Airport                           153668
San Antonio International Airport                       109871
Columbus Metropolitan Airport                            92299
Harrisburg International Airport                         83513
Laredo International Airport                             67748
Miami International Airport                              61954
Name: air_AirportName, dtype: int64

In [17]:
df['air2_AirportName'].value_counts().head(10)

La Aurora Airport                                       333482
Ramon Villeda Morales International Airport             211282
Alexandria International Airport                        197005
El Salvador International Airport                       176221
Valley International Airport                            148700
Licenciado Benito Juarez International Airport          105593
Phoenix-Mesa-Gateway Airport                            101036
El Paso International Airport                            92775
Brownsville South Padre Island International Airport     72871
San Diego International Airport                          65693
Name: air2_AirportName, dtype: int64

In [18]:
with open('../../share/hand/clean.yaml', 'r') as yamlfile:
    clean = yaml.load(yamlfile)

In [19]:
df['R-T'] = df['R-T'].replace(clean['R-T'])
df['R-T'] = df['R-T'].astype('category')

In [20]:
df['Sex'] = df['Sex'].replace(clean['Sex'])
df['Sex'] = df['Sex'].astype('category')

In [21]:
df['R-T'].value_counts()

R             1337592
T              557270
Ineligible          2
No-show             1
Name: R-T, dtype: int64

In [22]:
df['Sex'].value_counts()

M    1734008
F     160713
U        156
Name: Sex, dtype: int64

In [30]:
# This column had a bunch of random-seeming numeric strings, replace removes any string that's just numbers
# I checked and there were no '13' or '15' or '18' values before replace
# Numeric strings didn't seem to be result of data shifted into column but need to investigate more
df['GangMember'] = df['GangMember'].str.replace('^[0-9]*$', '')
df['GangMember'].value_counts()
df['GangMember'] = df['GangMember'].astype('category')

In [32]:
df['Convictions'].value_counts()

NC                                        576804
Non-Criminal                               97397
DUI                                        94815
IE                                         86593
NON CRIMINAL                               76709
DRUGS                                      65542
Illegal Entry                              56147
IV                                         48642
Other - Non-Violent                        39155
Assault                                    37172
ASSLT                                      36954
Driving Under Influence Liquor             36248
Drugs                                      32436
NONE                                       25154
Non-Crim                                   22835
Traffic Offense                            10789
Aggravated Assault                         10489
FRAUD                                      10371
Robbery                                    10019
Assault / Robbery                           9531
Felony Drug Sale, Sm