In [9]:
import numpy as np
import pandas as pd
import yaml

In [10]:
with open('../input/passenger-dtypes.yaml', 'r') as yamlfile:
    column_types = yaml.load(yamlfile)

read_csv_opts = {'sep': '|',
                 'quotechar': '"',
                 'compression': 'gzip',
                 'encoding': 'utf-8',
                 'dtype': column_types,
                 'parse_dates': ['MissionDate'],
                 'infer_datetime_format': True}

df2 = pd.read_csv('../input/ice-air-passengers-2.csv.gz', **read_csv_opts)

In [11]:
df3 = pd.read_csv('../input/ice-air-passengers-3.csv.gz', **read_csv_opts)

In [12]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1826402 entries, 0 to 1826401
Data columns (total 52 columns):
 #   Column                        Dtype         
---  ------                        -----         
 0   ANumber                       category      
 1   LastName                      category      
 2   FirstName                     category      
 3   DOB                           category      
 4   Status                        category      
 5   Sex                           category      
 6   Convictions                   category      
 7   GangMember                    category      
 8   ClassLvl                      float32       
 9   Age                           float32       
 10  MissionDate                   datetime64[ns]
 11  MissionNumber                 uint32        
 12  PULOC                         category      
 13  DropLoc                       category      
 14  StrikeFromList                float32       
 15  ReasonStruck                  ca

In [13]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 341082 entries, 0 to 341081
Data columns (total 52 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   ANumber                       341082 non-null  category      
 1   LastName                      341082 non-null  category      
 2   FirstName                     341082 non-null  category      
 3   DOB                           341082 non-null  category      
 4   Status                        341082 non-null  category      
 5   Sex                           341079 non-null  category      
 6   Convictions                   333163 non-null  category      
 7   GangMember                    333003 non-null  category      
 8   ClassLvl                      68 non-null      float32       
 9   Age                           340883 non-null  float32       
 10  MissionDate                   341082 non-null  datetime64[ns]
 11  MissionNumber

In [14]:
df2['MissionDate'].describe()

  """Entry point for launching an IPython kernel.


count                 1826402
unique                   2530
top       2019-04-12 00:00:00
freq                     1532
first     2010-10-01 00:00:00
last      2019-05-04 00:00:00
Name: MissionDate, dtype: object

In [15]:
df2['MissionNumber'].min()

110001

In [16]:
df2['MissionNumber'].max()

191228

In [17]:
len(set(df2['MissionNumber']))

15735

In [18]:
df3['MissionDate'].describe()

  """Entry point for launching an IPython kernel.


count                  341082
unique                    483
top       2019-04-12 00:00:00
freq                     1532
first     2018-10-01 00:00:00
last      2020-05-08 00:00:00
Name: MissionDate, dtype: object

In [19]:
len(set(df3['MissionNumber']))

2965

In [20]:
df2 = df2.loc[df2['MissionNumber'] < df3['MissionNumber'].min()]

In [21]:
len(set(df2['MissionNumber']))

14651

In [22]:
df = pd.concat([df2, df3])

In [23]:
len(set(df['MissionNumber']))

17616

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2036458 entries, 0 to 341081
Data columns (total 52 columns):
 #   Column                        Dtype         
---  ------                        -----         
 0   ANumber                       category      
 1   LastName                      category      
 2   FirstName                     category      
 3   DOB                           category      
 4   Status                        object        
 5   Sex                           category      
 6   Convictions                   object        
 7   GangMember                    object        
 8   ClassLvl                      float32       
 9   Age                           float32       
 10  MissionDate                   datetime64[ns]
 11  MissionNumber                 uint32        
 12  PULOC                         object        
 13  DropLoc                       object        
 14  StrikeFromList                float32       
 15  ReasonStruck                  obj

In [25]:
repl = {'REMOVAL': 'R',
 'TRANSFER': 'T'}

df['R-T'] = df['R-T'].str.upper()
df['R-T'] = df['R-T'].replace(repl)

removals = df['R-T'] == 'R'
transfers = df['R-T'] == 'T'

status_count = df[removals]['Status'].str.upper().value_counts(dropna=False)

status = pd.read_csv('../../../share/resources/status.csv')

valid_status_codes = list(status['Code'])

status_dict = dict(zip(status['Code'], status['Status']))

pending = ['2A', '2B', '8A', '8B', '8D']
benefit = ['5C', '5D']
expedited_admin_rein = ['8F', '8G', '8H', '8I', '11', '16']
problematic = pending + benefit + expedited_admin_rein

In [26]:
problematic_count = 0
valid = df[removals]['Status'].str.upper().isin(valid_status_codes)
valid_count = sum(valid)
invalid = ~df[removals]['Status'].str.upper().isin(valid_status_codes)
invalid_count = sum(invalid)

print(f'ICE Air - All removals')
print()
print(f'ICE status codes for passengers on direct removal flights.')
print(f'(Statuses which may raise due process concerns starred)')
print()
print(f'{valid_count} valid values.')
print(f'{invalid_count} invalid values.')
print()
for c in valid_status_codes:
    
    try:
        count = status_count[c]
    except KeyError:
        count = 0
    
    if c in problematic:
        problematic_count = problematic_count + count
        print(f'[{c}]: *{count}* ({status_dict[c]})')
    else:
        print(f'[{c}]: {count} ({status_dict[c]})')
        
#assert sum(df[removals]['Status'].isin(problematic)) == problematic_count
print()
print(f'{problematic_count} total problematic cases.')
print(f"{sum(df.loc[removals, 'Status'].isin(pending))} with pending appeals.")
print(f"{sum(df.loc[removals, 'Status'].isin(benefit))} with benefit blocking deportation.")
print(f"{sum(df.loc[removals, 'Status'].isin(expedited_admin_rein))} under expedited, administrative, reinstatement of removal.")

ICE Air - All removals

ICE status codes for passengers on direct removal flights.
(Statuses which may raise due process concerns starred)

1043531 valid values.
396374 invalid values.

[1A]: 55172 (Voluntary Departure – Un-Expired and Un-Extended Voluntary Departure.)
[1B]: 291 (Voluntary Departure - Extended Departure Period)
[1C]: 8 (Exipred Voluntary Departure Period - Referred to Investigation)
[2A]: *2609* (Deportable – Under Adjudication by IJ)
[2B]: *115* (Deportable – Under Adjudication by BIA)
[3]: 26714 (Deportable – Administratively Final Order)
[5A]: 16 (Referred for Investigation - No Show for Hearing - No Final O...)
[5B]: 918 (Removable – ICE Fugitive)
[5C]: *17* (Relief Granted – Withholding of Deportation/Removal)
[5D]: *100* (Final Order of Deportation/Removal – Deferred Action Granted.)
[5E]: 66 (Relief Granted – Extended Voluntary Departure)
[5F]: 70 (Unable to Obtain Travel Document)
[8A]: *2254* (Excludable/Inadmissible – Hearing Not Commenced)
[8B]: *5636* (Excl

In [27]:
df['MissionDate'].describe()

  """Entry point for launching an IPython kernel.


count                 2036458
unique                   2840
top       2019-04-12 00:00:00
freq                     1532
first     2010-10-01 00:00:00
last      2020-05-08 00:00:00
Name: MissionDate, dtype: object

In [28]:
len(set(df['MissionNumber']))

17616