In [1]:
import numpy as np
import pandas as pd
import yaml
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sns

# Import

First, we open a file containing a dictionary of column names and their associated data types (generated in `../../optimize/`); then we read the input file.

In [2]:
# Load the column -> dtype mapping used to keep the DataFrame compact.
# yaml.safe_load() is used instead of a bare yaml.load(): calling yaml.load()
# without an explicit Loader is deprecated since PyYAML 5.1 and raises a
# TypeError on PyYAML 6+.
# NOTE(review): safe_load only accepts plain YAML scalars/collections; this
# assumes dtypes.yaml stores dtype names as ordinary strings -- confirm.
with open('../input/dtypes.yaml', 'r') as yamlfile:
    column_types = yaml.safe_load(yamlfile)

# Options describing the pipe-delimited, gzip-compressed input file.
# NOTE: 'infer_datetime_format' is deprecated from pandas 2.0 onwards; it is
# kept here so the notebook still behaves the same on older pandas versions.
read_csv_opts = {'sep': '|',
                 'quotechar': '"',
                 'compression': 'gzip',
                 'encoding': 'utf-8',
                 'dtype': column_types,
                 'parse_dates': ['MissionDate'],
                 'infer_datetime_format': True}

df = pd.read_csv('../input/ice-air.csv.gz', **read_csv_opts)

# 'deep' memory usage also accounts for the contents of object columns.
df.info(memory_usage='deep')

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1763020 entries, 0 to 1763019
Data columns (total 43 columns):
Status                  category
Sex                     category
Convictions             category
GangMember              category
ClassLvl                float32
Age                     float32
MissionDate             datetime64[ns]
MissionNumber           uint32
PULOC                   category
DropLoc                 category
StrikeFromList          float32
ReasonStruck            category
R-T                     category
Code                    category
CountryOfCitizenship    category
Juvenile                object
MissionWeek             uint8
MissionQuarter          uint8
MissionYear             uint16
MissionMonth            uint8
Criminality             category
FamilyUnitFlag          float32
UnaccompaniedFlag       float32
AlienMasterID           uint32
MissionID               uint16
air_AirportID           float32
air_AirportName         category
air_City       

In [3]:
# Distinct citizenship values, in the order returned by unique().
[*df['CountryOfCitizenship'].unique()]

['GUATEMALA',
 'BULGARIA',
 'DOMINICAN REPUBLIC',
 'ECUADOR',
 'EL SALVADOR',
 'HONDURAS',
 'URUGUAY',
 'BRAZIL',
 'COLOMBIA',
 'EGYPT',
 'GHANA',
 'GUYANA',
 'INDIA',
 'JAMAICA',
 'MEXICO',
 'MOZAMBIQUE',
 'RUSSIA',
 'SRI LANKA',
 'TRINIDAD',
 'PANAMA',
 'PERU',
 'BOLIVIA',
 'PHILIPPINES',
 'PAKISTAN',
 'NICARAGUA',
 'KUWAIT',
 'CYPRUSS',
 'CAMBODIA',
 'JORDAN',
 'SOMALIA',
 'CANADA',
 'ETHIOPIA',
 'IRAQ',
 'INDONESIA',
 'CUBA',
 'ERITREA',
 'SOUTH KOREA',
 'SLOVAKIA',
 'PORTUGAL',
 'UNKNOWN',
 'KENYA',
 'FRANCE',
 'TAIWAN',
 'THAILAND',
 'MONGOLIA',
 'ROMANIA',
 'LIBERIA',
 'SUDAN',
 'CAMEROON',
 'LAOS',
 'SPAIN',
 'VIETNAM',
 'FIJI',
 'POLAND',
 'GERMANY',
 'CZECH REPUBLIC',
 'ALGERIA',
 'ISRAEL',
 'UKRAINE',
 'BOSNIA',
 'UZBEKISTAN',
 'VENEZUELA',
 'TURKEY',
 "COTE D'IVORIE (IVORY COAST)",
 'ARMENIA',
 'CHINA',
 'SWEDEN',
 'NEPAL',
 'ITALY',
 'NIGER',
 'NIGERIA',
 'BRITISH VIRGIN ISLANDS',
 'CAPE VERDE',
 'ARGENTINA',
 'HAITI',
 'SYRIA',
 'TOGO',
 'NEW ZEALAND',
 'PARAGUAY',
 'RWAN

# Global variables

In [4]:
# Total number of rows in the dataset.
number_of_records = df.shape[0]
print(f'Number of records: {number_of_records}')

Number of records: 1763020


No missing values in ID fields. IDs repeat.

In [5]:
# Guard first: the distinct count below is only meaningful if no ID is missing.
# .any() is evaluated inside pandas instead of summing ~1.7M booleans in Python.
assert not df['MissionID'].isnull().any(), 'MissionID contains missing values'

# nunique() counts distinct values without building a Python set of every row.
unique_MissionID = df['MissionID'].nunique()
print(f"Unique MissionID values: {unique_MissionID}")

Unique MissionID values: 14973


In [6]:
# Guard first: the distinct count below is only meaningful if no value is missing.
# .any() is evaluated inside pandas instead of summing ~1.7M booleans in Python.
assert not df['MissionNumber'].isnull().any(), 'MissionNumber contains missing values'

# nunique() counts distinct values without building a Python set of every row.
unique_MissionNumber = df['MissionNumber'].nunique()
print(f"Unique MissionNumber values: {unique_MissionNumber}")

Unique MissionNumber values: 14973


There is a one-to-one relationship between `MissionID` and `MissionNumber`, so these fields appear to be equivalent.

In [7]:
# Check the MissionID <-> MissionNumber mapping in both directions.
# Grouping by both columns at once and counting distinct MissionNumber values
# would never fail: MissionNumber is constant inside every
# (MissionID, MissionNumber) group, so that count is always 1.
# Instead, reduce to the distinct pairs; the mapping is one-to-one exactly when
# neither column repeats among those pairs.
mission_pairs = df[['MissionID', 'MissionNumber']].drop_duplicates()
assert mission_pairs['MissionID'].is_unique, \
    'Some MissionID maps to more than one MissionNumber'
assert mission_pairs['MissionNumber'].is_unique, \
    'Some MissionNumber maps to more than one MissionID'

In [8]:
# Guard first: the distinct count below is only meaningful if no ID is missing.
# .any() is evaluated inside pandas instead of summing ~1.7M booleans in Python.
assert not df['AlienMasterID'].isnull().any(), 'AlienMasterID contains missing values'

# nunique() counts distinct values without building a Python set of every row.
unique_AlienMasterID = df['AlienMasterID'].nunique()
print(f"Unique AlienMasterID values: {unique_AlienMasterID}")

Unique AlienMasterID values: 1733555


In [9]:
# Date of the first mission present in the data.
earliest_record = df.loc[:, 'MissionDate'].min()
print(f'Earliest record: {earliest_record}')

Earliest record: 2010-10-01 00:00:00


In [10]:
# Date of the most recent mission present in the data.
latest_record = df['MissionDate'].max()
# The label must say "Latest": this value is the maximum MissionDate.
print(f'Latest record: {latest_record}')

Earliest record: 2018-12-05 00:00:00


In [11]:
# Distinct airport codes on each side of a flight leg.
# unique() keeps a missing code (if any) as a single entry, exactly as a set would.
number_of_pickup_airports = len(df['PULOC'].unique())
number_of_dropoff_airports = len(df['DropLoc'].unique())

print(f'Number of pickup airports: {number_of_pickup_airports}')
print(f'Number of dropoff airports: {number_of_dropoff_airports}')

Number of pickup airports: 84
Number of dropoff airports: 207


Some airport codes are missing their full name and other details. Clean if necessary:

In [12]:
# Count every (airport name, pickup code) pair that actually occurs.
# observed=True is explicit because both keys are categorical: without it,
# pandas versions that default to observed=False expand the result to every
# name x code combination, which would make every code look ambiguous.
# Rows whose airport name is missing are dropped by groupby.
group = (df.groupby(['air_AirportName', 'PULOC'], observed=True)
           .size()
           .reset_index(name='count'))

# Number of distinct names seen per code. This relies on PULOC being
# categorical, so codes that never appear with a name are listed with a count
# of 0 rather than being omitted.
s = group['PULOC'].value_counts()

# Anything other than exactly one name is a problem: 0 = no name, >1 = conflicting names.
missing_air_AirportName = list(s[s != 1].index)
print(f'Pickup airports missing full name: {missing_air_AirportName}')

Pickup airports missing full name: ['KALB']


In [13]:
# Count every (airport name, dropoff code) pair that actually occurs.
# observed=True is explicit so that categorical keys are not expanded to every
# name x code combination on pandas versions that default to observed=False,
# which would make every code look ambiguous.
# Rows whose airport name is missing are dropped by groupby.
group = (df.groupby(['air2_AirportName', 'DropLoc'], observed=True)
           .size()
           .reset_index(name='count'))

# Number of distinct names seen per code. This relies on DropLoc being
# categorical, so codes that never appear with a name are listed with a count
# of 0 rather than being omitted.
s = group['DropLoc'].value_counts()

# Anything other than exactly one name is a problem: 0 = no name, >1 = conflicting names.
missing_air2_AirportName = list(s[s != 1].index)
print(f'Dropoff airports missing full name: {missing_air2_AirportName}')

Dropoff airports missing full name: ['FCBB', 'HBBA', 'FYWH', 'KMHR', 'KALB', 'NSFA', 'KMDW', 'SBCF']


In [14]:
# Ten most frequent pickup airport names, by number of rows.
df['air_AirportName'].value_counts().iloc[:10]

Alexandria International Airport                        239911
Brownsville South Padre Island International Airport    204895
Phoenix-Mesa-Gateway Airport                            196150
Valley International Airport                            154138
El Paso International Airport                           136127
San Antonio International Airport                       107434
Columbus Metropolitan Airport                            80797
Harrisburg International Airport                         76348
Laredo International Airport                             56409
Miami International Airport                              55588
Name: air_AirportName, dtype: int64

In [15]:
# Ten most frequent dropoff airport names, by number of rows.
df['air2_AirportName'].value_counts().iloc[:10]

La Aurora Airport                                       327010
Ramon Villeda Morales International Airport             202955
Alexandria International Airport                        169316
El Salvador International Airport                       157773
Valley International Airport                            148700
Phoenix-Mesa-Gateway Airport                             93892
El Paso International Airport                            87408
Licenciado Benito Juarez International Airport           85584
Brownsville South Padre Island International Airport     66444
San Diego International Airport                          62943
Name: air2_AirportName, dtype: int64

In [16]:
# Distinct pickup-side countries, kept as a set for the union computed later.
# Deduplicating inside pandas first avoids iterating over every row in Python.
pickup_countries = set(df['air_Country'].unique())
print(f'Number of pickup countries: {len(pickup_countries)}')

Number of pickup countries: 11


In [17]:
# Distinct dropoff-side countries, kept as a set for the union computed later.
# Deduplicating inside pandas first avoids iterating over every row in Python.
dropoff_countries = set(df['air2_Country'].unique())
print(f'Number of dropoff countries: {len(dropoff_countries)}')

Number of dropoff countries: 116


In [18]:
# Every country that appears as either a pickup or a dropoff location.
all_countries = pickup_countries | dropoff_countries
# Report the size of the union itself, not of the dropoff set alone.
print(f'Total countries in ICE Air network (including US): {len(all_countries)}')

Total countries in ICE Air network (including US): 116


In [19]:
# Count each AlienMasterID once (drop_duplicates keeps its first row), then
# show the 15 most common countries of citizenship.
(
    df.drop_duplicates(subset='AlienMasterID')
      .loc[:, 'CountryOfCitizenship']
      .value_counts()
      .head(15)
)

MEXICO                568038
GUATEMALA             421400
HONDURAS              304683
EL SALVADOR           256825
DOMINICAN REPUBLIC     31704
ECUADOR                25686
HAITI                  23308
NICARAGUA              18679
COLOMBIA               14519
JAMAICA                12800
INDIA                   8952
CHINA                   4459
CUBA                    3658
BANGLADESH              2790
SOMALIA                 2659
Name: CountryOfCitizenship, dtype: int64

In [20]:
# Lookup tables from airport code to full airport name, one per flight side.
pickup_names = (
    df[['PULOC', 'air_AirportName']]
    .drop_duplicates()
    .set_index('PULOC')
)
dropoff_names = (
    df[['DropLoc', 'air2_AirportName']]
    .drop_duplicates()
    .set_index('DropLoc')
)

pickup_dict = pickup_names.to_dict()['air_AirportName']
dropoff_dict = dropoff_names.to_dict()['air2_AirportName']

# Combined lookup; where a code exists on both sides the dropoff entry wins.
airport_dict = dict(pickup_dict)
airport_dict.update(dropoff_dict)

# Finding repeat `AlienMasterID` values

In [21]:
# Number of rows per AlienMasterID, as a one-column frame indexed by the ID.
count = (
    df.groupby(['AlienMasterID'])
      .size()
      .reset_index(name='count')
      .set_index('AlienMasterID')
)

# Re-index the main frame by AlienMasterID and attach the per-ID row count.
# NOTE: after this step AlienMasterID is the index rather than a column, so
# this cell cannot be re-run without reloading df first.
df = df.set_index('AlienMasterID').join(count, how='left')

# Keep only the records whose ID appears more than once and export them.
more_than_one = df['count'] > 1
multiple_alienID = df.loc[more_than_one]

multiple_alienID.to_csv('../output/multiple_alienID.csv.gz', sep='|', compression='gzip')