In [2]:
import pandas as pd
import numpy as np
import yaml

# Finding optimal data types

Approach adapted from: https://www.dataquest.io/blog/pandas-big-data/

In [3]:
# Shared keyword arguments for every pd.read_csv call on the input files:
# pipe-delimited, double-quote quoting, gzip-compressed, UTF-8 text.
read_csv_opts = dict(
    sep='|',
    quotechar='"',
    compression='gzip',
    encoding='utf-8',
)

In [4]:
# Lookup from a short key ('arts_fy11' ... 'arts_fy19') to the gzipped CSV for that fiscal year.
files = {
    'arts_fy{}'.format(fiscal_year): '../input/ARTS_Passenger_Data_FY{}.csv.gz'.format(fiscal_year)
    for fiscal_year in range(11, 20)
}

In [5]:
# Load the FY11 extract through the `files` lookup so the path is defined in exactly one place.
# NOTE(review): the recorded output under this cell looks like the tail of a warning emitted by
# this read (possibly pandas' mixed-type DtypeWarning) — confirm, and pin dtypes if so.
df = pd.read_csv(files['arts_fy11'], **read_csv_opts)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Summarize dtypes and non-null counts per column; memory_usage='deep' makes pandas measure the
# actual size of object columns rather than estimate it.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233061 entries, 0 to 233060
Data columns (total 43 columns):
Status                  223252 non-null object
Sex                     233059 non-null object
Convictions             220107 non-null object
GangMember              172469 non-null object
ClassLvl                222996 non-null float64
Age                     232582 non-null float64
MissionDate             233061 non-null object
MissionNumber           233061 non-null int64
PULOC                   232537 non-null object
DropLoc                 232606 non-null object
StrikeFromList          0 non-null float64
ReasonStruck            6 non-null object
R-T                     233058 non-null object
Code                    233041 non-null object
CountryOfCitizenship    233027 non-null object
Juvenile                232582 non-null object
MissionWeek             233061 non-null int64
MissionQuarter          233061 non-null int64
MissionYear             233061 non-null int64
Mission

In [7]:
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as a string such as "1.23 MB".

    Parameters
    ----------
    pandas_obj : pd.DataFrame or any object exposing ``memory_usage(deep=True)``
        A DataFrame is totalled across its per-column figures; anything else is
        treated as a Series, whose ``memory_usage`` result is used as-is.

    Returns
    -------
    str
        The size in megabytes (1 MB = 1024 ** 2 bytes), formatted to two decimals
        and suffixed with " MB".
    """
    footprint_bytes = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # A DataFrame reports one figure per column; collapse them into a single total.
        footprint_bytes = footprint_bytes.sum()

    bytes_per_mb = 1024 ** 2
    return "{:03.2f} MB".format(footprint_bytes / bytes_per_mb)

In [8]:
# Pull out the integer columns and ask pandas to downcast each one to an unsigned integer type.
df_int = df.select_dtypes(include=['int'])
converted_int = df_int.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))

In [9]:
# Pull out the float columns and ask pandas to downcast each one to a smaller float type.
df_float = df.select_dtypes(include=['float'])
converted_float = df_float.apply(lambda column: pd.to_numeric(column, downcast='float'))

In [10]:
# Work on a copy so the raw frame stays available for comparison, then swap in the
# downcast integer and float columns.
optimized_df = df.copy()
for downcast_block in (converted_int, converted_float):
    optimized_df[downcast_block.columns] = downcast_block

# Footprint before vs. after the numeric downcast, one figure per line.
print(mem_usage(df), mem_usage(optimized_df), sep='\n')

373.82 MB
354.93 MB


In [11]:
# Take an independent copy of the object-dtype columns and summarize them; describe() on object
# columns reports count / unique / top / freq, which shows how repetitive each column is.
df_obj = df.select_dtypes(include=['object']).copy()
df_obj.describe()

Unnamed: 0,Status,Sex,Convictions,GangMember,MissionDate,PULOC,DropLoc,ReasonStruck,R-T,Code,...,st_StateAbbr,AOR_AOR,AOR_AORName,air_Country,air2_AirportName,air2_City,st2_StateAbbr,aor2_AOR,aor2_AORName,air2_Country
count,223252,233059,220107,172469,233061,232537,232606,6,233058,233041,...,232535,232535,232535,232537,232606,232606,149154,149154,149154,232606
unique,551,3,7591,1470,321,77,99,1,8,7,...,22,21,21,2,69,70,18,19,19,35
top,Removal,MALE,Non-Criminal,No,7/14/2011,KAEX,KHRL,no show,Removal,CR,...,TX,PHO,Phoenix,USA,Valley International Airport,Harlingen,TX,SNA,San Antonio,USA
freq,100138,220259,43693,93686,1268,26279,38852,6,179984,60184,...,50183,32907,32907,232535,56572,56572,81173,73434,73434,149154


In [12]:
# Re-encode repetitive text columns as `category`; columns where at least half of the values are
# distinct stay as plain object columns. NaN counts as a distinct value here because
# Series.unique() reports it.
#
# Start from a copy of df_obj (same index, same column order) and overwrite only the qualifying
# columns, instead of growing an empty DataFrame one `.loc[:, col]` assignment at a time.
converted_obj = df_obj.copy()
num_total_values = len(df_obj)

for col in df_obj.columns:
    num_unique_values = len(df_obj[col].unique())
    # The row-count guard avoids a ZeroDivisionError on an empty frame; nothing is converted then.
    if num_total_values and num_unique_values / num_total_values < 0.5:
        converted_obj[col] = df_obj[col].astype('category')

In [13]:
# Footprint of the object columns before vs. after the category conversion, one figure per line.
print(mem_usage(df_obj), mem_usage(converted_obj), sep='\n')

341.82 MB
7.78 MB


In [14]:
# Swap the converted text columns into the optimized frame, matching them up by column name.
optimized_df[converted_obj.columns] = converted_obj

In [15]:
# Total footprint of the optimized frame at this point in the notebook.
mem_usage(optimized_df)

'20.90 MB'

In [16]:
# Inspect MissionDate on its own: its current footprint and a sample of the raw values.
date = optimized_df['MissionDate']
print(mem_usage(date))
date.head()

0.47 MB


0    10/1/2010
1    10/1/2010
2    10/1/2010
3    10/1/2010
4    10/1/2010
Name: MissionDate, dtype: category
Categories (321, object): [1/10/2011, 1/11/2011, 1/12/2011, 1/13/2011, ..., 9/6/2011, 9/7/2011, 9/8/2011, 9/9/2011]

In [17]:
# Replace the date strings with real datetime values, parsing them as month/day/four-digit-year.
# The recorded outputs show this costs some memory (20.90 MB before, 22.20 MB after).
optimized_df['MissionDate'] = pd.to_datetime(date,format='%m/%d/%Y')

# Footprint after the conversion, followed by a sample of the parsed dates.
print(mem_usage(optimized_df))
optimized_df['MissionDate'].head()

22.20 MB


0   2010-10-01
1   2010-10-01
2   2010-10-01
3   2010-10-01
4   2010-10-01
Name: MissionDate, dtype: datetime64[ns]

In [18]:
# Same summary as for the raw frame, now showing the optimized dtypes and deep memory usage.
optimized_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233061 entries, 0 to 233060
Data columns (total 43 columns):
Status                  223252 non-null category
Sex                     233059 non-null category
Convictions             220107 non-null category
GangMember              172469 non-null category
ClassLvl                222996 non-null float32
Age                     232582 non-null float32
MissionDate             233061 non-null datetime64[ns]
MissionNumber           233061 non-null uint32
PULOC                   232537 non-null category
DropLoc                 232606 non-null category
StrikeFromList          0 non-null float32
ReasonStruck            6 non-null category
R-T                     233058 non-null category
Code                    233041 non-null category
CountryOfCitizenship    233027 non-null category
Juvenile                232582 non-null category
MissionWeek             233061 non-null uint8
MissionQuarter          233061 non-null uint8
MissionYear           

In [19]:
# Sanity check: the optimized frame must have exactly the same column names, in the same order.
assert df.columns.tolist() == optimized_df.columns.tolist()

In [20]:
# Capture the optimized dtypes by name. MissionDate is left out of the mapping; it is parsed as
# a date separately when the files are re-read.
dtypes = optimized_df.drop('MissionDate', axis=1).dtypes

dtypes_col, dtypes_type = dtypes.index, [dtype.name for dtype in dtypes.values]

# Plain {column name: dtype name} dictionary.
column_types = {column: type_name for column, type_name in zip(dtypes_col, dtypes_type)}

In [21]:
# Display the column name -> dtype name mapping.
column_types

{'Status': 'category',
 'Sex': 'category',
 'Convictions': 'category',
 'GangMember': 'category',
 'ClassLvl': 'float32',
 'Age': 'float32',
 'MissionNumber': 'uint32',
 'PULOC': 'category',
 'DropLoc': 'category',
 'StrikeFromList': 'float32',
 'ReasonStruck': 'category',
 'R-T': 'category',
 'Code': 'category',
 'CountryOfCitizenship': 'category',
 'Juvenile': 'category',
 'MissionWeek': 'uint8',
 'MissionQuarter': 'uint8',
 'MissionYear': 'uint16',
 'MissionMonth': 'uint8',
 'Criminality': 'category',
 'FamilyUnitFlag': 'float32',
 'UnaccompaniedFlag': 'float32',
 'AlienMasterID': 'uint32',
 'MissionID': 'uint16',
 'air_AirportID': 'float32',
 'air_AirportName': 'category',
 'air_City': 'category',
 'st_StateID': 'float32',
 'st_StateAbbr': 'category',
 'AOR_AORID': 'float32',
 'AOR_AOR': 'category',
 'AOR_AORName': 'category',
 'air_Country': 'category',
 'air2_AirportID': 'float32',
 'air2_AirportName': 'category',
 'air2_City': 'category',
 'st2_StateID': 'float32',
 'st2_StateAb

In [24]:
# Write the column -> dtype mapping to disk as block-style YAML (default_flow_style=False).
with open('../output/dtypes.yaml', 'w') as outfile:
    outfile.write(yaml.dump(column_types, default_flow_style=False))

In [21]:
# Re-read another fiscal year with the dtype mapping applied at load time, then report the
# footprint. The path comes from the `files` lookup rather than a second hard-coded copy.
# NOTE(review): column_types was derived from FY11 only — confirm that the narrow integer and
# float32 types also fit the values in the other fiscal years.
read_and_optimized = pd.read_csv(files['arts_fy12'], dtype=column_types, **read_csv_opts)

# Parse MissionDate with the same explicit month/day/year format used for FY11, instead of the
# `infer_datetime_format` option that pandas 2.0 deprecated.
# Assumes FY12 uses the same date layout as FY11 — to_datetime raises if a value does not match.
read_and_optimized['MissionDate'] = pd.to_datetime(read_and_optimized['MissionDate'],
                                                   format='%m/%d/%Y')

print(mem_usage(read_and_optimized))
read_and_optimized.head()

26.25 MB


Unnamed: 0,Status,Sex,Convictions,GangMember,ClassLvl,Age,MissionDate,MissionNumber,PULOC,DropLoc,...,air_Country,air2_AirportID,air2_AirportName,air2_City,st2_StateID,st2_StateAbbr,aor2_AORID,aor2_AOR,aor2_AORName,air2_Country
0,TRANSFER,MALE,DUI,,1.0,30.0,2011-10-01,120001,KIWA,KBFI,...,USA,39.0,Boeing Field King County International Airport,Seattle,51.0,WA,22.0,SEA,Seattle,USA
1,Removal,MALE,Traffic,No,1.0,27.0,2011-10-01,120001,KBFI,KSAN,...,USA,107.0,San Diego International Airport,San Diego,5.0,CA,20.0,SND,San Diego,USA
2,Removal,MALE,Assault,No,2.0,22.0,2011-10-01,120001,KBFI,KSAN,...,USA,107.0,San Diego International Airport,San Diego,5.0,CA,20.0,SND,San Diego,USA
3,Removal,MALE,DUI,No,1.0,24.0,2011-10-01,120001,KBFI,KSAN,...,USA,107.0,San Diego International Airport,San Diego,5.0,CA,20.0,SND,San Diego,USA
4,Removal,MALE,DUI,No,1.0,33.0,2011-10-01,120001,KBFI,KSAN,...,USA,107.0,San Diego International Airport,San Diego,5.0,CA,20.0,SND,San Diego,USA
