In [1]:
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

% matplotlib inline

## Question 2

### Download dataset

In [10]:
import os
import urllib.request
import zipfile

# Folder that receives the monthly archives and the CSV files extracted from them.
data_dir = '/data/flights'

# exist_ok keeps the cell re-runnable when the folder is already there.
os.makedirs(data_dir, exist_ok=True)

yr = [2014, 2015, 2016]
mm = np.arange(1,13)

for y in yr:
    for m in mm:
        DATA_URL = 'https://transtats.bts.gov/PREZIP/On_Time_On_Time_Performance_{}_{}.zip'.format(y, m)

        filename = DATA_URL.split('/')[-1]
        filepath = os.path.join(data_dir, filename)
        # The archive is deleted once extracted, so the CSV (same stem as the
        # zip) is the only marker that survives between runs.
        csvpath = os.path.splitext(filepath)[0] + '.csv'

        if os.path.exists(csvpath):
            # Already downloaded and extracted on a previous run.
            continue

        # Reuse an archive left behind by an interrupted run instead of
        # fetching it again.
        if not os.path.exists(filepath):
            filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath)

        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(filepath, 'r') as zip_ref:
            zip_ref.extractall(data_dir)

        # Remove the archive by its full path; the bare file name would be
        # resolved against the current working directory.
        os.remove(filepath)
        print('Successfullly downloaded', filename)

Successfullly downloaded On_Time_On_Time_Performance_2014_1.zip
Successfullly downloaded On_Time_On_Time_Performance_2014_2.zip
Successfullly downloaded On_Time_On_Time_Performance_2014_3.zip
Successfullly downloaded On_Time_On_Time_Performance_2014_4.zip
Successfullly downloaded On_Time_On_Time_Performance_2014_5.zip
Successfullly downloaded On_Time_On_Time_Performance_2014_6.zip
Successfullly downloaded On_Time_On_Time_Performance_2014_7.zip
Successfullly downloaded On_Time_On_Time_Performance_2014_8.zip
Successfullly downloaded On_Time_On_Time_Performance_2014_9.zip
Successfullly downloaded On_Time_On_Time_Performance_2014_10.zip
Successfullly downloaded On_Time_On_Time_Performance_2014_11.zip
Successfullly downloaded On_Time_On_Time_Performance_2014_12.zip
Successfullly downloaded On_Time_On_Time_Performance_2015_1.zip
Successfullly downloaded On_Time_On_Time_Performance_2015_2.zip
Successfullly downloaded On_Time_On_Time_Performance_2015_3.zip
Successfullly downloaded On_Time_On_T

### Change data-format to reduce memory usage

First, we take one example table and inspect how much memory the columns of each data type (float, int and object) use.

In [41]:
# Load a single monthly file (2014, month 1) as the sample table used for the
# memory-usage measurements in the following cells.
filename = '/data/flights/On_Time_On_Time_Performance_2014_1.csv'
data_fl = pd.read_csv(filename)

In [None]:
# Report the deep memory footprint of the sample table, one dtype family at a time.
for dtype in ('float64', 'int64', 'object'):
    # Keep only the columns stored with the current dtype.
    data_dtype = data_fl.select_dtypes(include=[dtype])
    # deep=True also counts the Python string objects held by object columns.
    tt_usage_b = data_dtype.memory_usage(deep=True).sum()
    tt_usage_mb = tt_usage_b / 2 ** 20  # bytes -> MB (2**20 bytes each)
    print("Total memory usage for {} columns: {:.3f} MB".format(dtype, tt_usage_mb))

In [2]:
def memory_usage(df):
    """Print the total deep memory footprint of ``df`` in megabytes.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to measure. The index and the contents of object columns are
        included in the total (``deep=True``).

    Returns
    -------
    None
        The figure is printed, not returned.
    """
    total_bytes = df.memory_usage(deep=True).sum()
    total_mb = total_bytes / 1024 ** 2
    print("Total memory usage {:.3f} MB".format(total_mb))

Before merging all the tables, we downcast the int and float columns to the smallest type that holds their values without losing information. We also store columns that contain only NaN in a smaller float type, and use the category data type for object columns with few distinct values.

In [3]:
def _convert_columns(frame, convert):
    """Apply ``convert`` to each column of ``frame``, keeping index and column order."""
    if frame.shape[1] == 0:
        return frame.copy()
    converted = [convert(frame.iloc[:, pos]) for pos in range(frame.shape[1])]
    return pd.concat(converted, axis=1)


def _downcast_int(col):
    """Shrink an integer column: unsigned when possible, signed if it holds negatives."""
    target = 'integer' if (col < 0).any() else 'unsigned'
    return pd.to_numeric(col, downcast=target)


def _downcast_float(col):
    """Shrink a float column; a column with no values at all is stored as float16."""
    if col.isnull().all():
        return col.astype('float16')
    return pd.to_numeric(col, downcast='float')


def _compact_object(col):
    """Store an object column as category when under half of its values are distinct."""
    num_total_values = len(col)
    # An empty column has no ratio to compute; leave it untouched.
    if num_total_values == 0:
        return col
    num_unique_values = len(col.unique())
    if num_unique_values / num_total_values < 0.5:
        return col.astype('category')
    return col


def down_cast(df, filename=''):
    """Return a copy of ``df`` whose columns use smaller dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to compact. It is not modified.
    filename : str, optional
        Path the table was read from. Only its last ``/``-separated component
        is used, in the progress message. Defaults to an empty string so the
        function can be called on a table that has no source file.

    Returns
    -------
    pandas.DataFrame
        Columns are ordered integer, object, float, then every column of any
        other dtype, which is passed through unchanged.

        * integer columns are down-cast to the smallest unsigned type, or to
          the smallest signed type when they contain negative values;
        * float columns are down-cast with ``pd.to_numeric(downcast='float')``;
          columns that contain only NaN become ``float16``;
        * object columns become ``category`` when fewer than half of their
          values are distinct.
    """
    print('Downcasting...' + filename.split('/')[-1])
    selected = ['int', 'float', 'object']
    df_int = df.select_dtypes(include=['int'])
    df_float = df.select_dtypes(include=['float'])
    df_obj = df.select_dtypes(include=['object'])
    # Columns of any other dtype (bool, datetimes, already-compacted columns)
    # are carried over as-is so that no data is dropped.
    df_other = df.select_dtypes(exclude=selected)

    df_int_cv = _convert_columns(df_int, _downcast_int)
    df_float_cv = _convert_columns(df_float, _downcast_float)
    df_obj_cv = _convert_columns(df_obj, _compact_object)

    parts = [part for part in (df_int_cv, df_obj_cv, df_float_cv, df_other)
             if part.shape[1] > 0]
    if not parts:
        # Nothing to convert: return an empty table on the same index.
        return pd.DataFrame(index=df.index)

    pd_new = pd.concat(parts, axis=1)

    return pd_new

In [62]:
# Measure the sample table before and after down-casting.
memory_usage(data_fl)
# down_cast takes the source path as its second argument; it only uses it to
# label the progress message.
data_fl_cv = down_cast(data_fl, filename)
memory_usage(data_fl_cv)

Total memory usage 800.439 MB
Downcasting...
Total memory usage 199.354 MB


### Process each .csv file and combine them

In [None]:
# Down-cast every monthly file on its own and collect the results; the pieces
# are joined once at the end. Appending to a growing DataFrame inside the loop
# re-copies all accumulated rows on every iteration, and DataFrame.append no
# longer exists in pandas 2.x.
monthly_frames = []

yr = [2014, 2015, 2016]
mm = np.arange(1,13)
for y in yr:
    for m in mm:
        filename = '/data/flights/On_Time_On_Time_Performance_{}_{}.csv'.format(y, m)
        print('Processing ' + filename + '...')

        data_tmp = pd.read_csv(filename)
        data_tmp_cv = down_cast(data_tmp, filename)
        monthly_frames.append(data_tmp_cv)

# Row labels are kept as read from each file, so they repeat across months
# (same result as the successive appends this replaces).
data_fl = pd.concat(monthly_frames)

Processing /data/flights/On_Time_On_Time_Performance_2014_1.csv...
Downcasting...On_Time_On_Time_Performance_2014_1.csv
Processing /data/flights/On_Time_On_Time_Performance_2014_2.csv...
Downcasting...On_Time_On_Time_Performance_2014_2.csv
Processing /data/flights/On_Time_On_Time_Performance_2014_3.csv...
Downcasting...On_Time_On_Time_Performance_2014_3.csv
Processing /data/flights/On_Time_On_Time_Performance_2014_4.csv...
Downcasting...On_Time_On_Time_Performance_2014_4.csv
Processing /data/flights/On_Time_On_Time_Performance_2014_5.csv...
Downcasting...On_Time_On_Time_Performance_2014_5.csv
Processing /data/flights/On_Time_On_Time_Performance_2014_6.csv...
Downcasting...On_Time_On_Time_Performance_2014_6.csv
Processing /data/flights/On_Time_On_Time_Performance_2014_7.csv...
Downcasting...On_Time_On_Time_Performance_2014_7.csv
Processing /data/flights/On_Time_On_Time_Performance_2014_8.csv...
Downcasting...On_Time_On_Time_Performance_2014_8.csv
Processing /data/flights/On_Time_On_Time

In [97]:
# Print a summary (row range, column count, dtype counts, memory usage) of the
# raw table currently held in data_tmp.
data_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 430602 entries, 0 to 430601
Columns: 110 entries, Year to Unnamed: 109
dtypes: float64(70), int64(21), object(19)
memory usage: 361.4+ MB


In [64]:
# List the integer columns of data_fl together with their dtypes and non-null counts.
data_fl.select_dtypes(include=['int']).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471949 entries, 0 to 471948
Data columns (total 21 columns):
Year                  471949 non-null int64
Quarter               471949 non-null int64
Month                 471949 non-null int64
DayofMonth            471949 non-null int64
DayOfWeek             471949 non-null int64
AirlineID             471949 non-null int64
FlightNum             471949 non-null int64
OriginAirportID       471949 non-null int64
OriginAirportSeqID    471949 non-null int64
OriginCityMarketID    471949 non-null int64
OriginStateFips       471949 non-null int64
OriginWac             471949 non-null int64
DestAirportID         471949 non-null int64
DestAirportSeqID      471949 non-null int64
DestCityMarketID      471949 non-null int64
DestStateFips         471949 non-null int64
DestWac               471949 non-null int64
CRSDepTime            471949 non-null int64
CRSArrTime            471949 non-null int64
DistanceGroup         471949 non-null int64
DivAirpor

In [84]:
# List the columns of the down-cast table that are stored as unsigned integers
# (uint8 / uint16 / uint32), with their dtypes and non-null counts.
data_fl_cv.select_dtypes(include=['uint8', 'uint16', 'uint32']).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471949 entries, 0 to 471948
Data columns (total 21 columns):
Year                  471949 non-null uint16
Quarter               471949 non-null uint8
Month                 471949 non-null uint8
DayofMonth            471949 non-null uint8
DayOfWeek             471949 non-null uint8
AirlineID             471949 non-null uint16
FlightNum             471949 non-null uint16
OriginAirportID       471949 non-null uint16
OriginAirportSeqID    471949 non-null uint32
OriginCityMarketID    471949 non-null uint16
OriginStateFips       471949 non-null uint8
OriginWac             471949 non-null uint8
DestAirportID         471949 non-null uint16
DestAirportSeqID      471949 non-null uint32
DestCityMarketID      471949 non-null uint16
DestStateFips         471949 non-null uint8
DestWac               471949 non-null uint8
CRSDepTime            471949 non-null uint16
CRSArrTime            471949 non-null uint16
DistanceGroup         471949 non-null uint