In [4]:
import os
import zipfile

import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Opt-in hook: only when MODERN_PANDAS_EPUB is set to a non-zero integer,
# import the local `prep` module and print a marker.
epub_build = int(os.environ.get("MODERN_PANDAS_EPUB", 0)) != 0
if epub_build:
    import prep
    print("yes")

In [6]:
import requests

# Request headers sent with the POST to the BTS TranStats download endpoint.
headers = {
    'Referer': 'https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time',
    'Origin': 'https://www.transtats.bts.gov',
    'Content-Type': 'application/x-www-form-urlencoded',
}

# Query-string parameters appended to the download URL.
params = (
    ('Table_ID', '236'),
    ('Has_Group', '3'),
    ('Is_Zipped', '0'),
)

# The form-encoded request body is read from a file next to the notebook.
with open('modern-1-url.txt', encoding='utf-8') as f:
    data = f.read().strip()

os.makedirs('data', exist_ok=True)
dest = "data/flights.csv.zip"

# Download only when the archive is not already cached on disk.
if not os.path.exists(dest):
    r = requests.post('https://www.transtats.bts.gov/DownLoad_Table.asp',
                      headers=headers, params=params, data=data, stream=True)
    # Stream into a sibling temporary file and rename it at the very end, so a
    # failed or interrupted download never leaves something at `dest` that the
    # existence check above would later mistake for a complete archive.
    partial = dest + ".part"
    try:
        # Fail loudly on a 4xx/5xx reply instead of saving an error page as the zip.
        r.raise_for_status()
        with open(partial, 'wb') as f:
            for chunk in r.iter_content(chunk_size=102400):
                if chunk:
                    f.write(chunk)
    finally:
        # With stream=True the connection stays open until explicitly released.
        r.close()
    os.replace(partial, dest)

In [7]:
# Extract the first member of the downloaded archive into data/.
# The context manager closes the archive handle once extraction is done
# (the original left the ZipFile open for the life of the kernel).
with zipfile.ZipFile("data/flights.csv.zip") as zf:
    fp = zf.extract(zf.namelist()[0], path='data/')

# Parse FL_DATE as datetimes and lower-case every column name.
df = pd.read_csv(fp, parse_dates=["FL_DATE"]).rename(columns=str.lower)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450017 entries, 0 to 450016
Data columns (total 33 columns):
fl_date                  450017 non-null datetime64[ns]
unique_carrier           450017 non-null object
airline_id               450017 non-null int64
tail_num                 449378 non-null object
fl_num                   450017 non-null int64
origin_airport_id        450017 non-null int64
origin_airport_seq_id    450017 non-null int64
origin_city_market_id    450017 non-null int64
origin                   450017 non-null object
origin_city_name         450017 non-null object
dest_airport_id          450017 non-null int64
dest_airport_seq_id      450017 non-null int64
dest_city_market_id      450017 non-null int64
dest                     450017 non-null object
dest_city_name           450017 non-null object
crs_dep_time             450017 non-null int64
dep_time                 441476 non-null float64
dep_delay                441476 non-null float64
taxi_out                

In [8]:
# Deliberately uses the deprecated `.ix` indexer so the deprecation warning
# is shown in the output. The result includes row 15, i.e. the slice 10:15
# was resolved as labels (inclusive) on this frame's RangeIndex.
df.ix[10:15, ['fl_date', 'tail_num']]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,fl_date,tail_num
10,2017-01-01,N855AA
11,2017-01-01,N785AA
12,2017-01-01,N785AA
13,2017-01-01,N790AA
14,2017-01-01,N784AA
15,2017-01-01,N783AA


In [9]:
# Silence DeprecationWarning from now on, so later cells that still use
# deprecated APIs do not repeat the warning text in their output.
import warnings
warnings.simplefilter(action="ignore", category=DeprecationWarning)

In [10]:
# First row of each airline_id group, restricted to two columns.
first = (
    df.groupby('airline_id')
      [['fl_date', 'unique_carrier']]
      .first()
)
first.head()

Unnamed: 0_level_0,fl_date,unique_carrier
airline_id,Unnamed: 1_level_1,Unnamed: 2_level_1
19393,2017-01-01,WN
19690,2017-01-01,HA
19790,2017-01-01,DL
19805,2017-01-01,AA
19930,2017-01-01,AS


In [11]:
# Expected to fail: at this point `first` was built from only the 'fl_date'
# and 'unique_carrier' columns, so requesting 'tail_num' raises the KeyError
# shown in the output.
first.ix[10:15, ['fl_date', 'tail_num']]

KeyError: "['tail_num'] not in index"

In [12]:
# Rebind `first`: group by carrier code this time and keep every column,
# so 'tail_num' is available again.
first = df.groupby('unique_carrier').first()
# Same `.ix` call as before, but the index now holds carrier codes rather than
# integers. The output shows only the trailing rows (VX, WN), which is
# consistent with 10:15 being applied by position here — the ambiguity that
# `.loc` / `.iloc` remove.
first.ix[10:15, ['fl_date', 'tail_num']]

Unnamed: 0_level_0,fl_date,tail_num
unique_carrier,Unnamed: 1_level_1,Unnamed: 2_level_1
VX,2017-01-01,N846VA
WN,2017-01-01,N955WN


In [13]:
# Label-based selection: rows by carrier code, columns by name.
first.loc[
    ['AA', 'AS', 'DL'],
    ['fl_date', 'tail_num'],
]

Unnamed: 0_level_0,fl_date,tail_num
unique_carrier,Unnamed: 1_level_1,Unnamed: 2_level_1
AA,2017-01-01,N787AA
AS,2017-01-01,N303AS
DL,2017-01-01,N942DL


In [16]:
# Position-based selection: row positions 0, 1, 3 and column positions 0, 2.
first.iloc[
    [0, 1, 3],
    [0, 2],
]

Unnamed: 0_level_0,fl_date,tail_num
unique_carrier,Unnamed: 1_level_1,Unnamed: 2_level_1
AA,2017-01-01,N787AA
AS,2017-01-01,N303AS
DL,2017-01-01,N942DL


In [15]:
# Preview the leading rows of the per-carrier frame (5 is the default count).
first.head(n=5)

Unnamed: 0_level_0,fl_date,airline_id,tail_num,fl_num,origin_airport_id,origin_airport_seq_id,origin_city_market_id,origin,origin_city_name,dest_airport_id,...,arr_time,arr_delay,cancelled,cancellation_code,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,unnamed: 32
unique_carrier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AA,2017-01-01,19805,N787AA,1,12478,1247803,31703,JFK,"New York, NY",12892,...,1209.0,27.0,0.0,A,27.0,0.0,0.0,0.0,0.0,
AS,2017-01-01,19930,N303AS,360,14747,1474703,30559,SEA,"Seattle, WA",14893,...,1703.0,48.0,1.0,A,0.0,0.0,48.0,0.0,0.0,
B6,2017-01-01,20409,N593JB,264,14747,1474703,30559,SEA,"Seattle, WA",12478,...,506.0,1.0,0.0,A,0.0,0.0,0.0,0.0,54.0,
DL,2017-01-01,19790,N942DL,2,12478,1247803,31703,JFK,"New York, NY",11697,...,2301.0,5.0,0.0,B,16.0,0.0,1.0,0.0,0.0,
EV,2017-01-01,20366,N12530,4101,12915,1291503,31205,LCH,"Lake Charles, LA",12266,...,634.0,-1.0,0.0,A,14.0,0.0,15.0,0.0,26.0,


In [17]:
# Small two-column frame used by the assignment examples that follow.
f = pd.DataFrame({
    'a': list(range(1, 6)),
    'b': list(range(10, 60, 10)),
})
f

Unnamed: 0,a,b
0,1,10
1,2,20
2,3,30
3,4,40
4,5,50


In [18]:
# Demonstration of chained indexing NOT updating the original frame.
# The option_context only switches off pandas' chained-assignment check for
# the duration of the block; it can be ignored for now.
with pd.option_context('mode.chained_assignment', None):
    # `f[...]` followed by `['b'] = ...` assigns into an intermediate object,
    # so `f` itself is left unchanged — see the output below.
    f[f['a'] <= 3]['b'] = f[f['a'] <= 3 ]['b'] / 10
f

Unnamed: 0,a,b
0,1,10
1,2,20
2,3,30
3,4,40
4,5,50


In [19]:
# Single `.loc` call with a row mask and a column label: the assignment
# reaches `f` itself, unlike the chained form.
low_a = f['a'] <= 3
f.loc[low_a, 'b'] = f.loc[low_a, 'b'] / 10
f

Unnamed: 0,a,b
0,1,1.0
1,2,2.0
2,3,3.0
3,4,40.0
4,5,50.0
