# Day 1 Recap

In [1]:
import numpy as np
import pandas as pd
# `seaborn.apionly` was deprecated in seaborn 0.8 and removed in 0.9, so importing it
# fails on current releases. Since 0.8 a plain `import seaborn` no longer alters
# matplotlib's default style, which makes it the direct replacement.
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the gzip-compressed NY flights CSV, parsing the three timestamp
# columns as datetimes rather than leaving them as strings.
flights = pd.read_csv(
    "data/ny-flights.csv.gz",
    parse_dates=["fl_date", "arr", "dep"],
)

# One row per carrier, indexed by `unique_carrier`; used as a small frame
# for the indexing examples that follow.
first = flights.groupby("unique_carrier").first()
first.head()

Unnamed: 0_level_0,fl_date,airline_id,tail_num,fl_num,origin,dest,dep_time,dep_delay,arr_time,arr_delay,cancelled,arr,dep
unique_carrier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AA,2014-01-01,19805,N338AA,1,JFK,LAX,914.0,14.0,1238.0,13.0,0.0,2014-01-01 12:38:00,2014-01-01 09:14:00
B6,2014-01-01,20409,N318JB,115,SYR,JFK,720.0,90.0,808.0,55.0,0.0,2014-01-01 08:08:00,2014-01-01 07:20:00
DL,2014-01-01,19790,N930DL,61,LGA,ATL,1810.0,10.0,2054.0,10.0,0.0,2014-01-01 20:54:00,2014-01-01 18:10:00
EV,2014-01-01,20366,N14977,3259,HPN,ORD,731.0,-5.0,940.0,31.0,0.0,2014-01-01 09:40:00,2014-01-01 07:31:00
F9,2014-01-01,20436,N209FR,507,LGA,DEN,1804.0,35.0,2047.0,47.0,0.0,2014-01-01 20:47:00,2014-01-01 18:04:00


## Data Structures

1. `DataFrame`: 2-dimensional labeled array
2. `Series`: 1-dimensional labeled array
3. `Index`: label containers

## Indexing

Use `[]` aka `__getitem__` for selecting just columns

In [3]:
# A list of column labels inside [] selects those columns as a DataFrame.
first[["origin", "dest"]].head()

Unnamed: 0_level_0,origin,dest
unique_carrier,Unnamed: 1_level_1,Unnamed: 2_level_1
AA,JFK,LAX
B6,SYR,JFK
DL,LGA,ATL
EV,HPN,ORD
F9,LGA,DEN


Use `.loc` for label indexing

In [4]:
# .loc takes labels: row labels first, then column labels.
first.loc[
    ["AA", "DL"],
    ["origin", "dest"],
]

Unnamed: 0_level_0,origin,dest
unique_carrier,Unnamed: 1_level_1,Unnamed: 2_level_1
AA,JFK,LAX
DL,LGA,ATL


Use `.iloc` for positional indexing

In [5]:
# .iloc takes integer positions: rows 0 and 2 are AA and DL, and columns
# 4 and 5 are origin and dest, so this matches the .loc selection above it.
first.iloc[
    [0, 2],
    [4, 5],
]

Unnamed: 0_level_0,origin,dest
unique_carrier,Unnamed: 1_level_1,Unnamed: 2_level_1
AA,JFK,LAX
DL,LGA,ATL


All indexers accept a *boolean mask*

In [6]:
# Boolean mask: keep only the rows whose `dep` timestamp is missing.
flights[flights["dep"].isnull()].head()

Unnamed: 0,fl_date,unique_carrier,airline_id,tail_num,fl_num,origin,dest,dep_time,dep_delay,arr_time,arr_delay,cancelled,arr,dep
29,2014-01-01,AA,19805,N3EAAA,359,LGA,ORD,,,,,1.0,NaT,NaT
31,2014-01-01,AA,19805,N542AA,371,LGA,ORD,,,,,1.0,NaT,NaT
195,2014-01-01,B6,20409,N913JB,1103,JFK,SJU,,,,,1.0,NaT,NaT
209,2014-01-01,B6,20409,N193JB,518,JFK,BOS,,,,,1.0,NaT,NaT
359,2014-01-01,EV,20366,N17159,4130,SYR,CLE,,,,,1.0,NaT,NaT


## Alignment

In [7]:
# Two single-column frames whose indexes only partly overlap ('a' and 'b')
# and list the shared labels in a different order, so any arithmetic
# between them has to line rows up by label rather than by position.
df1 = pd.DataFrame(data={"A": [1, 2, 3]}, index=list("abc"))
df2 = pd.DataFrame(data={"A": [2, 4, 6]}, index=list("bad"))

In [8]:
# Display df1 (index labels a, b, c).
df1

Unnamed: 0,A
a,1
b,2
c,3


In [9]:
# Display df2 (index labels b, a, d -- note the different order and the extra 'd').
df2

Unnamed: 0,A
b,2
a,4
d,6


Pandas *aligns* by label, then does the operation.

In [10]:
# Method form of `df1 + df2`: rows are matched on index label before adding,
# and labels present in only one frame ('c', 'd') come out as NaN.
df1.add(df2)

Unnamed: 0,A
a,5.0
b,4.0
c,
d,


This saves you from having to write the join yourself.

## Groupby

1. Split by some array
2. Apply some function
3. Combine the results
    - `.agg`: 1 output row per input group
    - `.transform`: 1 output row per input row

In [11]:
# Split by carrier, take the dep_delay column, and aggregate each group to its mean.
flights.groupby("unique_carrier")["dep_delay"].mean()

unique_carrier
AA    13.044583
B6    30.479133
DL    29.188422
EV    26.735965
F9    22.906250
FL    24.292553
HA    41.241379
MQ    20.540419
OO    45.000000
UA    13.470244
US     2.636301
VX    17.387543
WN    24.854691
Name: dep_delay, dtype: float64