## Example: Analyzing Airport Operations

As an example, let's analyze `airports.csv`, `airport-frequencies.csv`, `countries.csv`, `regions.csv` from [OurAirports.com](https://ourairports.com/data/)

In [26]:
import numpy as np
import pandas as pd

In [27]:
# Load data directly from the internet URL
airports = pd.read_csv("https://ourairports.com/data/airports.csv", sep=",")
airports.head()

Unnamed: 0,id,ident,type,name,latitude_deg,longitude_deg,elevation_ft,continent,iso_country,iso_region,municipality,scheduled_service,gps_code,iata_code,local_code,home_link,wikipedia_link,keywords
0,6523,00A,heliport,Total Rf Heliport,40.070801,-74.933601,11.0,,US,US-PA,Bensalem,no,00A,,00A,,,
1,323361,00AA,small_airport,Aero B Ranch Airport,38.704022,-101.473911,3435.0,,US,US-KS,Leoti,no,00AA,,00AA,,,
2,6524,00AK,small_airport,Lowell Field,59.947733,-151.692524,450.0,,US,US-AK,Anchor Point,no,00AK,,00AK,,,
3,6525,00AL,small_airport,Epps Airpark,34.864799,-86.770302,820.0,,US,US-AL,Harvest,no,00AL,,00AL,,,
4,6526,00AR,closed,Newport Hospital & Clinic Heliport,35.6087,-91.254898,237.0,,US,US-AR,Newport,no,,,,,,00AR


In [28]:
# Size? Columns? Data types?

print(airports.shape)
print(airports.dtypes)

(69197, 18)
id                     int64
ident                 object
type                  object
name                  object
latitude_deg         float64
longitude_deg        float64
elevation_ft         float64
continent             object
iso_country           object
iso_region            object
municipality          object
scheduled_service     object
gps_code              object
iata_code             object
local_code            object
home_link             object
wikipedia_link        object
keywords              object
dtype: object


In [29]:
# Missing values?

airports.isnull().sum()

id                       0
ident                    0
type                     0
name                     0
latitude_deg             0
longitude_deg            0
elevation_ft         13099
continent            34114
iso_country            257
iso_region               0
municipality          5383
scheduled_service        0
gps_code             26577
iata_code            59961
local_code           36889
home_link            65839
wikipedia_link       58760
keywords             56470
dtype: int64

In [30]:
# Load other csv files
airport_freq = pd.read_csv("https://ourairports.com/data/airport-frequencies.csv", sep=',')
countries = pd.read_csv("https://ourairports.com/data/countries.csv", sep=',')
regions = pd.read_csv("https://ourairports.com/data/regions.csv", sep=',')

In [33]:
# Extract all large airports in New York state from airports data frame

# airports['type'].value_counts()
# us_airports = airports[airports['iso_country'] == "US"]
# us_airports['iso_region'].value_counts()
results = airports[(airports['type'] == 'large_airport') & \
                   (airports['iso_region'] == "US-NY")]
results

Unnamed: 0,id,ident,type,name,latitude_deg,longitude_deg,elevation_ft,continent,iso_country,iso_region,municipality,scheduled_service,gps_code,iata_code,local_code,home_link,wikipedia_link,keywords
32616,3431,KBUF,large_airport,Buffalo Niagara International Airport,42.940498,-78.732201,728.0,,US,US-NY,Buffalo,yes,KBUF,BUF,BUF,,https://en.wikipedia.org/wiki/Buffalo_Niagara_...,
34029,3622,KJFK,large_airport,John F Kennedy International Airport,40.639801,-73.7789,13.0,,US,US-NY,New York,yes,KJFK,JFK,JFK,https://www.jfkairport.com/,https://en.wikipedia.org/wiki/John_F._Kennedy_...,"Manhattan, New York City, NYC, Idlewild, IDL, ..."
34170,3643,KLGA,large_airport,La Guardia Airport,40.777199,-73.872597,21.0,,US,US-NY,New York,yes,KLGA,LGA,LGA,https://www.laguardiaairport.com/,https://en.wikipedia.org/wiki/LaGuardia_Airport,"Manhattan, New York City, NYC, Glenn H. Curtis..."
36415,3913,KSYR,large_airport,Syracuse Hancock International Airport,43.111198,-76.1063,421.0,,US,US-NY,Syracuse,yes,KSYR,SYR,SYR,http://www.syrairport.org/,https://en.wikipedia.org/wiki/Syracuse_Hancock...,


In [21]:
airport_freq.head()

Unnamed: 0,id,airport_ref,airport_ident,type,description,frequency_mhz
0,70518,6528,00CA,CTAF,CTAF,122.9
1,307581,6589,01FL,ARCAL,,122.9
2,75239,6589,01FL,CTAF,CEDAR KNOLL TRAFFIC,122.8
3,60191,6756,04CA,CTAF,CTAF,122.9
4,59287,6779,04MS,UNIC,UNICOM,122.8


In [25]:
airport_idents = results['ident']
airport_idents

32616    KBUF
34029    KJFK
34170    KLGA
36415    KSYR
Name: ident, dtype: object

In [34]:
# Extract all communication frequencies used for a large airport in New York state

airport_freq[airport_freq['airport_ident'].isin(airport_idents)]

Unnamed: 0,id,airport_ref,airport_ident,type,description,frequency_mhz
7800,69857,3431,KBUF,A/D,Buffalo APP/DEP,126.15
7801,69858,3431,KBUF,ATIS,ATIS,135.35
7802,69859,3431,KBUF,CLD,CLNC DEL,124.7
7803,69860,3431,KBUF,GND,GND,133.2
7804,69861,3431,KBUF,RDO,RDO,122.6
7805,69862,3431,KBUF,TWR,TWR,120.5
11668,69293,3622,KJFK,APP,NEW YORK APP (ROBER),125.7
11669,301312,3622,KJFK,APP,NEW YORK APPROACH (CAMRN),127.4
11670,301313,3622,KJFK,APP,NEW YORK APPROACH (FINAL),132.4
11671,69294,3622,KJFK,ATIS,ATIS,115.1


In [42]:
# Calculate the number of large airports for each country

# 1. Extract the large airports from data frame airports
large_airports = airports[airports['type'] == "large_airport"]
# large_airports['type'].value_counts()

# 2. Split the data according to country
groups = large_airports.groupby('iso_country')
# for label, group in groups:
#     print("Label:", label)
#     print(group[['name', 'iso_country']].head())

# 3. Count the number of rows in each country group
num_large_airport = groups.size()
num_large_airport

# 4. Present the results
num_large_airport = num_large_airport.to_frame("Number of Large Airports")
num_large_airport

Unnamed: 0_level_0,Number of Large Airports
iso_country,Unnamed: 1_level_1
AE,4
AL,1
AM,1
AO,1
AR,2
...,...
VN,2
VU,1
ZA,3
ZM,1


In [44]:
# In order to find the full country names, let's inspect the 
# coutries data frame
countries.head()

Unnamed: 0,id,code,name,continent,wikipedia_link,keywords
0,302672,AD,Andorra,EU,https://en.wikipedia.org/wiki/Andorra,
1,302618,AE,United Arab Emirates,AS,https://en.wikipedia.org/wiki/United_Arab_Emir...,"UAE,مطارات في الإمارات العربية المتحدة"
2,302619,AF,Afghanistan,AS,https://en.wikipedia.org/wiki/Afghanistan,
3,302722,AG,Antigua and Barbuda,,https://en.wikipedia.org/wiki/Antigua_and_Barbuda,
4,302723,AI,Anguilla,,https://en.wikipedia.org/wiki/Anguilla,


In [53]:
# Extract the code column and the name column from countries
country_names = countries[['code', 'name']]
# country_names.head()

# Merge country_names with num_large_airport
results = pd.merge(num_large_airport, country_names,
                   left_index=True, right_on="code",
                   how="left")
# results.head()

# Use country names as index
results = results.set_index('name', drop=True)
# results.head()

# Remove the code column
results = results.drop('code', axis=1)
# results.head()

# Sort the rows according to the number
results = results.sort_values("Number of Large Airports", ascending=False)
results.head(10)

Unnamed: 0_level_0,Number of Large Airports
name,Unnamed: 1_level_1
United States,65
China,35
Russia,20
Italy,12
Japan,12
Spain,11
Canada,11
Germany,10
India,10
Mexico,10


In [54]:
# Which country has the most large airports?

# US has the most large airports.

## Data Aggregations
Aggregation refer to any data transformation that produces numeric values from arrays. Examples of data aggregation methods include `mean()`, `count()`, `first()`, `min()`, and `sum()`. Moreover, user-defined functions can also be applied to create desired summary.

In [6]:
url = "https://shanelynnwebsite-mid9n9g1q9y8tt.netdna-ssl.com/wp-content/uploads/2015/06/phone_data.csv"
data = pd.read_csv(url, index_col='index')
print(data.shape)
data.head(20)

(830, 6)


Unnamed: 0_level_0,date,duration,item,month,network,network_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,15/10/14 06:58,34.429,data,2014-11,data,data
1,15/10/14 06:58,13.0,call,2014-11,Vodafone,mobile
2,15/10/14 14:46,23.0,call,2014-11,Meteor,mobile
3,15/10/14 14:48,4.0,call,2014-11,Tesco,mobile
4,15/10/14 17:27,4.0,call,2014-11,Tesco,mobile
5,15/10/14 18:55,4.0,call,2014-11,Tesco,mobile
6,16/10/14 06:58,34.429,data,2014-11,data,data
7,16/10/14 15:01,602.0,call,2014-11,Three,mobile
8,16/10/14 15:12,1050.0,call,2014-11,Three,mobile
9,16/10/14 15:30,19.0,call,2014-11,voicemail,voicemail


In [3]:
# Define function get_range() that returns(max - min)
def get_range(array):
    return array.max() - array.min()

In [4]:
# Apply agg() to find the range of each type of cell phone use.
data.groupby(['item'])['duration'].agg(get_range)

item
call    10527.0
data        0.0
sms         0.0
Name: duration, dtype: float64

In [5]:
groups = data.groupby('item')['duration']
for label, group in groups:
    print("Label:", label)
    print("Max:", group.max())
    print("Min:", group.min())

Label: call
Max: 10528.0
Min: 1.0
Label: data
Max: 34.429
Min: 34.429
Label: sms
Max: 1.0
Min: 1.0


In [7]:
# Apply multiple aggregation functions
data.groupby(['item'])['duration'].agg([get_range, np.max, np.min])

Unnamed: 0_level_0,get_range,amax,amin
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
call,10527.0,10528.0,1.0
data,0.0,34.429,34.429
sms,0.0,1.0,1.0


In [8]:
# Declare columns names
data.groupby(['item'])['duration'].agg([('range', get_range),
                                        ('maximum', np.max),
                                        ('minimum', np.min)])

Unnamed: 0_level_0,range,maximum,minimum
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
call,10527.0,10528.0,1.0
data,0.0,34.429,34.429
sms,0.0,1.0,1.0
