<h1 align = 'center' style = "font-family:Lucida Bright" > Analyzing Large Dataset using DASK </h1>

In [1]:
# import necessary libraries
import datetime as dt

import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.offline as py
import seaborn as sns
from plotly import tools
from plotly.subplots import make_subplots

py.init_notebook_mode(connected=True)

## About Dataset

>Data was collected and published by the U.S. Department of Transportation for 2015. The dataset contains 3 CSV files:
- airlines.csv: contains information about airlines.
- airports.csv: contains information about airports.
- flights.csv: contains 31 columns related to flight information.

## Reason for choosing the Dataset
<p>According to a 2010 report by the US Federal Aviation Administration, domestic flight delays cost passengers, airlines and other parts of the economy 32.9 billion dollars a year. More than half of that amount comes out of the pockets of passengers, who not only lose time waiting for their planes to leave, but also miss connecting flights, spend money on food and have to sleep in hotel rooms while they are stranded.</p>

<p>We look into the reasons behind cancellations and delays in U.S. domestic air service.</p>

## Questions to Answer

>- Top 15 origins
>- Number of flights by origin
>- Most and least popular airlines

## Loading and Cleaning dataset

### *Loading Flights dataset*

In [2]:
# import data into DASK dataframe
# assume_missing=True: integer columns without an explicit dtype are read as float64,
# which is why YEAR, MONTH, etc. show up as floats in the dtypes output below.
df= dd.read_csv("flights.csv",assume_missing=True,low_memory=False)


In [3]:
df.head(5).T

Unnamed: 0,0,1,2,3,4
YEAR,2015,2015,2015,2015,2015
MONTH,1,1,1,1,1
DAY,1,1,1,1,1
DAY_OF_WEEK,4,4,4,4,4
AIRLINE,AS,AA,US,AA,AS
FLIGHT_NUMBER,98,2336,840,258,135
TAIL_NUMBER,N407AS,N3KUAA,N171US,N3HYAA,N527AS
ORIGIN_AIRPORT,ANC,LAX,SFO,LAX,SEA
DESTINATION_AIRPORT,SEA,PBI,CLT,MIA,ANC
SCHEDULED_DEPARTURE,5,10,20,20,25


In [None]:
df.dtypes

YEAR                   float64
MONTH                  float64
DAY                    float64
DAY_OF_WEEK            float64
AIRLINE                 object
FLIGHT_NUMBER          float64
TAIL_NUMBER             object
ORIGIN_AIRPORT          object
DESTINATION_AIRPORT     object
SCHEDULED_DEPARTURE    float64
DEPARTURE_TIME         float64
DEPARTURE_DELAY        float64
TAXI_OUT               float64
WHEELS_OFF             float64
SCHEDULED_TIME         float64
ELAPSED_TIME           float64
AIR_TIME               float64
DISTANCE               float64
WHEELS_ON              float64
TAXI_IN                float64
SCHEDULED_ARRIVAL      float64
ARRIVAL_TIME           float64
ARRIVAL_DELAY          float64
DIVERTED               float64
CANCELLED              float64
CANCELLATION_REASON     object
AIR_SYSTEM_DELAY       float64
SECURITY_DELAY         float64
AIRLINE_DELAY          float64
LATE_AIRCRAFT_DELAY    float64
WEATHER_DELAY          float64
dtype: object

In [None]:
df.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 31 entries, YEAR to WEATHER_DELAY
dtypes: object(5), float64(26)

In [None]:
# describe() is lazy on a Dask frame; .compute() materialises the summary statistics
# so they can be shown transposed (one row per flights column).
s=df.describe().compute()
s.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
YEAR,5819079.0,2015.0,0.0,2015.0,2015.0,2015.0,2015.0,2015.0
MONTH,5819079.0,6.524085,3.405137,1.0,4.0,8.0,12.0,12.0
DAY,5819079.0,15.704594,8.783425,1.0,11.0,19.0,29.0,31.0
DAY_OF_WEEK,5819079.0,3.926941,1.988845,1.0,2.0,4.0,6.0,7.0
FLIGHT_NUMBER,5819079.0,2173.092742,1757.063999,1.0,758.0,1768.0,3485.0,9855.0
SCHEDULED_DEPARTURE,5819079.0,1329.60247,483.751821,1.0,925.0,1345.0,1745.0,2359.0
DEPARTURE_TIME,5732926.0,1335.204439,496.42326,1.0,931.0,1356.0,1800.0,2400.0
DEPARTURE_DELAY,5732926.0,9.370158,37.080942,-82.0,-3.0,2.0,24.0,1988.0
TAXI_OUT,5730032.0,16.071662,8.895574,1.0,11.0,14.0,19.0,225.0
WHEELS_OFF,5730032.0,1357.170841,498.009356,1.0,947.0,1408.0,1813.0,2400.0


### *Loading Airlines and Airports dataset*

In [4]:
# airlines and airports datasets
# These two lookup tables are merged onto the flights frame in the cells below.
airlines = dd.read_csv("airlines.csv")
airports= dd.read_csv("airports.csv")

### *Merging datasets*

In [5]:
# Inner-join the airline lookup onto the flights: the flights' AIRLINE column is matched
# against IATA_CODE in airlines.csv. Both frames own an AIRLINE column, so the result
# carries the suffixed pair AIRLINE_x (from flights) and AIRLINE_y (from airlines.csv).
df = df.merge(airlines, how='inner', left_on='AIRLINE', right_on='IATA_CODE')

In [6]:
df.head(5).T

Unnamed: 0,0,1,2,3,4
YEAR,2015,2015,2015,2015,2015
MONTH,1,1,1,1,1
DAY,1,1,1,1,1
DAY_OF_WEEK,4,4,4,4,4
AIRLINE_x,AS,AS,AS,AS,AS
FLIGHT_NUMBER,98,135,108,122,130
TAIL_NUMBER,N407AS,N527AS,N309AS,N413AS,N457AS
ORIGIN_AIRPORT,ANC,SEA,ANC,ANC,FAI
DESTINATION_AIRPORT,SEA,ANC,SEA,PDX,SEA
SCHEDULED_DEPARTURE,5,25,45,50,115


<div class="alert alert-block alert-info">
<b>info:</b> We drop IATA_CODE from df because the airports dataset also has an IATA_CODE column. In the airports dataset IATA_CODE identifies an airport, but in our current df it identifies an airline. So we remove IATA_CODE here, and later we rename AIRLINE_x to IATA_CODE_PLANE.</div>

In [7]:
# AIRLINE_y becomes AIRLINE; IATA_CODE carries the same value as AIRLINE_x, so it is dropped.
df = df.rename(columns={'AIRLINE_y': 'AIRLINE'}).drop(columns=['IATA_CODE'])

In [8]:
df.head(5).T

Unnamed: 0,0,1,2,3,4
YEAR,2015,2015,2015,2015,2015
MONTH,1,1,1,1,1
DAY,1,1,1,1,1
DAY_OF_WEEK,4,4,4,4,4
AIRLINE_x,AS,AS,AS,AS,AS
FLIGHT_NUMBER,98,135,108,122,130
TAIL_NUMBER,N407AS,N527AS,N309AS,N413AS,N457AS
ORIGIN_AIRPORT,ANC,SEA,ANC,ANC,FAI
DESTINATION_AIRPORT,SEA,ANC,SEA,PDX,SEA
SCHEDULED_DEPARTURE,5,25,45,50,115


<div class="alert alert-block alert-info">
<b>info:</b> Now merge with the airports dataset</div>

In [10]:
# Attach the origin airport's name and city; only the three needed lookup columns are joined.
origin_lookup = airports[['IATA_CODE', 'AIRPORT', 'CITY']]
df = df.merge(origin_lookup, how='inner', left_on='ORIGIN_AIRPORT', right_on='IATA_CODE')

In [11]:
df.head(5).T

Unnamed: 0,0,1,2,3,4
YEAR,2015,2015,2015,2015,2015
MONTH,1,1,1,1,1
DAY,1,1,1,1,1
DAY_OF_WEEK,4,4,4,4,4
AIRLINE_x,AS,AS,AS,AS,AS
FLIGHT_NUMBER,98,108,122,136,134
TAIL_NUMBER,N407AS,N309AS,N413AS,N431AS,N464AS
ORIGIN_AIRPORT,ANC,ANC,ANC,ANC,ANC
DESTINATION_AIRPORT,SEA,SEA,PDX,SEA,SEA
SCHEDULED_DEPARTURE,5,45,50,135,155


<div class="alert alert-block alert-info">
<b>info:</b> After this merge, IATA_CODE holds the airport code, which duplicates the existing ORIGIN_AIRPORT column, so we drop IATA_CODE.</div>

In [12]:
# The join key copied in from airports.csv repeats ORIGIN_AIRPORT, so remove it.
df = df.drop(columns=['IATA_CODE'])

In [13]:
# Rename the carrier-code column AIRLINE_x to IATA_CODE_PLANE.
df = df.rename(columns={'AIRLINE_x': 'IATA_CODE_PLANE'})

In [None]:
df.YEAR.unique().compute()

0    2015.0
Name: YEAR, dtype: float64

<div class="alert alert-block alert-info">
<b>info:</b>  Final clean dataset </div>

In [14]:
df.head(5).T

Unnamed: 0,0,1,2,3,4
YEAR,2015,2015,2015,2015,2015
MONTH,1,1,1,1,1
DAY,1,1,1,1,1
DAY_OF_WEEK,4,4,4,4,4
IATA_CODE_PLANE,AS,AS,AS,AS,AS
FLIGHT_NUMBER,98,108,122,136,134
TAIL_NUMBER,N407AS,N309AS,N413AS,N431AS,N464AS
ORIGIN_AIRPORT,ANC,ANC,ANC,ANC,ANC
DESTINATION_AIRPORT,SEA,SEA,PDX,SEA,SEA
SCHEDULED_DEPARTURE,5,45,50,135,155


<div class="alert alert-block alert-info">
<b>info:</b> October is missing from our dataset </div>

In [15]:
df['MONTH'].unique().compute()

0      1.0
1      2.0
2      3.0
3      4.0
4      5.0
5      6.0
6      7.0
7      8.0
8      9.0
9     11.0
10    12.0
Name: MONTH, dtype: float64

## Analyzing the Dataset

### *Top 15 popular origins*

In [16]:
# Flights per origin airport name; the first 15 rows of the counts feed the chart below.
origin_airport_counts = df['AIRPORT'].value_counts()
airport = origin_airport_counts.head(15)

In [17]:
# Colour every bar by its own height so the colour bar doubles as a scale.
bar_marker = dict(color=airport.values, colorscale='Jet', showscale=True)
trace = go.Bar(x=airport.index, y=airport.values, marker=bar_marker)

data = [trace]
layout = go.Layout(
    yaxis=dict(title='# of Flights'),
    title='Top 15 popular airport for departure',
)

fig = go.Figure(data=data, layout=layout)
# render the figure inline in the notebook
py.iplot(fig)

### *Number of Flights by origin(Top 10)*

In [18]:
# Flights per origin city; the first 10 rows of the counts feed the chart below.
origin_city_counts = df['CITY'].value_counts()
city = origin_city_counts.head(10)

In [19]:
# Bars are coloured by flight count, with the colour bar shown as a legend.
bar_marker = dict(color=city.values, colorscale='Jet', showscale=True)
trace = go.Bar(x=city.index, y=city.values, marker=bar_marker)

data = [trace]
layout = go.Layout(
    yaxis=dict(title='# of Flights'),
    title='Origin City Distribution',
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

### *Top 15 destinations*

In [20]:
# Flights per destination airport code; the first 15 rows of the counts feed the chart below.
destination_counts = df.DESTINATION_AIRPORT.value_counts()
destination = destination_counts.head(15)

In [21]:
# Bars are coloured by flight count, with the colour bar shown as a legend.
bar_marker = dict(color=destination.values, colorscale='Jet', showscale=True)
trace = go.Bar(x=destination.index, y=destination.values, marker=bar_marker)

data = [trace]
layout = go.Layout(
    yaxis=dict(title='# of Flights'),
    title='Top 15 popular airport for destination',
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

### *Most and least popular airlines*

In [22]:
# Count flights per airline once and reuse the result for both ends of the ranking;
# previously the same full-dataset value_counts() was triggered twice.
airline_counts = df['AIRLINE'].value_counts().compute()
most = airline_counts.head(7)
least = airline_counts.tail(7)

In [23]:
# Airline flight rankings: the seven busiest and the seven least busy airlines
# share one chart. Legend labels are spelled "Popular" (previously "Populer").
trace1 = go.Scatter(x=most.index, y=most.values,
                    name='Most Popular Flights', marker=dict(color='green'))
trace2 = go.Scatter(x=least.index, y=least.values,
                    name='Least Popular Flights', marker=dict(color='red'))

data = [trace1, trace2]
layout = dict(title='Airline distribution')
fig = dict(data=data, layout=layout)
py.iplot(fig)

### *Monthly and weekly flights*

In [24]:
# Month number -> short label; reused by later cells under the name MONTH.
MONTH = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
         7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
# value_counts() orders rows by frequency; re-sort by month number so the monthly
# charts read in calendar order, then swap the numbers for their labels.
month = df.MONTH.value_counts().compute().sort_index()
month.index = month.index.map(MONTH)

In [25]:
# Bar chart of flight counts per month.
trace1 = go.Bar(y=month.values, x=month.index)
data1 = [trace1]
layout = go.Layout(
    width=800,
    height=500,
    title='Monthly Flights',
    yaxis=dict(title='Number of flights'),
)
fig = go.Figure(layout=layout, data=data1)
py.iplot(fig)



In [26]:
# Same monthly counts shown as shares of the total.
trace2 = go.Pie(values=month.values, labels=month.index)
data2 = [trace2]
layout = go.Layout(title='Distribution of total number of flights monthly')

fig = go.Figure(layout=layout, data=data2)
py.iplot(fig)

In [27]:
# Weekday number -> name, following the numbering used by DAY_OF_WEEK in this notebook.
dayOfWeek={1:'Monday', 2:'Tuesday', 3:'Wednesday', 4:'Thursday', 5:'Friday',
                                           6:'Saturday', 7:'Sunday'}
# value_counts() orders rows by frequency; re-sort by weekday number so the weekly
# charts run Monday..Sunday, then swap the numbers for their names.
day = df.DAY_OF_WEEK.value_counts().compute().sort_index()
day.index = day.index.map(dayOfWeek)

In [28]:
# Bar chart of flight counts per day of week.
trace1 = go.Bar(y=day.values, x=day.index)
data1 = [trace1]
layout = go.Layout(
    width=800,
    height=500,
    title='Weekly Flights',
    yaxis=dict(title='Number of flights'),
)
fig = go.Figure(layout=layout, data=data1)
py.iplot(fig)

In [29]:
# Same weekday counts shown as shares of the total.
trace2 = go.Pie(values=day.values, labels=day.index)
data2 = [trace2]
layout = go.Layout(title='Distribution of total number of flights weekly')

fig = go.Figure(layout=layout, data=data2)
py.iplot(fig)

### *Mean speed of each airlines*

In [30]:
# Per-flight speed: DISTANCE over AIR_TIME, scaled by 60; then averaged per airline.
df = df.assign(SPEED=60 * df['DISTANCE'] / df['AIR_TIME'])
speed = df.groupby('AIRLINE')['SPEED'].mean()


In [31]:
# One marker per airline, coloured by its mean speed.
trace = go.Scatter(
    x=speed.index,
    y=speed.values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 30,
        color = speed.values,
        colorscale='Jet',
        showscale=True
    )
)

data = [trace]
# SPEED is 60 * DISTANCE / AIR_TIME, i.e. distance per hour when AIR_TIME is in minutes,
# so the previous "meter/mins" label was wrong.
# NOTE(review): DISTANCE is assumed to be in miles (per the DOT dataset description) — confirm.
layout = go.Layout(xaxis=dict(tickangle=-20),
    title='Mean Speed by Airlines',
                   yaxis = dict(title = 'Speed (miles/hour)')
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

### *Mean taxi in and taxi out time* 
> *taxi in time: arrival_time - wheels_on*
>
> *taxi out time:wheels_off - departure_time*

In [32]:

# Aggregate in Dask first so the chart really shows the per-airline mean.
# Previously every raw flight row was handed to Plotly unaggregated.
taxi_mean = df.groupby('AIRLINE')[['TAXI_IN', 'TAXI_OUT']].mean().compute()

fig3 = go.Figure(
    data=[
        go.Bar(
            name='Taxi in',
            x=taxi_mean.index,
            y=taxi_mean['TAXI_IN'],
            offsetgroup=0,
        ),
        go.Bar(
            name='Taxi out',
            x=taxi_mean.index,
            y=taxi_mean['TAXI_OUT'],
            offsetgroup=1,
        )
    ],
    layout=go.Layout(
        title="Mean taxi in and taxi out by airline",
        # the bars are mean durations, not counts; unit follows the 'minutes' label
        # used for the taxi-difference chart later in this notebook
        yaxis_title="Minutes"))
py.iplot(fig3)

In [33]:
# Per-flight difference between taxi-out and taxi-in time.
df = df.assign(OUT_IN_DIFF=df['TAXI_OUT'] - df['TAXI_IN'])

In [34]:
s=df.head(5)
s.T

Unnamed: 0,0,1,2,3,4
YEAR,2015,2015,2015,2015,2015
MONTH,1,1,1,1,1
DAY,1,1,1,1,1
DAY_OF_WEEK,4,4,4,4,4
IATA_CODE_PLANE,AS,AS,AS,AS,AS
FLIGHT_NUMBER,98,108,122,136,134
TAIL_NUMBER,N407AS,N309AS,N413AS,N431AS,N464AS
ORIGIN_AIRPORT,ANC,ANC,ANC,ANC,ANC
DESTINATION_AIRPORT,SEA,SEA,PDX,SEA,SEA
SCHEDULED_DEPARTURE,5,45,50,135,155


In [35]:
# Mean taxi-out minus taxi-in per airline; the lazy series is kept in df_diff for the
# plot below and computed here only for display.
by_airline = df.groupby('AIRLINE')
df_diff = by_airline['OUT_IN_DIFF'].mean()
df_diff.compute()

AIRLINE
Alaska Airlines Inc.             8.723644
American Airlines Inc.           8.862173
American Eagle Airlines Inc.     7.468230
Atlantic Southeast Airlines      9.107744
Delta Air Lines Inc.            10.488260
Frontier Airlines Inc.           6.612890
Hawaiian Airlines Inc.           4.106766
JetBlue Airways                 11.831071
Skywest Airlines Inc.           11.198276
Southwest Airlines Co.           5.776747
Spirit Air Lines                 5.114375
US Airways Inc.                 11.373059
United Air Lines Inc.            8.935741
Virgin America                   6.619072
Name: OUT_IN_DIFF, dtype: float64

In [36]:
# One marker per airline, coloured by its mean taxi-out minus taxi-in difference.
trace = go.Scatter(
    x=df_diff.index,
    y=df_diff.values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 30,
        color = df_diff.values,
        colorscale='Jet',
        showscale=True
    )
)

data = [trace]
# df_diff is the mean of TAXI_OUT - TAXI_IN, not the total time spent taxiing,
# so the title names the difference explicitly.
layout = go.Layout(xaxis=dict(tickangle=-20),
    title='Mean (taxi out - taxi in) time by Airlines',
                   yaxis = dict(title = 'minutes')
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
##airspace
df.columns

Index(['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'IATA_CODE_PLANE',
       'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
       'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'AIRLINE', 'AIRPORT', 'CITY',
       'SPEED'],
      dtype='object')

### *Correlation of the variables*

In [37]:
# Correlation matrix of the selected numeric columns; `cols` and `corr` feed the heatmap.
# NOTE(review): fillna(0) turns every missing value into a literal zero before the
# correlation is taken, which can distort the coefficients — confirm this is intended.
correlation = df[['DAY_OF_WEEK','MONTH','ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY','SPEED']].fillna(0).corr()
cols = correlation.columns.values
corr  = correlation.values



In [38]:
# Heatmap of the correlation matrix, labelled on both axes with the column names.
trace = go.Heatmap(x=cols, y=cols, z=corr, colorscale="YlOrRd", reversescale=True)

data = [trace]
small_ticks = dict(size=8)
layout_options = dict(
    title="Correlation Matrix for variables",
    autosize=False,
    height=600,
    width=800,
    margin=dict(l=200),  # left margin sized for the y-axis labels
    yaxis=dict(tickfont=small_ticks),
    xaxis=dict(tickfont=small_ticks),
)
layout = go.Layout(layout_options)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

### *In depth analysis of flight delays*

In [39]:

# Narrow frame used by the delay analyses below: airline name, both delay columns,
# and the two calendar keys.
delay_columns = ['AIRLINE', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'DAY_OF_WEEK', 'MONTH']
airline_delay = df[delay_columns]

#### *Mean delay by airlines*

In [41]:
# Mean of every remaining column per airline (the DAY_OF_WEEK and MONTH means are
# by-products and are not plotted).
delay_air=airline_delay.groupby('AIRLINE').mean()


In [42]:
delay_air.head(5)

Unnamed: 0_level_0,DEPARTURE_DELAY,ARRIVAL_DELAY,DAY_OF_WEEK,MONTH
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alaska Airlines Inc.,1.957637,-0.791409,3.981842,6.296158
American Airlines Inc.,9.330882,3.935214,3.947695,6.983532
American Eagle Airlines Inc.,10.685483,7.31654,3.916589,5.798577
Atlantic Southeast Airlines,9.076964,6.96455,3.886423,6.030024
Delta Air Lines Inc.,7.763099,0.680265,3.900085,6.295508


In [43]:
# Departure and arrival delay means side by side for each airline.
departure_bar = go.Bar(name='Departure Delay', x=delay_air.index, y=delay_air.DEPARTURE_DELAY)
arrival_bar = go.Bar(name='Arrival Delay', x=delay_air.index, y=delay_air.ARRIVAL_DELAY)
fig = go.Figure(data=[departure_bar, arrival_bar])

fig.update_layout(barmode='group')
py.iplot(fig)

#### *Mean departure delay and arrival delay difference*

In [44]:
# Difference of the two per-airline means: mean departure delay minus mean arrival delay.
delay_air['DEP_ARR_DIFF']=delay_air['DEPARTURE_DELAY']-delay_air['ARRIVAL_DELAY']
delay_air.head(5)

Unnamed: 0_level_0,DEPARTURE_DELAY,ARRIVAL_DELAY,DAY_OF_WEEK,MONTH,DEP_ARR_DIFF
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alaska Airlines Inc.,1.957637,-0.791409,3.981842,6.296158,2.749046
American Airlines Inc.,9.330882,3.935214,3.947695,6.983532,5.395668
American Eagle Airlines Inc.,10.685483,7.31654,3.916589,5.798577,3.368943
Atlantic Southeast Airlines,9.076964,6.96455,3.886423,6.030024,2.112414
Delta Air Lines Inc.,7.763099,0.680265,3.900085,6.295508,7.082835


In [45]:
# One bar per airline, coloured by the size of the departure-minus-arrival difference.
diff_marker = dict(color=delay_air.DEP_ARR_DIFF, colorscale='Jet', showscale=True)
trace = go.Bar(x=delay_air.index, y=delay_air.DEP_ARR_DIFF, marker=diff_marker)

data = [trace]
layout = go.Layout(
    title='Mean (Departure Delay - Arrival Delay) by Airlines',
    xaxis=dict(tickangle=15),
    yaxis=dict(title='minute'),
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

#### *Mean monthly delay*

In [47]:
# Mean arrival and departure delay per month, computed in a single groupby pass.
# The month labels come from the MONTH mapping defined with the monthly flight
# counts, so this cell no longer rebinds `month`, which an earlier cell uses for a Series.
monthly_delay = (airline_delay.groupby('MONTH')[['ARRIVAL_DELAY', 'DEPARTURE_DELAY']]
                 .mean()
                 .compute()
                 .sort_index())
month_labels = monthly_delay.index.map(MONTH)

trace1 = go.Bar(
    x=month_labels,
    y=monthly_delay['ARRIVAL_DELAY'],
    name = 'Arrival_delay',
    marker = dict(
        color = 'green'
    )
)

trace2 = go.Bar(
    x=month_labels,
    y=monthly_delay['DEPARTURE_DELAY'],
    name='Departure_delay',
    marker=dict(
        color = 'red'
    )
)

data = [trace1,trace2]
# The bars are mean delays (same unit as the other delay charts), not percentages.
layout = go.Layout(
    title='Mean Delay (Months)',
    yaxis = dict(title = 'minutes')
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

#### *Mean delay by day of week*

In [49]:
# Weekday number -> name.
dayOfWeek={1:'Monday', 2:'Tuesday', 3:'Wednesday', 4:'Thursday', 5:'Friday',
                                           6:'Saturday', 7:'Sunday'}
# Mean departure and arrival delay per weekday, computed in a single groupby pass.
weekday_delay = (airline_delay.groupby('DAY_OF_WEEK')[['DEPARTURE_DELAY', 'ARRIVAL_DELAY']]
                 .mean()
                 .compute()
                 .sort_index())
weekday_labels = weekday_delay.index.map(dayOfWeek)

trace1 = go.Bar(
    x=weekday_labels,
    y=weekday_delay['DEPARTURE_DELAY'],
    name = 'Departure_delay',
    marker=dict(
        color = 'cyan'
    )
)

trace2 = go.Bar(
    x=weekday_labels,
    y=weekday_delay['ARRIVAL_DELAY'],
    name='Arrival_delay',
    marker=dict(
        color = 'indigo'
    )
)

data = [trace1,trace2]
# The bars are mean delays (same unit as the other delay charts), not percentages.
layout = go.Layout(
    title='Mean Delay (Day of Week)',
    yaxis = dict(title = 'minutes')
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

#### *Mean delay of top 10 cities*

In [50]:
# Per-city means of the delay columns (CITY is the origin city attached by the airports merge).
city_delay_columns = ['CITY', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'DAY_OF_WEEK', 'MONTH']
delay = df[city_delay_columns]
delay_city = delay.groupby('CITY').mean()

In [51]:
# The ten cities with the largest mean departure delay, materialised with .compute().
del_city=delay_city.nlargest(10, 'DEPARTURE_DELAY').compute()

In [52]:
# The ten cities with the largest mean arrival delay, materialised with .compute().
arr_city=delay_city.nlargest(10, 'ARRIVAL_DELAY').compute()

In [53]:

# Left panel: cities ranked by mean departure delay.
trace1 = go.Bar(
    x=del_city.index,
    y=del_city.DEPARTURE_DELAY,
    marker=dict(
        color = 'limegreen'
    )
)

# Right panel: cities ranked by mean arrival delay.
trace2 = go.Bar(
    x=arr_city.index,
    y=arr_city.ARRIVAL_DELAY,
    marker=dict(
        color = 'gold'
    )
)

# plotly.tools.make_subplots and Figure.append_trace are deprecated; use
# plotly.subplots.make_subplots and add_trace(row=, col=) instead.
fig = make_subplots(rows=1, cols=2, subplot_titles=('Mean Departure Delay by City',
                                                    'Mean Arrival Delay by City'))
fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=1, col=2)

# Both panels show minutes, so label every y-axis (previously only the first was labelled).
fig.update_yaxes(title_text='minute')
fig.update_layout(height=500, width=850,
                  title='Arrival and departure delay by city',
                  showlegend=False)
py.iplot(fig)


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



#### *Reasons for yearly delay by month*

In [54]:
# Only the five delay-reason columns and MONTH are read from df_del below, so select
# them before .compute() instead of materialising the whole merged flights table.
df_del = df[['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
             'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'MONTH']].compute()

In [55]:
# Keep the five delay-reason columns plus the month key.
monthly_reason_columns = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
                          'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'MONTH']
dff = df_del[monthly_reason_columns]

In [56]:
# Sum each delay-reason column within every month.
reason_columns = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
                  'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
monthly_groups = dff.groupby('MONTH')
df2 = monthly_groups[reason_columns].sum()

In [57]:
df2.head(5)

Unnamed: 0_level_0,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
MONTH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,1278055.0,6700.0,1708155.0,2183865.0,263087.0
2.0,1349173.0,4580.0,1712660.0,2157918.0,411265.0
3.0,1228893.0,6586.0,1818680.0,2156090.0,228661.0
4.0,1118027.0,3660.0,1490594.0,1785437.0,221590.0
5.0,1254652.0,5711.0,1667920.0,2172521.0,336555.0


In [58]:
# Translate month numbers into labels on a separate Index. df2 itself is left
# untouched, so re-running this cell no longer maps already-translated labels to NaN.
month_labels = df2.index.map(MONTH)

delay_reason_columns = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
                        'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
# One line per delay reason, in the original trace order.
data = [go.Scatter(x=month_labels, y=df2[col], mode='lines', name=col)
        for col in delay_reason_columns]

# df2 holds per-month sums of the delay columns, not counts of delayed flights.
# NOTE(review): unit assumed to be minutes, like the other delay charts — confirm.
layout = go.Layout(
    title='Delayed reason',
    yaxis=dict(title='Total delay (minutes)'))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

#### *Reasons for airlines flight delay*

In [59]:
# Only the five delay-reason columns and AIRLINE are read from df_air below, so select
# them before .compute() instead of materialising the whole merged flights table.
df_air = df[['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
             'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'AIRLINE']].compute()

In [60]:
# Keep the five delay-reason columns plus the airline key.
airline_reason_columns = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
                          'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'AIRLINE']
dff = df_air[airline_reason_columns]

In [61]:
# Sum each delay-reason column within every airline.
reason_columns = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
                  'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
airline_groups = dff.groupby('AIRLINE')
df2 = airline_groups[reason_columns].sum()

In [62]:
delay_reason_columns = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
                        'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
# One bar series per delay reason, in the original trace order.
data = [go.Bar(x=df2.index, y=df2[col], name=col) for col in delay_reason_columns]

# df2 holds per-airline sums of the delay columns, not counts of delayed flights.
# NOTE(review): unit assumed to be minutes, like the other delay charts — confirm.
layout = go.Layout(
    title='Delayed reason by airlines',
    yaxis=dict(title='Total delay (minutes)'))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

### *In depth analysis of flight cancellations* 

#### *Cancellation rate by airlines*

In [63]:
# Mean of CANCELLED per airline; a later cell treats CANCELLED == 1 as a cancelled
# flight, so this mean is plotted below as the cancellation rate.
dff = df.groupby('AIRLINE')[['CANCELLED']].mean()

In [64]:
# One star marker per airline, coloured by its cancellation rate.
rate_marker = dict(
    symbol='star-square',
    sizemode='diameter',
    sizeref=1,
    size=30,
    color=dff.CANCELLED,
    colorscale='Portland',
    showscale=True,
)
trace1 = go.Scatter(x=dff.index, y=dff.CANCELLED, mode='markers', marker=rate_marker)

data = [trace1]
layout = go.Layout(
    title='Cancellation Rate by Airlines',
    xaxis=dict(tickangle=20),
    yaxis=dict(title='Cancellation Rate'),
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

#### *Top 10 and bottom 10 cities with cancellation* 

In [65]:
# Mean of CANCELLED per origin city, then the ten highest and ten lowest rates.
dff = df.groupby('CITY')[['CANCELLED']].mean()
dff1 = dff.nlargest(n=10, columns='CANCELLED').compute()
dff2 = dff.nsmallest(n=10, columns='CANCELLED').compute()

In [66]:
# Draw the same diamond scatter twice: first the ten highest-rate cities, then the ten lowest.
for ranked, chart_title in ((dff1, 'Top 10 cities by cancelation rate'),
                            (dff2, 'Bottom 10 cities by cancelation rate')):
    diamond_marker = dict(
        symbol='diamond',
        sizemode='diameter',
        sizeref=1,
        size=30,
        color=ranked.CANCELLED,
        colorscale='Portland',
        showscale=True,
    )
    data = [go.Scatter(x=ranked.index, y=ranked.CANCELLED,
                       mode='markers', marker=diamond_marker)]
    layout = go.Layout(
        title=chart_title,
        xaxis=dict(tickangle=20),
        yaxis=dict(title='Cancellation Rate'),
    )

    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)

#### *Reasons for cancellation*

In [None]:
df['CANCELLATION_REASON'].unique().compute()

0    NaN
1      A
2      B
3      C
4      D
Name: CANCELLATION_REASON, dtype: object

In [67]:
# Cancellation code -> readable label.
reason={'A':'Airline/Carrier', 'B':'Weather', 'C':'National Air System', 'D':'Security'}
# replace() leaves values that are not dictionary keys unchanged, so running this cell
# a second time keeps the readable labels instead of turning them into NaN as map() did.
df['CANCELLATION_REASON'] = df['CANCELLATION_REASON'].replace(reason)
# CANCELLED == 1 marks a cancelled flight (0 means the flight was not cancelled)
dff = df[df.CANCELLED==1]['MONTH'].value_counts()

In [68]:
# value_counts() orders rows by frequency; compute and re-sort by month number so the
# bars read in calendar order. Labels go on a separate Index so dff is not mutated and
# the cell can be re-run safely.
monthly_cancelled = dff.compute().sort_index()
month_labels = monthly_cancelled.index.map(MONTH)
trace = go.Bar(
    x=month_labels,
    y=monthly_cancelled.values,
    marker=dict(
        color = monthly_cancelled.values,
        colorscale='Reds',
        showscale=True
    )
)

data = [trace]
layout = go.Layout(
    title='# of Cancelled Flights (monthly)',
    yaxis = dict(title = '# of Flights')
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

#### *monthly cancellation* 

In [69]:
# One bar series per cancellation reason, counting cancelled flights per month.
reason_colors = (('Weather', 'aqua'),
                 ('Airline/Carrier', 'red'),
                 ('National Air System', 'navy'))

data = []
for cause, bar_color in reason_colors:
    dff = df[df.CANCELLATION_REASON == cause].MONTH.value_counts()
    data.append(go.Bar(x=dff.index, y=dff.values, name=cause,
                       marker=dict(color=bar_color)))

layout = go.Layout(
    title='Cancellation Reasons (Monthly)',
    yaxis=dict(title='# of Flights'),
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

#### *weekly cancellation* 

In [70]:
# One bar series per cancellation reason, counting cancelled flights per day of week.
reason_colors = (('Weather', 'aqua'),
                 ('Airline/Carrier', 'red'),
                 ('National Air System', 'navy'))

data = []
for cause, bar_color in reason_colors:
    dff = df[df.CANCELLATION_REASON == cause].DAY_OF_WEEK.value_counts()
    data.append(go.Bar(x=dff.index, y=dff.values, name=cause,
                       marker=dict(color=bar_color)))

layout = go.Layout(
    title='Cancellation Reasons (Day of Week)',
    yaxis=dict(title='# of Flights'),
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

#### *cancellation by airlines*

In [71]:
# One bar series per cancellation reason, counting cancelled flights per airline.
reason_colors = (('Weather', 'aqua'),
                 ('Airline/Carrier', 'red'),
                 ('National Air System', 'navy'))

data = []
for cause, bar_color in reason_colors:
    dff = df[df.CANCELLATION_REASON == cause].AIRLINE.value_counts()
    data.append(go.Bar(x=dff.index, y=dff.values, name=cause,
                       marker=dict(color=bar_color)))

layout = go.Layout(
    title='Cancellation Reasons (by airline)',
    yaxis=dict(title='# of Flights'),
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

# Let's see some map visualization 

In [72]:
# Reuse the airports frame loaded near the top of the notebook rather than reading
# airports.csv a second time.
# NOTE(review): this rebinds `airport`, which the "Top 15 popular origins" cells use for
# a value-count Series; re-running those cells after this one will fail.
airport = airports

## arrival delay in origin and destination airport

In [73]:
def get_airport_plot(select):
    """Build the per-airport frame used by the arrival-delay map.

    Parameters
    ----------
    select : str
        Name of the flights column holding the airport code to group by
        ('ORIGIN_AIRPORT' or 'DESTINATION_AIRPORT' in this notebook).

    Returns
    -------
    Dask DataFrame
        The columns of the notebook-level ``airport`` frame (its IATA_CODE column
        renamed to ``select``), the mean ARRIVAL_DELAY per airport, and a
        ``text_plot`` column holding the hover text.
    """
    # Rename the lookup key so both sides share the same join-column name.
    data_select = airport.rename(columns={'IATA_CODE': select})
    mean_delay = df[[select, 'ARRIVAL_DELAY']].groupby(select).mean().reset_index()
    # Name the join key explicitly instead of letting merge() infer it from whichever
    # column names the two frames happen to share.
    data_select_plot = dd.merge(data_select, mean_delay, on=select)

    # Hover text shown for each airport marker on the map.
    data_select_plot['text_plot'] = ('Airport: ' + data_select_plot['AIRPORT'] + '<br>'
                                    + 'City: ' + data_select_plot['CITY'] + '<br>'
                                    + 'State: ' + data_select_plot['STATE'] + '<br>'
                                    + 'Mean arrival Delay: '
                                    + ((data_select_plot['ARRIVAL_DELAY']).astype(str) + '<br>'))

    return data_select_plot

In [74]:
airport_origin_plot = get_airport_plot('ORIGIN_AIRPORT')
airport_origin_plot.head(5).T

Unnamed: 0,0,1,2,3,4
ORIGIN_AIRPORT,ABE,ABI,ABQ,ABR,ABY
AIRPORT,Lehigh Valley International Airport,Abilene Regional Airport,Albuquerque International Sunport,Aberdeen Regional Airport,Southwest Georgia Regional Airport
CITY,Allentown,Abilene,Albuquerque,Aberdeen,Albany
STATE,PA,TX,NM,SD,GA
COUNTRY,USA,USA,USA,USA,USA
LATITUDE,40.6524,32.4113,35.0402,45.4491,31.5355
LONGITUDE,-75.4404,-99.6819,-106.609,-98.4218,-84.1945
ARRIVAL_DELAY,6.24966,3.27208,4.76826,9.9095,7.58776
text_plot,Airport: Lehigh Valley International Airport<b...,Airport: Abilene Regional Airport<br>City: Abi...,Airport: Albuquerque International Sunport<br>...,Airport: Aberdeen Regional Airport<br>City: Ab...,Airport: Southwest Georgia Regional Airport<br...


In [75]:
def delay_pct(dataplot, titlehere, delay_col='ARRIVAL_DELAY', cmin=-2, cmax=20):
    """Plot one marker per airport on a USA map, coloured by a mean-delay column.

    Parameters
    ----------
    dataplot : DataFrame
        Must provide LATITUDE, LONGITUDE, text_plot and the ``delay_col`` column.
    titlehere : str
        Figure title; '<br> Hover for value' is appended to it.
    delay_col : str, optional
        Column used to colour the markers (default 'ARRIVAL_DELAY').
    cmin, cmax : number, optional
        Lower and upper bounds of the colour range (defaults -2 and 20).

    Returns
    -------
    Whatever ``py.iplot`` returns for the assembled figure.
    """
    marker_style = dict(
        autocolorscale=False,
        cmax=cmax,
        cmin=cmin,
        color=dataplot[delay_col],
        colorbar=dict(title="Mean Delay"),
        # "rgba(102,102,102)" was malformed (rgba needs an alpha component);
        # the grey outline is expressed as a plain rgb() colour instead.
        line=dict(color="rgb(102,102,102)", width=1),
        opacity=0.8,
        size=8,
    )
    data = [dict(type='scattergeo',
                 lat=dataplot['LATITUDE'],
                 lon=dataplot['LONGITUDE'],
                 marker=marker_style,
                 text=dataplot['text_plot'],
                 mode='markers')]

    layout = dict(title=titlehere + '<br> Hover for value',
                  geo=dict(scope='usa',
                           projection_type='albers usa',
                           showland=True,
                           landcolor="rgb(250, 250, 250)",
                           subunitcolor="rgb(217, 217, 217)",
                           countrycolor="rgb(217, 217, 217)",
                           countrywidth=0.5,
                           subunitwidth=0.5))

    fig = dict(data=data, layout=layout)
    return py.iplot(fig)

In [76]:
delay_pct(airport_origin_plot, 'Flight arrival Delay of Origin Airports in 2015')

In [77]:
airport_destination_plot = get_airport_plot('DESTINATION_AIRPORT')

In [78]:
delay_pct(airport_destination_plot,
          'Flight arrival Delay of Destination Airports in 2015')

## departure delay in origin and destination airport

In [80]:
def dset_airport_plot(select):
    """Build the per-airport frame used by the departure-delay map.

    Parameters
    ----------
    select : str
        Name of the flights column holding the airport code to group by
        ('ORIGIN_AIRPORT' or 'DESTINATION_AIRPORT' in this notebook).

    Returns
    -------
    Dask DataFrame
        The columns of the notebook-level ``airport`` frame (its IATA_CODE column
        renamed to ``select``), the mean DEPARTURE_DELAY per airport, and a
        ``text_plot`` column holding the hover text.
    """
    # Rename the lookup key so both sides share the same join-column name.
    data_select = airport.rename(columns={'IATA_CODE': select})
    mean_delay = df[[select, 'DEPARTURE_DELAY']].groupby(select).mean().reset_index()
    # Name the join key explicitly instead of letting merge() infer it from whichever
    # column names the two frames happen to share.
    data_select_plot = dd.merge(data_select, mean_delay, on=select)

    # Hover text shown for each airport marker on the map.
    data_select_plot['text_plot'] = ('Airport: ' + data_select_plot['AIRPORT'] + '<br>'
                                    + 'City: ' + data_select_plot['CITY'] + '<br>'
                                    + 'State: ' + data_select_plot['STATE'] + '<br>'
                                    + 'Mean departure Delay: '
                                    + ((data_select_plot['DEPARTURE_DELAY']).astype(str) + '<br>'))

    return data_select_plot


In [81]:
airport_origin_plot = dset_airport_plot('ORIGIN_AIRPORT')
#airport_origin_plot.head(5).T

In [82]:
def delay_pct(dataplot, titlehere, delay_col='DEPARTURE_DELAY', cmin=0, cmax=30):
    """Plot one marker per airport on a USA map, coloured by a mean-delay column.

    NOTE(review): this redefines the `delay_pct` declared earlier in the notebook;
    from this cell on, the default colouring column is DEPARTURE_DELAY.

    Parameters
    ----------
    dataplot : DataFrame
        Must provide LATITUDE, LONGITUDE, text_plot and the ``delay_col`` column.
    titlehere : str
        Figure title; '<br> Hover for value' is appended to it.
    delay_col : str, optional
        Column used to colour the markers (default 'DEPARTURE_DELAY').
    cmin, cmax : number, optional
        Lower and upper bounds of the colour range (defaults 0 and 30).

    Returns
    -------
    Whatever ``py.iplot`` returns for the assembled figure.
    """
    marker_style = dict(
        autocolorscale=False,
        cmax=cmax,
        cmin=cmin,
        color=dataplot[delay_col],
        colorbar=dict(title="Mean Delay"),
        # "rgba(102,102,102)" was malformed (rgba needs an alpha component);
        # the grey outline is expressed as a plain rgb() colour instead.
        line=dict(color="rgb(102,102,102)", width=1),
        opacity=0.8,
        size=8,
    )
    data = [dict(type='scattergeo',
                 lat=dataplot['LATITUDE'],
                 lon=dataplot['LONGITUDE'],
                 marker=marker_style,
                 text=dataplot['text_plot'],
                 mode='markers')]

    layout = dict(title=titlehere + '<br> Hover for value',
                  geo=dict(scope='usa',
                           projection_type='albers usa',
                           showland=True,
                           landcolor="rgb(250, 250, 250)",
                           subunitcolor="rgb(217, 217, 217)",
                           countrycolor="rgb(217, 217, 217)",
                           countrywidth=0.5,
                           subunitwidth=0.5))

    fig = dict(data=data, layout=layout)
    return py.iplot(fig)

In [83]:
delay_pct(airport_origin_plot, 'Flight departure Delay of Origin Airports in 2015')

In [84]:
airport_destination_plot = dset_airport_plot('DESTINATION_AIRPORT')

In [85]:
delay_pct(airport_destination_plot,
          'Flight departure Delay of Destination Airports in 2015')

# line on map on another notebook