# Cleaning Data - Flights

The data set which contains information on United State flight delays and performance comes from RITA. 

In [1]:
import pandas as pd

In [3]:
flights = pd.read_csv('2008.csv')

In [3]:
flights.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,...,4.0,8.0,0,,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,...,5.0,10.0,0,,0,,,,,
2,2008,1,3,4,628.0,620,804.0,750,WN,448,...,3.0,17.0,0,,0,,,,,
3,2008,1,3,4,926.0,930,1054.0,1100,WN,1746,...,3.0,7.0,0,,0,,,,,
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,...,3.0,10.0,0,,0,2.0,0.0,0.0,0.0,32.0


The Variable descriptions are as follwes:
1. Year	1987-2008
2. Month	1-12
3. DayofMonth	1-31
4. DayOfWeek	1 (Monday) - 7 (Sunday)
5. DepTime	actual departure time (local, hhmm)
6. CRSDepTime	scheduled departure time (local, hhmm)
7. ArrTime	actual arrival time (local, hhmm)
8. CRSArrTime	scheduled arrival time (local, hhmm)
9. UniqueCarrier	unique carrier code
10. FlightNum	flight number
11. TailNum	plane tail number
12. ActualElapsedTime	in minutes
13. CRSElapsedTime	in minutes
14. AirTime	in minutes
15. ArrDelay	arrival delay, in minutes
16. DepDelay	departure delay, in minutes
17. Origin	origin IATA airport code
18. Dest	destination IATA airport code
19. Distance	in miles
20. TaxiIn	taxi in time, in minutes
21. TaxiOut	taxi out time in minutes
22. Cancelled	was the flight cancelled?
23. CancellationCode	reason for cancellation (A = carrier, B = weather, C = NAS, D = security)
24. Diverted	1 = yes, 0 = no
25. CarrierDelay	in minutes
26. WeatherDelay	in minutes
27. NASDelay	in minutes
28. SecurityDelay	in minutes
29. LateAircraftDelay	in minutes

In [4]:
fl=flights.loc[(flights.Cancelled == 0),['Month','DayofMonth','DayofWeek','ArrDelay','DepDelay','Origin','Dest','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay']]

In [11]:
fl.head()

Unnamed: 0,Month,DayofMonth,DayofWeek,ArrDelay,DepDelay,Origin,Dest,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,1,3,,-14.0,8.0,IAD,TPA,,,,,
1,1,3,,2.0,19.0,IAD,TPA,,,,,
2,1,3,,14.0,8.0,IND,BWI,,,,,
3,1,3,,-6.0,-4.0,IND,BWI,,,,,
4,1,3,,34.0,34.0,IND,BWI,2.0,0.0,0.0,0.0,32.0


In [5]:
fl_origin = fl.groupby(['Month','Origin'])[['DepDelay']].sum()

In [6]:
fl_origin.reset_index(inplace=True)

In [14]:
fl_origin.head()

Unnamed: 0,Month,Origin,DepDelay
0,1,ABE,5234.0
1,1,ABI,3383.0
2,1,ABQ,25870.0
3,1,ABY,1597.0
4,1,ACT,1811.0


In [8]:
fl_dest = fl.groupby(['Month','Dest'])[['ArrDelay']].sum()

In [9]:
fl_dest.reset_index(inplace=True)

In [13]:
fl_dest.head()

Unnamed: 0,Month,Dest,ArrDelay
0,1,ABE,5876.0
1,1,ABI,2204.0
2,1,ABQ,22234.0
3,1,ABY,1304.0
4,1,ACT,2279.0


In [22]:
fl2=pd.merge(fl_origin,fl_dest,left_on=['Origin','Month'],right_on=['Dest','Month'])

In [25]:
fl3=fl2.drop('Dest',axis=1)

In [26]:
fl3.head()

Unnamed: 0,Month,Origin,DepDelay,ArrDelay
0,1,ABE,5234.0,5876.0
1,1,ABI,3383.0,2204.0
2,1,ABQ,25870.0,22234.0
3,1,ABY,1597.0,1304.0
4,1,ACT,1811.0,2279.0


In [27]:
iata=pd.read_csv('airports.csv')

In [28]:
iata.head()

Unnamed: 0,iata,airport,city,state,country,lat,long
0,00M,Thigpen,Bay Springs,MS,USA,31.953765,-89.234505
1,00R,Livingston Municipal,Livingston,TX,USA,30.685861,-95.017928
2,00V,Meadow Lake,Colorado Springs,CO,USA,38.945749,-104.569893
3,01G,Perry-Warsaw,Perry,NY,USA,42.741347,-78.052081
4,01J,Hilliard Airpark,Hilliard,FL,USA,30.688012,-81.905944


In [30]:
fl4=pd.merge(fl3,iata,left_on='Origin',right_on='iata')

In [31]:
fl4.head()

Unnamed: 0,Month,Origin,DepDelay,ArrDelay,iata,airport,city,state,country,lat,long
0,1,ABE,5234.0,5876.0,ABE,Lehigh Valley International,Allentown,PA,USA,40.652363,-75.440402
1,2,ABE,3748.0,6697.0,ABE,Lehigh Valley International,Allentown,PA,USA,40.652363,-75.440402
2,3,ABE,3873.0,5521.0,ABE,Lehigh Valley International,Allentown,PA,USA,40.652363,-75.440402
3,4,ABE,-70.0,1746.0,ABE,Lehigh Valley International,Allentown,PA,USA,40.652363,-75.440402
4,5,ABE,1232.0,546.0,ABE,Lehigh Valley International,Allentown,PA,USA,40.652363,-75.440402


In [32]:
fl4.to_csv('2008_Airports_Delay.csv')