# Group Assignment - Reducer

### Reducer - Libraries & File I/O

In [1]:
# Libraries needed for reducer.
import sys
import string
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt
import json

In [2]:

# Define file I/O for the reducer.
# Both handles stay open across the following cells; they are closed explicitly
# once the last group has been summarized.
finput = open('groupassignmentdata_mapped_sorted.txt','r')  # reducer input, iterated line by line
foutput = open('groupassignmentdata_reducerout.txt','w')  # reducer output, one JSON object per line

### Reducer - Distance Function

#### Haversine Function

In [3]:
def haversine(lon1, lat1, lon2, lat2, radius=6371):
    """
    Calculate the great circle distance between two points on the earth
    (specified in decimal degrees) using the haversine formula.

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.
    radius : float, optional
        Radius of the sphere. It determines the units of the return value:
        the default 6371 gives kilometers, 3956 gives miles.

    Returns
    -------
    float
        Great circle distance between the two points, in the units of `radius`.
    """
    # Convert decimal degrees to radians.
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # Haversine formula.
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make asin() raise ValueError; clamp to the valid domain.
    c = 2 * asin(sqrt(min(1.0, a)))
    return c * radius

#### Lat. and Lon. of Beijing

In [4]:
# Latitude and Longitude of Beijing, in decimal degrees.
# These are the fixed reference point passed to 'haversine' for every distance below.
# Conversion from coordinates to lat./lon. done with 'fcc.gov'.
# https://www.fcc.gov/media/radio/dms-decimal
beijing_lat = 39.9 # 39°54′N (39 degrees, 54 minutes north)
beijing_lon = 116.4 # 116°24′E (116 degrees, 24 minutes east)

#### Example of Haversine Function Output

In [5]:
# Sanity check of 'haversine': distance between McGill University and Beijing.
# Note the argument order expected by 'haversine': (lon1, lat1, lon2, lat2).
# NOTE(review): the trailing comments below quote 45.5048° N / 73.5772° W, which
# differ slightly from the values actually assigned — confirm which is intended.
mcgill_lat = 45.5041  # 45.5048° N
mcgill_lon = -73.5747  # 73.5772° W

mcgill_beijing = haversine(beijing_lon, beijing_lat, mcgill_lon, mcgill_lat)
print(f"The distance between McGill University and Beijing is {round(mcgill_beijing, 2)} km.")

The distance between McGill University and Beijing is 10466.65 km.


### Reducer - Output

In [6]:
# Writer for the reducer's summarized records.
def reduceroutput(data):
    """Write one summarized record to the reducer output file.

    `data` is serialized with `json.dumps` and written to the module-level
    `foutput` handle as a single newline-terminated line.
    """
    foutput.write(f"{json.dumps(data)}\n")

### Reducer - Group Values by Key

In [7]:
# Module-level state that 'grouper' carries from one input line to the next:
#   previous_key - key of the most recently processed line ("" before any line)
#   data_array   - parsed values collected so far for the current key
previous_key, data_array = "", []

In [8]:
# Group consecutive input lines that share the same key.
def grouper(line):
    """Feed one "key<TAB>json-value" line into the current group.

    Lines are expected to arrive sorted by key. While the key stays the same
    (or no line has been seen yet) the parsed value is buffered in the global
    `data_array`. When the key changes, the buffered group is handed to
    `summarize` and a new group is started with the current value.
    """
    global previous_key, data_array
    key, value = line.split("\t", 1)

    # A key change can only happen once a previous key has been recorded.
    if previous_key != "" and key != previous_key:
        summarize(data_array)
        data_array = [json.loads(value)]
    else:
        data_array.append(json.loads(value))

    previous_key = key

### Reducer - Summarization

In [9]:
# Function to summarize data including aggregation calculation of distance.
def summarize(data_array):
    """Summarize one group of records and write the result to the reducer output.

    The output record takes "ident" and "id" from the first record of the group
    and "distance" as the haversine distance between Beijing and the position
    ("lon"/"lat") of the last record of the group.

    Parameters
    ----------
    data_array : list of dict
        Non-empty list of records sharing the same key; each record provides
        "ident", "id", "lon" and "lat".
    """
    first_record = data_array[0]
    last_record = data_array[-1]

    data = {
        "ident": first_record["ident"],
        "id": first_record["id"],
        # "lon"/"lat" are passed through float() before the distance calculation.
        "distance": haversine(
            beijing_lon,
            beijing_lat,
            float(last_record["lon"]),
            float(last_record["lat"]),
        ),
    }

    reduceroutput(data)

### Reducer - Input Splitter

In [10]:
for line in finput:
    # Lines yielded by a file iterator keep their trailing newline, so a blank
    # line arrives as "\n" and never as "". Strip before testing so blank lines
    # are skipped instead of reaching 'grouper', whose tab split needs both a
    # key and a value.
    if line.strip():
        grouper(line)

In [11]:
# 'grouper' only summarizes a group when the next key arrives, so the group
# still buffered after the loop has to be written out here.
if data_array:
    summarize(data_array)

# All output has been written; release both file handles.
foutput.close()
finput.close()

### Convert the Reducer Output in Text File to Dataframe

Load the reducer output (i.e. "groupassignmentdata_reducerout.txt") into a pandas DataFrame, which is later written to a CSV file.

In [12]:
# Read the reducer output back in; it holds one JSON object per line.
with open('groupassignmentdata_reducerout.txt') as f:
    lines = f.readlines()

# Parse each line into a dict, then build the dataframe from the records.
data_output = [json.loads(line) for line in lines]

df = pd.DataFrame(data_output)
df

Unnamed: 0,ident,id,distance
0,40,40-1535813644-adhoc-0,10941.118810
1,9H8362,9H8362-1535606746-airline-0081,914.219056
2,9H8392,9H8392-1535606746-airline-0346,1370.952194
3,9H8400,9H8400-1535606746-airline-0462,430.081098
4,A07185,A07185-1535606746-airline-0261,19401.473742
...,...,...,...
9742,YZR7504,YZR7504-1535606759-airline-0101,1088.635975
9743,YZR7510,YZR7510-1535606759-airline-0131,1204.133586
9744,YZR7515,YZR7515-1535606759-airline-0420,1073.017833
9745,YZR7522,YZR7522-1535606759-airline-0499,1607.319845


### Sort the dataframe of all flights from closest to farthest from Beijing

In [13]:
# Order the flights from closest to farthest from Beijing (ascending distance).
df_sorted = df.sort_values(by='distance', ascending=True)
df_sorted

Unnamed: 0,ident,id,distance
2841,CSN6284,CSN6284-1535606749-airline-0547,19.736027
2901,CSN8670,CSN8670-1535606749-airline-0070,29.207952
2748,CSN3107,CSN3107-1535606749-airline-0185,32.495838
2348,CHH7136,CHH7136-1535606751-airline-0050,32.870432
2826,CSN6118,CSN6118-1535606749-airline-0104,33.230264
...,...,...,...
1469,AUT2600,AUT2600-1535606747-airline-0008,19573.936391
1152,ANS860,ANS860-1535642100-schedule-0001,19588.829465
1196,ARG1681,ARG1681-1535606747-airline-0211,19645.969261
1475,AUT2881,AUT2881-1535606747-airline-0131,19718.507110


### Generate CSV file from the sorted dataframe above

In [14]:
# Command for CSV output of dataframe.
# index=False leaves the dataframe's row index out of the file.
df_sorted.to_csv('flight_list_sorted_by_distance.csv', index=False)

------------------------------

### Data Analysis

#### [1] Analysis by Flight

In [15]:
# Reset the row index in dataframe for further analysis.
# drop=True discards the old (pre-sort) index instead of keeping it as a column,
# so rows are numbered from 0 in ascending distance order.
df_sorted = df_sorted.reset_index(drop=True)
df_sorted

Unnamed: 0,ident,id,distance
0,CSN6284,CSN6284-1535606749-airline-0547,19.736027
1,CSN8670,CSN8670-1535606749-airline-0070,29.207952
2,CSN3107,CSN3107-1535606749-airline-0185,32.495838
3,CHH7136,CHH7136-1535606751-airline-0050,32.870432
4,CSN6118,CSN6118-1535606749-airline-0104,33.230264
...,...,...,...
9742,AUT2600,AUT2600-1535606747-airline-0008,19573.936391
9743,ANS860,ANS860-1535642100-schedule-0001,19588.829465
9744,ARG1681,ARG1681-1535606747-airline-0211,19645.969261
9745,AUT2881,AUT2881-1535606747-airline-0131,19718.507110


In [16]:
# Print statements for Min, Max & Mean of distance data.
# 'df_sorted' is in ascending distance order, so the first row is the closest
# flight and the last row is the farthest. Positional access (.iloc) is used so
# the cell does not depend on a hard-coded row count.
print(f"Closest Distance Data: Flight = {df_sorted['ident'].iloc[0]} | Distance = {df_sorted['distance'].iloc[0]}")
print(f"Farthest Distance Data: Flight = {df_sorted['ident'].iloc[-1]} | Distance = {df_sorted['distance'].iloc[-1]}")
print(f"Average Distance of All Flights: Total Number of Flights = {len(df_sorted)} | Average Distance = {np.mean(df_sorted['distance'])}")

Closest Distance Data: Flight = CSN6284 | Distance = 19.736027215626947
Farthest Distance Data: Flight = ARG1554 | Distance = 19721.58687817828
Average Distance of All Flights: Total Number of Flights = 9747 | Average Distance = 8381.598492434723


#### [2] Analysis by Airline
* [2-a] Create a table with the following fields:
    > - airline (the first three characters of flight "ident")
    > - count (count of flights by the same airline)
    > - distance_sum (total distance per airline)
    > - distance_mean (average distance per airline)
* [2-b] Generate a table sorted by distance_mean in ascending order
* [2-c] Generate a table sorted by count in descending order

##### [2-a] Create a table with the following fields

In [17]:
# Add the helper columns used to build the per-airline table:
#   distance_2 - copy of 'distance', so the same values can be aggregated twice
#   count      - constant 1 per flight, so that summing it counts flights
df_sorted = df_sorted.assign(distance_2=df_sorted["distance"], count=1)
df_sorted.info()
df_sorted

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9747 entries, 0 to 9746
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ident       9747 non-null   object 
 1   id          9747 non-null   object 
 2   distance    9747 non-null   float64
 3   distance_2  9747 non-null   float64
 4   count       9747 non-null   int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 380.9+ KB


Unnamed: 0,ident,id,distance,distance_2,count
0,CSN6284,CSN6284-1535606749-airline-0547,19.736027,19.736027,1
1,CSN8670,CSN8670-1535606749-airline-0070,29.207952,29.207952,1
2,CSN3107,CSN3107-1535606749-airline-0185,32.495838,32.495838,1
3,CHH7136,CHH7136-1535606751-airline-0050,32.870432,32.870432,1
4,CSN6118,CSN6118-1535606749-airline-0104,33.230264,33.230264,1
...,...,...,...,...,...
9742,AUT2600,AUT2600-1535606747-airline-0008,19573.936391,19573.936391,1
9743,ANS860,ANS860-1535642100-schedule-0001,19588.829465,19588.829465,1
9744,ARG1681,ARG1681-1535606747-airline-0211,19645.969261,19645.969261,1
9745,AUT2881,AUT2881-1535606747-airline-0131,19718.507110,19718.507110,1


In [18]:
# Helper that extracts the airline code from a flight identifier.
def get_airline(string):
    """Return the first three characters of `string`.

    Inputs shorter than three characters are returned unchanged.
    """
    return string[:3]

In [19]:
# Derive the 'airline' column by running 'get_airline' on every 'ident' value.
df_sorted["airline"] = df_sorted["ident"].map(get_airline)

# Inspect the result.
df_sorted.info()
df_sorted

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9747 entries, 0 to 9746
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ident       9747 non-null   object 
 1   id          9747 non-null   object 
 2   distance    9747 non-null   float64
 3   distance_2  9747 non-null   float64
 4   count       9747 non-null   int64  
 5   airline     9747 non-null   object 
dtypes: float64(2), int64(1), object(3)
memory usage: 457.0+ KB


Unnamed: 0,ident,id,distance,distance_2,count,airline
0,CSN6284,CSN6284-1535606749-airline-0547,19.736027,19.736027,1,CSN
1,CSN8670,CSN8670-1535606749-airline-0070,29.207952,29.207952,1,CSN
2,CSN3107,CSN3107-1535606749-airline-0185,32.495838,32.495838,1,CSN
3,CHH7136,CHH7136-1535606751-airline-0050,32.870432,32.870432,1,CHH
4,CSN6118,CSN6118-1535606749-airline-0104,33.230264,33.230264,1,CSN
...,...,...,...,...,...,...
9742,AUT2600,AUT2600-1535606747-airline-0008,19573.936391,19573.936391,1,AUT
9743,ANS860,ANS860-1535642100-schedule-0001,19588.829465,19588.829465,1,ANS
9744,ARG1681,ARG1681-1535606747-airline-0211,19645.969261,19645.969261,1,ARG
9745,AUT2881,AUT2881-1535606747-airline-0131,19718.507110,19718.507110,1,AUT


In [20]:
# Generate Table: one row per airline with the flight count, the total distance
# and the mean distance.
# The aggregations are named by string ('sum' / 'mean') instead of passing
# np.sum / np.mean: recent pandas versions deprecate passing those callables
# and recommend the string names to keep the same behavior.
table = df_sorted.pivot_table(values=['distance', 'count', 'distance_2'], index=['airline'],
                    aggfunc={'count': 'sum', 'distance_2': 'mean', 'distance': 'sum'})

table = table.reset_index()  # reset the index row to change 'airline' from index to column

# Rename the columns to describe the aggregation each one holds.
table = table.rename({'distance': 'distance_sum', 'distance_2': 'distance_mean'}, axis=1)

# Check dataframe.
table.info()
table

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   airline        660 non-null    object 
 1   count          660 non-null    int64  
 2   distance_sum   660 non-null    float64
 3   distance_mean  660 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 20.8+ KB


Unnamed: 0,airline,count,distance_sum,distance_mean
0,40,1,10941.118810,10941.118810
1,9H8,3,2715.252349,905.084116
2,A07,1,19401.473742,19401.473742
3,AAF,3,27282.585416,9094.195139
4,AAH,1,8057.555237,8057.555237
...,...,...,...,...
655,XLF,3,24924.042682,8308.014227
656,XOJ,6,63871.915928,10645.319321
657,XSR,2,21356.161106,10678.080553
658,YV3,1,12932.696488,12932.696488


#### [2-b] Generate a table sorted by distance_mean in ascending order

In [21]:
# Sort 'table' by 'distance_mean' in ascending order.
table_sorted_distance = table.sort_values(by='distance_mean', ascending=True)
table_sorted_distance

Unnamed: 0,airline,count,distance_sum,distance_mean
228,GDC,2,141.856696,70.928348
159,CUA,6,2900.995180,483.499197
329,LTU,1,520.822957,520.822957
498,QDA,6,4231.301527,705.216921
248,HBH,6,5122.236125,853.706021
...,...,...,...,...
55,ATM,2,37633.291233,18816.645617
60,AUT,14,269486.942019,19249.067287
82,BOV,1,19281.817971,19281.817971
2,A07,1,19401.473742,19401.473742


In [22]:
# Reset the row index in dataframe for further analysis.
# drop=True discards the old index, so rows are numbered from 0 in ascending
# 'distance_mean' order.
table_sorted_distance = table_sorted_distance.reset_index(drop=True)
table_sorted_distance

Unnamed: 0,airline,count,distance_sum,distance_mean
0,GDC,2,141.856696,70.928348
1,CUA,6,2900.995180,483.499197
2,LTU,1,520.822957,520.822957
3,QDA,6,4231.301527,705.216921
4,HBH,6,5122.236125,853.706021
...,...,...,...,...
655,ATM,2,37633.291233,18816.645617
656,AUT,14,269486.942019,19249.067287
657,BOV,1,19281.817971,19281.817971
658,A07,1,19401.473742,19401.473742


In [23]:
# 'table_sorted_distance' is in ascending 'distance_mean' order, so the first
# row holds the lowest mean and the last row the highest. Positional access
# (.iloc) is used so the cell does not depend on a hard-coded number of airlines.
print(f"Lowest Distance Mean: Airline = {table_sorted_distance['airline'].iloc[0]} | Distance Mean = {table_sorted_distance['distance_mean'].iloc[0]}")
print(f"Highest Distance Mean: Airline = {table_sorted_distance['airline'].iloc[-1]} | Distance Mean = {table_sorted_distance['distance_mean'].iloc[-1]}")

Lowest Distance Mean: Airline = GDC | Distance Mean = 70.92834777629636
Highest Distance Mean: Airline = ANS | Distance Mean = 19588.829464803355


#### [2-c] Generate a table sorted by count in descending order

In [24]:
# Sort 'table' by 'count' in descending order (airline with the most flights first).
table_sorted_count = table.sort_values(by='count', ascending=False)
table_sorted_count

Unnamed: 0,airline,count,distance_sum,distance_mean
5,AAL,443,4.862450e+06,10976.185108
168,DAL,389,4.094948e+06,10526.859213
558,SWA,363,3.942718e+06,10861.482880
606,UAL,337,3.443044e+06,10216.747454
522,RYR,239,2.013670e+06,8425.397276
...,...,...,...,...
118,CGI,1,9.192029e+03,9192.028538
482,PEG,1,1.076254e+04,10762.540842
117,CGH,1,7.851673e+03,7851.673380
116,CGD,1,1.013179e+04,10131.785410


In [25]:
# Reset the row index in dataframe for further analysis.
# drop=True discards the old index, so rows are numbered from 0 in descending
# 'count' order.
table_sorted_count = table_sorted_count.reset_index(drop=True)
table_sorted_count

Unnamed: 0,airline,count,distance_sum,distance_mean
0,AAL,443,4.862450e+06,10976.185108
1,DAL,389,4.094948e+06,10526.859213
2,SWA,363,3.942718e+06,10861.482880
3,UAL,337,3.443044e+06,10216.747454
4,RYR,239,2.013670e+06,8425.397276
...,...,...,...,...
655,CGI,1,9.192029e+03,9192.028538
656,PEG,1,1.076254e+04,10762.540842
657,CGH,1,7.851673e+03,7851.673380
658,CGD,1,1.013179e+04,10131.785410


In [26]:
# Report the airline with the most flights: after the descending sort and the
# index reset, that airline is the row with label 0.
print(
    f"Most Flight Counts: Airline = {table_sorted_count.loc[0, 'airline']} "
    f"| Number of Counts = {table_sorted_count.loc[0, 'count']} "
    f"| Distance Mean = {table_sorted_count.loc[0, 'distance_mean']}"
)

Most Flight Counts: Airline = AAL | Number of Counts = 443 | Distance Mean = 10976.185107712423
