# Assignment 3

Import libraries and define common helper functions

In [1]:
import os
import sys
import gzip
import json
from pathlib import Path
import csv

import pandas as pd
import s3fs
import pyarrow as pa
from pyarrow.json import read_json
import pyarrow.parquet as pq
import fastavro
from fastavro import parse_schema
from fastavro import writer
import pygeohash
import snappy
import jsonschema
from jsonschema.exceptions import ValidationError


endpoint_url='https://storage.budsc.midwest-datascience.com'

current_dir = Path(os.getcwd()).absolute()
schema_dir = current_dir.joinpath('schemas')
results_dir = current_dir.joinpath('results')
results_dir.mkdir(parents=True, exist_ok=True)


def read_jsonl_data():
    s3 = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={
            'endpoint_url': endpoint_url
        }
    )
    src_data_path = 'data/processed/openflights/routes.jsonl.gz'
    with s3.open(src_data_path, 'rb') as f_gz:
        with gzip.open(f_gz, 'rb') as f:
            records = [json.loads(line) for line in f.readlines()]
        

    return records

Load the records from https://storage.budsc.midwest-datascience.com/data/processed/openflights/routes.jsonl.gz 

In [2]:
records = read_jsonl_data()

## 3.1

### 3.1.a JSON Schema

In [3]:
def validate_jsonl_data(records):
    schema_path = schema_dir.joinpath('routes-schema.json')
    with open(schema_path) as f:
        schema = json.load(f)

    validation_csv_path = results_dir.joinpath('validation-results.csv')    
    with open(validation_csv_path, 'w', newline='') as f:    
        writer = csv.writer(f)
        for i, record in enumerate(records):
            try:
                ## TODO: Validate record 
                jsonschema.validate(record, schema)
#                 pass
            except ValidationError as e:
                ## Print message if invalid record
                f.write(f"Error: {e.message}; failed validating {e.validator} in schema {e.schema_path}\r\n")
                print(e)

            
validate_jsonl_data(records)

### 3.1.b Avro

In [4]:
def create_avro_dataset(records):
    schema_path = schema_dir.joinpath('routes.avsc')
    data_path = results_dir.joinpath('routes.avro')
    
    ## TODO: Use fastavro to create Avro dataset
    # load schema .avro file
    with open(schema_path,'r') as f:
        schema = json.load(f)
    # parse schema
    parsed_schema = parse_schema(schema)
    # write record according to schema
    with open(data_path, 'wb') as out:
        writer(out, parsed_schema, records)

        
create_avro_dataset(records)

In [5]:
# # verify data reads back:
# from fastavro import reader
# with open(data_path, 'rb') as fo:
#     avro_reader = reader(fo)
#     for record in avro_reader:
#         print(record)

### 3.1.c Parquet

In [6]:
def create_parquet_dataset():
    src_data_path = 'data/processed/openflights/routes.jsonl.gz'
    parquet_output_path = results_dir.joinpath('routes.parquet')
    s3 = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={
            'endpoint_url': endpoint_url
        }
    )
    
    with s3.open(src_data_path, 'rb') as f_gz:
        with gzip.open(f_gz, 'rb') as f:
            ## TODO: Use Apache Arrow to create Parquet table and save the dataset
            
            # read json into Parquet table:
            table = read_json(f)
    
    # write Parquet table to `.parquet` file:
    pq.write_table(table, parquet_output_path)

    
create_parquet_dataset()

### 3.1.d Protocol Buffers

In [7]:
sys.path.insert(0, os.path.abspath('routes_pb2'))

import routes_pb2

def _airport_to_proto_obj(airport):
    obj = routes_pb2.Airport()
    if airport is None:
        return None
    if airport.get('airport_id') is None:
        return None

    obj.airport_id = airport.get('airport_id')
    if airport.get('name'):
        obj.name = airport.get('name')
    if airport.get('city'):
        obj.city = airport.get('city')
    if airport.get('iata'):
        obj.iata = airport.get('iata')
    if airport.get('icao'):
        obj.icao = airport.get('icao')
    if airport.get('altitude'):
        obj.altitude = airport.get('altitude')
    if airport.get('timezone'):
        obj.timezone = airport.get('timezone')
    if airport.get('dst'):
        obj.dst = airport.get('dst')
    if airport.get('tz_id'):
        obj.tz_id = airport.get('tz_id')
    if airport.get('type'):
        obj.type = airport.get('type')
    if airport.get('source'):
        obj.source = airport.get('source')

    obj.latitude = airport.get('latitude')
    obj.longitude = airport.get('longitude')

    return obj


def _airline_to_proto_obj(airline):
    obj = routes_pb2.Airline()
    ## TODO: Create an Airline obj using Protocol Buffers API
    # Check for airline / id
    if airline is None:
        return None
    if airline.get('airline_id') is None:
        return None
    
    # Get airline info
    obj.airline_id = airline.get('airline_id')
    if airline.get('name'):
        obj.name = airline.get('name')
    if airline.get('alias'):
        obj.alias = airline.get('alias')
    if airline.get('iata'):
        obj.iata = airline.get('iata')
    if airline.get('icao'):
        obj.icao = airline.get('icao')
    if airline.get('callsign'):
        obj.callsign = airline.get('callsign')
    if airline.get('country'):
        obj.country = airline.get('country')
    obj.active = airline.get('active') # boolean
        
    return obj


def create_protobuf_dataset(records):
    routes = routes_pb2.Routes()
    for record in records:
        route = routes_pb2.Route()
        
        ## TODO: Implement the code to create the Protocol Buffers Dataset
        # Copy 'airline' data
        airline = _airline_to_proto_obj(record.get('airline'))
        if airline:
            route.airline.CopyFrom(airline)
        
        # Copy 'src_airport' data
        src_airport = _airport_to_proto_obj(record.get('src_airport'))
        if src_airport:
            route.src_airport.CopyFrom(src_airport)
        
        # Copy 'dst_airport' data
        dst_airport = _airport_to_proto_obj(record.get('dst_airport'))
        if dst_airport:
            route.dst_airport.CopyFrom(dst_airport)
        
        # Get 'codeshare' boolean
        route.codeshare = record.get('codeshare')
        
        # Get 'equipment' and iterate through for multiple
        equipment = record.get('equipment')
        for equip in equipment:
            route.equipment.append(equip)
        
        # Add generated route to db of routes
        routes.route.append(route)

    data_path = results_dir.joinpath('routes.pb')

    with open(data_path, 'wb') as f:
        f.write(routes.SerializeToString())
        
    compressed_path = results_dir.joinpath('routes.pb.snappy')
    
    with open(compressed_path, 'wb') as f:
        f.write(snappy.compress(routes.SerializeToString()))

In [8]:
create_protobuf_dataset(records)

## 3.2

### 3.2.a Simple Geohash Index

In [9]:
geo_records = read_jsonl_data()

In [10]:
def create_hash_dirs(records):
    geoindex_dir = results_dir.joinpath('geoindex')
    geoindex_dir.mkdir(exist_ok=True, parents=True)
#     hashes = []

    ## TODO: Create hash index
    hashes, hash_set = [], set()
    for record in records: # iterate records
        origin_data = record.get('src_airport') # get source airport info
        if origin_data: # if source airport available, get lat/lon
            lat, lon = origin_data.get('latitude'), origin_data.get('longitude')
            hash_set.add(pygeohash.encode(lat, lon, precision=3)) # get 3-digit unique hash
            record['src_airport']['geohash'] = pygeohash.encode(lat, lon) # add full hash to record

    hashes = sorted(list(hash_set)) # sort unique hashes
    index = {value: [] for value in hashes} # set up hash index
    
    for record in records: 
        # iterate through records, add to appropriate hash index
        if record.get('src_airport'):
            geohash = record['src_airport'].get('geohash')
            index[geohash[:3]].append(record)
    
    for key, values in index.items():
        # create folder / subfolder directories by short hash key
        output_dir = geoindex_dir.joinpath(str(key[:1])).joinpath(str(key[:2]))
        output_dir.mkdir(exist_ok=True, parents=True)
        output_path = output_dir.joinpath(f'{key}.jsonl.gz')
        # save record to appropriate subfolder/file
        with gzip.open(output_path, 'w') as f:
            json_output = '\n'.join([json.dumps(value) for value in values])
            f.write(json_output.encode('utf-8'))

In [11]:
create_hash_dirs(geo_records)

### 3.2.b Simple Search Feature

In [12]:
def airport_search(latitude=41.1499988, longitude=-95.91779, distance=1000):
    ## TODO: Create simple search to return nearest airport
    homeHash = pygeohash.encode(latitude, longitude)
    airportSet = set()
    for record in geo_records:
        airport = record['src_airport']
        if airport:
            airportHash = pygeohash.encode(airport['latitude'], airport['longitude'])
            distToLoc = pygeohash.geohash_approximate_distance(homeHash, airportHash)/1000
            if distToLoc <= distance:
                airportSet.add(airport['name'])
    airportList = sorted(list(airportSet))
    
    print(f'The following airports are within {distance} km of ({latitude}, {longitude}):\n')
    for airport in airportList:
        print(' -', airport)
#     pass

41.1499988

-95.91779

In [13]:
lat = float(input("Enter your latitude:\t"))
lon = float(input("Enter your longitude:\t"))
dist = float(input("Input search radius (km): "))

airport_search(lat, lon, dist)

Enter your latitude:	41.1499988
Enter your longitude:	-95.91779
Input search radius (km): 420
The following airports are within 420.0 km of (41.1499988, -95.91779):

 - Eppley Airfield
 - Lincoln Airport
