In [None]:
# Install all modules to help with data cleaning and storage
!pip install flatdict==4.0.1
!pip install google-cloud-bigquery
!pip install --upgrade google-cloud-bigquery-storage

In [None]:
# Import get and post requests from library
from requests import get,post
import os
import flatdict
from google.cloud import bigquery
from google.oauth2 import service_account

In [None]:
# documentation - https://aviationstack.com/documentation

In [None]:
# Api keys
# Secrets are read from the environment so they never live in the notebook.
# Set API_KEY (flights) and WEATHER_API_KEY (weather) before starting the kernel.
# NOTE(review): keys that were previously committed in this cell should be
# treated as compromised and rotated.
def _key_from_env(name):
    """Return the value of environment variable `name`, or "" (with a warning) if it is unset."""
    value = os.environ.get(name, "")
    if not value:
        print("Warning: environment variable {} is not set; requests that need it will fail.".format(name))
    return value

key = _key_from_env("API_KEY")
weather_api_key = _key_from_env("WEATHER_API_KEY")

In [None]:
# Endpoints are all built from the same API root.
_API_ROOT = "http://api.aviationstack.com/v1"

# Flight and airline information
flights_url = _API_ROOT + "/flights"
airlines_url = _API_ROOT + "/airlines"

# Current and historical weather
current_weather_url = _API_ROOT + "/current"
historical_weather_url = _API_ROOT + "/historical"

In [None]:
# Helper function to get data from the apis
def get_from_api(url, query, api_key=key, timeout=30):
    """Send a GET request to an API endpoint and return the decoded JSON body.

    Args:
        url: Endpoint URL without a query string.
        query: Extra query-string parameters, already formatted and starting
            with "&" (for example "&airline_iata=BA").
        api_key: Access key sent as the `access_key` parameter; defaults to
            the module-level `key`.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the kernel indefinitely.

    Returns:
        The response body parsed from JSON.
    """
    # The access key travels in the query string, so avoid printing this URL.
    request_url = url + "?access_key=" + api_key + query
    response = get(request_url, timeout=timeout)
    return response.json()

In [None]:
# Fetch flights for airline IATA code BA
ba_flights = get_from_api(flights_url, "&airline_iata=BA")

# Try fetching the current weather in London, using the weather API key
london_weather = get_from_api(
    current_weather_url,
    "&query=london",
    api_key=weather_api_key,
)

# Show the weather response as the cell output
london_weather

In [None]:
# Not used right now
# [airline for airline in airlines['data'] if airline['airline_name'] == 'British Airways']

In [None]:
# Fetch flights for airline IATA code BA again, this time with offset=100
ba_flights = get_from_api(
    flights_url,
    "&airline_iata=BA&offset=100",
)

In [None]:
# Build Google service-account credentials from the local key file
credentials = service_account.Credentials.from_service_account_file('credentials.json')

In [None]:
# Construct a BigQuery client object.
# The project is taken from the service-account credentials explicitly so the
# client does not depend on whatever default project the environment provides.
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

In [None]:
def create_dataset(dataset_id, location="US"):
    """Create a BigQuery dataset in the client's project if it does not exist yet.

    Args:
        dataset_id: Dataset name without the project prefix; the client's
            project is prepended to form `project.dataset`.
        location: Geographic location where the dataset should reside.
            Defaults to "US".

    Any error raised by the API call is printed rather than propagated, so
    the notebook keeps running.
    """
    # Construct a full Dataset ID in the format `project.dataset`.
    full_dataset_id = "{}.{}".format(client.project, dataset_id)

    # Construct a Dataset object and set where it should reside
    dataset = bigquery.Dataset(full_dataset_id)
    dataset.location = location

    try:
        # exists_ok=True makes re-running this cell a no-op instead of an
        # "already exists" error.
        dataset = client.create_dataset(dataset, exists_ok=True)  # API request
    except Exception as e:
        print("Error in creating dataset: ", e)
        return
    print("Dataset {}.{} is ready".format(client.project, dataset.dataset_id))

# Make dataset
create_dataset('weather_data')

In [91]:
from google.cloud.bigquery import SchemaField

# Use a schema to create the BigQuery table that stores the weather data
def setup_bigquery_table(project='data-engineering-416410', dataset_id='weather_data', table_name='weather info updated 4'):
    """Create the weather table in BigQuery if it does not exist yet.

    Args:
        project: Google Cloud project that owns the table.
        dataset_id: Dataset that will contain the table.
        table_name: Name of the table to create.

    The defaults reproduce the table id this notebook has always used.
    Column names match the keys of a weather response flattened with "__" as
    the delimiter. Any error raised by the API call is printed rather than
    propagated.
    """
    table_id = "{}.{}.{}".format(project, dataset_id, table_name)

    # (column name, BigQuery type) in column order; every column is REQUIRED.
    columns = [
        ("request__type", "STRING"),
        ("request__query", "STRING"),
        ("request__language", "STRING"),
        ("request__unit", "STRING"),
        ("location__country", "STRING"),
        ("location__name", "STRING"),
        ("location__region", "STRING"),
        ("location__lat", "STRING"),
        ("location__lon", "STRING"),
        ("location__timezone_id", "STRING"),
        ("location__localtime", "DATE"),
        ("location__localtime_epoch", "INTEGER"),
        ("location__utc_offset", "STRING"),
        ("current__observation_time", "STRING"),
        ("current__temperature", "INTEGER"),
        ("current__weather_code", "INTEGER"),
        ("current__weather_icons", "STRING"),
        ("current__weather_descriptions", "STRING"),
        ("current__wind_speed", "INTEGER"),
        ("current__wind_degree", "INTEGER"),
        ("current__wind_dir", "STRING"),
        ("current__pressure", "INTEGER"),
        # NOTE(review): precipitation may arrive as a fractional value, which
        # an INTEGER column would reject — confirm against the API.
        ("current__precip", "INTEGER"),
        ("current__humidity", "INTEGER"),
        ("current__cloudcover", "INTEGER"),
        ("current__feelslike", "INTEGER"),
        ("current__uv_index", "INTEGER"),
        ("current__visibility", "INTEGER"),
        ("current__is_day", "STRING"),
    ]
    schema = [SchemaField(name, field_type, mode="REQUIRED") for name, field_type in columns]

    table = bigquery.Table(table_id, schema=schema)
    try:
        # exists_ok=True makes re-running this cell a no-op instead of an
        # "already exists" error.
        client.create_table(table, exists_ok=True)
    except Exception as e:
        print("Table creation encountered an error: ", e)
        return
    print("Table {}.{}.{} is ready".format(table.project, table.dataset_id, table.table_id))

# Call this function to make the table for weather data
setup_bigquery_table()

Created table data-engineering-416410.weather_data.weather info updated 4


# Process

In [None]:
# Send the rows to a bigquery table, depending on the table id provided. if no id is provided it will default to flight data
def send_to_bigquery(rows_to_insert, table_id="data-engineering-416410.flight_data.BA flight info"):
    """Insert JSON-compatible rows into a BigQuery table and report the outcome.

    Args:
        rows_to_insert: List of dicts, one per row, keyed by column name.
        table_id: Fully qualified `project.dataset.table` id. Defaults to the
            flight data table.

    Returns:
        The list of per-row errors reported by BigQuery; empty when every row
        was accepted or when there was nothing to insert.
    """
    # An empty batch (e.g. a page past the last result) is skipped so that no
    # request without rows is sent to the API.
    if not rows_to_insert:
        print("No rows to insert; skipping.")
        return []

    errors = client.insert_rows_json(table_id, rows_to_insert)  # Make an API request.
    if not errors:
        print("New rows have been added.")
    else:
        print("Encountered errors while inserting rows: {}".format(errors))
    return errors

In [89]:
# Collapse a nested entry into a single-level dict
def flatten(row):
    """Return `row` as a plain dict whose nested keys are joined with "__"."""
    flat_view = flatdict.FlatDict(row, delimiter="__")
    return dict(flat_view)

# Normalise time-related fields so the row can be loaded into Google BigQuery
def fix_datetime(dictionary):
    """Return a copy of a flattened row with its time fields reformatted.

    - The listed departure/arrival timestamps and `location__localtime` lose
      their last six characters when they are not None.
    - `current__weather_icons` and `current__weather_descriptions` are
      replaced with the placeholder 'N/A'.
    - A non-empty `flight_date` gets "T00:00:00" appended.

    The input mapping itself is left unmodified.
    """
    trimmed_fields = {
        "departure__scheduled",
        "departure__estimated",
        "departure__actual",
        "departure__estimated_runway",
        "departure__actual_runway",
        "arrival__scheduled",
        "arrival__estimated",
        "arrival__actual",
        "arrival__estimated_runway",
        "arrival__actual_runway",
        "location__localtime",
    }
    placeholder_fields = {'current__weather_icons', 'current__weather_descriptions'}

    fixed = dictionary.copy()
    for field in dictionary:
        value = dictionary[field]
        if field in trimmed_fields and value is not None:
            fixed[field] = value[:-6]
        if field in placeholder_fields:
            fixed[field] = 'N/A'

    flight_date = fixed.get('flight_date')
    if flight_date:
        fixed['flight_date'] = flight_date + "T00:00:00"
    return fixed

In [None]:
def get_flight_data_and_add_to_bigquery(max_offset=6000):
    """Page through BA flights departing from and arriving at LHR and load them into BigQuery.

    For each offset, the departures page is fetched and inserted first, then
    the arrivals page. Each flight is flattened and passed through
    `fix_datetime` before being sent to the default flight table.

    Args:
        max_offset: Exclusive upper bound for the `offset` query parameter.
            Defaults to 6000, the bound this notebook has always used.

    Paging for a query stops as soon as it returns an empty page, so no
    further requests or empty inserts are made once results run out.
    """
    # Offset increment between requests.
    # assumes the API returns up to 100 flights per page — TODO confirm
    page_step = 100
    route_filters = ("departure_iata=LHR", "arrival_iata=LHR")
    exhausted = set()

    for offset in range(0, max_offset, page_step):
        for route_filter in route_filters:
            if route_filter in exhausted:
                continue
            flight_page = get_from_api(
                flights_url,
                "&airline_iata=BA&" + route_filter + "&offset=" + str(offset),
            )
            rows_to_insert = [fix_datetime(flatten(item)) for item in flight_page['data']]
            if not rows_to_insert:
                # Nothing left for this query; stop requesting further pages.
                exhausted.add(route_filter)
                continue
            send_to_bigquery(rows_to_insert)
        if len(exhausted) == len(route_filters):
            break

get_flight_data_and_add_to_bigquery()

In [92]:
# Get data on weather and then put this data into big query
def get_weather_data_and_add_to_bigquery(query="london", table_id='data-engineering-416410.weather_data.weather info updated 4'):
    """Fetch the current weather for one location and insert it as a single BigQuery row.

    Args:
        query: Value for the API's `query` parameter. Defaults to "london".
        table_id: Fully qualified id of the destination table. Defaults to
            the weather table used throughout this notebook.

    Only today's data is requested, due to a limitation on how far back data
    can be requested. The prepared row is printed before it is sent.
    """
    weather_page = get_from_api(current_weather_url, "&query=" + query, weather_api_key)
    # Flatten and reformat once, then reuse the row for both the printout and the insert.
    weather_row = fix_datetime(flatten(weather_page))
    print(weather_row)
    send_to_bigquery([weather_row], table_id=table_id)

get_weather_data_and_add_to_bigquery()

{'request__type': 'City', 'request__query': 'London, United Kingdom', 'request__language': 'en', 'request__unit': 'm', 'location__name': 'London', 'location__country': 'United Kingdom', 'location__region': 'City of London, Greater London', 'location__lat': '51.517', 'location__lon': '-0.106', 'location__timezone_id': 'Europe/London', 'location__localtime': '2024-05-02', 'location__localtime_epoch': 1714690620, 'location__utc_offset': '1.0', 'current__observation_time': '09:57 PM', 'current__temperature': 11, 'current__weather_code': 296, 'current__weather_icons': 'N/A', 'current__weather_descriptions': 'N/A', 'current__wind_speed': 17, 'current__wind_degree': 260, 'current__wind_dir': 'W', 'current__pressure': 1005, 'current__precip': 0, 'current__humidity': 94, 'current__cloudcover': 100, 'current__feelslike': 9, 'current__uv_index': 1, 'current__visibility': 5, 'current__is_day': 'no'}
New rows have been added.
