In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime
import dateutil
import json
from google.cloud import bigquery
from google.cloud.exceptions import NotFound


city_file = "/Users/drewwhite/Desktop/Epicodus/team-week3/drew-work/cities.json"

with open(city_file) as f:
    cities = json.load(f)

data = []

for city in cities:
    response = requests.get(city['NWS_URL'])
    soup = BeautifulSoup(response.content, 'html.parser')

    location = soup.find('h2', {'class': 'panel-title'})
    lat_lon_elev = soup.find('span', {'class': 'smallTxt'}).text.strip()
    lat, lon, elev = re.findall(r'[-+]?\d*\.\d+|\d+', lat_lon_elev)
    temperature = soup.find('p', {'class': 'myforecast-current-lrg'})
    humidity_elem = soup.find('td', text='Humidity')
    humidity = humidity_elem.find_next('td').text.strip() if humidity_elem else 'NA'
    wind_speed_elem = soup.find('td', text='Wind Speed')
    wind_speed = wind_speed_elem.find_next('td').text.strip() if wind_speed_elem else 'NA'
    barometer_elem = soup.find('td', text='Barometer')
    barometer = barometer_elem.find_next('td').text.strip() if barometer_elem else 'NA'
    dewpoint_elem = soup.find('td', text='Dewpoint')
    dewpoint = dewpoint_elem.find_next('td').text.strip() if dewpoint_elem else 'NA'
    visibility_elem = soup.find('td', text='Visibility')
    visibility = visibility_elem.find_next('td').text.strip() if visibility_elem else 'NA'
    wind_chill_elem = soup.find('td', text='Wind Chill')
    wind_chill = wind_chill_elem.find_next('td').text.strip() if wind_chill_elem else 'NA'
    last_update_elem = soup.find('td', text='Last update')
    last_update = last_update_elem.find_next('td').text.strip() if last_update_elem else 'NA'

    data.append({
        'location': city['Name'],
        'lat': lat,
        'lon': lon,
        'elev_ft': elev,
        'temperature': temperature.text if temperature else 'NA',
        'humidity': humidity,
        'wind_speed': wind_speed,
        'barometer': barometer,
        'dewpoint': dewpoint,
        'vis_miles': visibility,
        'wind_chill': wind_chill,
        'last_update': last_update
    })

df = pd.DataFrame(data)

# Split the 'location' column into separate 'city' and 'state' columns
df[['city', 'state']] = df['location'].str.split(', ', expand=True)

# Convert 'lat' and 'lon' columns to float type
df[['lat', 'lon']] = df[['lat', 'lon']].astype(float)

# Convert 'elev' column to int type
df['elev_ft'] = df['elev_ft'].astype(int)

# Extract the numeric part of the temperature string and convert it to int
df['temp_f'] = df['temperature'].str.extract('(\d+)').astype(int)

# Convert temperature to Celsius and add to new column 'temp_c'
df['temp_c'] = (df['temp_f'] - 32) * 5/9

# Round 'temp_c' to nearest integer and cast to int type
df['temp_c'] = df['temp_c'].round().astype(int)

# Convert 'humidity' column to float type
df['humidity'] = df['humidity'].str.extract('(\d+)', expand=False).astype(float) / 100

# Split wind speed values into components and convert speed to int type
df['wind_speed'] = df['wind_speed'].str.extract('(\d+)', expand=False).fillna(0).astype(int)

# Set any missing or non-numeric wind speed values to 0
df['wind_speed'] = df['wind_speed'].replace('Calm', 0)

# Convert 'barometer' column to float type, and convert inches to millibars
df['barometer'] = df['barometer'].apply(lambda x: float(x.split()[0]) * 33.8639 if 'in' in x and x != 'NA' else None)

# Round 'barometer' to two decimal places
df['barometer'] = df['barometer'].round(2)

# Split 'dewpoint' column into separate 'dewpoint_f' and 'dewpoint_c' columns
df[['dewpoint_f', 'dewpoint_c']] = df['dewpoint'].str.extract('(\d+).*?(\d+)', expand=True).astype(int)

# Convert 'vis_miles' column to float type
df['vis_miles'] = df['vis_miles'].str.extract('(\d+\.\d+|\d+)', expand=False).astype(float).round(2)

# Split 'wind_chill' column into separate 'wind_chill_f' and 'wind_chill_c' columns
df[['wind_chill_f', 'wind_chill_c']] = df['wind_chill'].str.extract('(\d+).*?(\d+)', expand=True).astype(float)

# Convert 'last_update' column to datetime type with the desired format and time zone
df['last_update'] = df['last_update'].apply(lambda x: dateutil.parser.parse(x, tzinfos={'CST': dateutil.tz.tzoffset(None, -21600)}))

# Convert 'last_update' column to UTC
df['last_update'] = df['last_update'].apply(lambda x: x.astimezone(dateutil.tz.tzutc()))

# Drop columns that were split into two values
df = df.drop(['temperature', 'dewpoint', 'wind_chill'], axis=1)



# Set up the BigQuery client
client = bigquery.Client()

# Set the project and dataset IDs
project_id = "deb-dev-dw"
dataset_id = "weather"
table_id = "daily"

# Set the table schema
schema = [
    bigquery.SchemaField("location", "STRING"),
    bigquery.SchemaField("lat", "FLOAT"),
    bigquery.SchemaField("lon", "FLOAT"),
    bigquery.SchemaField("elev_ft", "INTEGER"),
    bigquery.SchemaField("humidity", "FLOAT"),
    bigquery.SchemaField("wind_speed", "INTEGER"),
    bigquery.SchemaField("barometer", "FLOAT"),
    bigquery.SchemaField("vis_miles", "FLOAT"),
    bigquery.SchemaField("dewpoint_f", "INTEGER"),
    bigquery.SchemaField("dewpoint_c", "INTEGER"),
    bigquery.SchemaField("wind_chill_f", "FLOAT"),
    bigquery.SchemaField("wind_chill_c", "FLOAT"),
    bigquery.SchemaField("city", "STRING"),
    bigquery.SchemaField("state", "STRING"),
    bigquery.SchemaField("temp_f", "INTEGER"),
    bigquery.SchemaField("temp_c", "INTEGER"),
    bigquery.SchemaField("last_update", "TIMESTAMP"),
]

# Check if the dataset exists, and create it if it does not
try:
    dataset_ref = client.dataset(dataset_id)
    dataset = client.get_dataset(dataset_ref)
except NotFound:
    dataset_ref = client.dataset(dataset_id)
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = "US"
    dataset = client.create_dataset(dataset)

# Get a reference to the table
table_ref = dataset.table(table_id)

# Create the table if it doesn't exist
try:
    client.get_table(table_ref)
except NotFound:
    table = bigquery.Table(table_ref, schema=schema)
    table = client.create_table(table)

# Write the DataFrame to BigQuery
job_config = bigquery.LoadJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_APPEND)
job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
job.result()



LoadJob<project=deb-dev-dw, location=US, id=574ce686-b5e6-4c93-a235-271d38c75e33>

In [10]:
city_file = "/Users/drewwhite/Desktop/Epicodus/team-week3/drew-work/cities.json"

def scrape_weather_data(city_file):
    with open(city_file) as f:
        cities = json.load(f)

    data = []

    for city in cities:
        response = requests.get(city['NWS_URL'])
        soup = BeautifulSoup(response.content, 'html.parser')

        location = soup.find('h2', {'class': 'panel-title'})
        lat_lon_elev = soup.find('span', {'class': 'smallTxt'}).text.strip()
        lat, lon, elev = re.findall(r'[-+]?\d*\.\d+|\d+', lat_lon_elev)
        temperature = soup.find('p', {'class': 'myforecast-current-lrg'})
        humidity_elem = soup.find('td', text='Humidity')
        humidity = humidity_elem.find_next('td').text.strip() if humidity_elem else 'NA'
        wind_speed_elem = soup.find('td', text='Wind Speed')
        wind_speed = wind_speed_elem.find_next('td').text.strip() if wind_speed_elem else 'NA'
        barometer_elem = soup.find('td', text='Barometer')
        barometer = barometer_elem.find_next('td').text.strip() if barometer_elem else 'NA'
        dewpoint_elem = soup.find('td', text='Dewpoint')
        dewpoint = dewpoint_elem.find_next('td').text.strip() if dewpoint_elem else 'NA'
        visibility_elem = soup.find('td', text='Visibility')
        visibility = visibility_elem.find_next('td').text.strip() if visibility_elem else 'NA'
        wind_chill_elem = soup.find('td', text='Wind Chill')
        wind_chill = wind_chill_elem.find_next('td').text.strip() if wind_chill_elem else 'NA'
        last_update_elem = soup.find('td', text='Last update')
        last_update = last_update_elem.find_next('td').text.strip() if last_update_elem else 'NA'

        data.append({
            'location': city['Name'],
            'lat': lat,
            'lon': lon,
            'elev_ft': elev,
            'temperature': temperature.text if temperature else 'NA',
            'humidity': humidity,
            'wind_speed': wind_speed,
            'barometer': barometer,
            'dewpoint': dewpoint,
            'vis_miles': visibility,
            'wind_chill': wind_chill,
            'last_update': last_update
        })

    df = pd.DataFrame(data)

    return df

scrape_weather_data(city_file)


  app.launch_new_instance()


Unnamed: 0,location,lat,lon,elev_ft,temperature,humidity,wind_speed,barometer,dewpoint,vis_miles,wind_chill,last_update
0,"Portland, OR",45.59578,122.60917,20,41°F,73%,N 4 MPH,30.2 in (1022.69 mb),33°F (1°C),10.00 mi,38°F (3°C),14 Feb 01:53 PM PST
1,"San Diego, CA",32.73361,117.18306,13,56°F,61%,WSW 18 MPH,29.93 in (1013.55 mb),43°F (6°C),10.00 mi,,14 Feb 01:51 PM PST
2,"Duluth, MN",46.72,92.04,607,34°F,100%,NE 22 mph,29.37 in,34°F (1°C),1.75 mi,22°F (-6°C),14 Feb 4:15 pm CST
3,"Minneapolis, MN",44.88,93.23,840,38°F,89%,E 14 mph,29.24 in (990.8 mb),35°F (2°C),4.00 mi,30°F (-1°C),14 Feb 3:53 pm CST
4,"Salt Lake City, UT",40.77069,111.96503,4226,32°F,58%,N 11 MPH,29.61 in (1002.71 mb),19°F (-7°C),9.00 mi,23°F (-5°C),14 Feb 02:54 PM MST
5,"Denver, CO",39.71,104.76,5577,36°F,49%,E 17 mph,29.35 in (993.6 mb),19°F (-7°C),10.00 mi,26°F (-3°C),14 Feb 2:58 pm MST
6,"San Francisco, CA",37.77056,122.42694,150,47°F,68%,NA NA MPH,,37°F (3°C),,,14 Feb 01:43 PM PST
7,"New York City, NY",40.78,73.97,154,53°F,25%,W 6 mph,30.13 in (1019.5 mb),18°F (-8°C),10.00 mi,51°F (11°C),14 Feb 4:51 pm EST
8,"Portland, ME",43.64,70.3,72,41°F,55%,N 7 mph,30.07 in (1018.3 mb),26°F (-3°C),10.00 mi,36°F (2°C),14 Feb 4:51 pm EST
9,"Seattle, WA",47.54548,122.3147,20,43°F,60%,N 5 MPH,30.21 in (1023.03 mb),30°F (-1°C),10.00 mi,40°F (4°C),14 Feb 01:55 PM PST


In [13]:
def transform_weather_data(df):
    transformations = [
        (['city', 'state'], df['location'].str.split(', ', expand=True)),
        (['lat', 'lon'], df[['lat', 'lon']].astype(float)),
        ('elev_ft', df['elev_ft'].astype(int)),
        ('temp_f', df['temperature'].str.extract('(\d+)').astype(int)),
        ('temp_c', (df['temp_f'] - 32) * 5/9),
        ('temp_c', df['temp_c'].round().astype(int)),
        ('humidity', df['humidity'].str.extract('(\d+)', expand=False).astype(float) / 100),
        ('wind_speed', df['wind_speed'].str.extract('(\d+)', expand=False).fillna(0).astype(int)),
        ('wind_speed', df['wind_speed'].replace('Calm', 0)),
        ('barometer', df['barometer'].apply(lambda x: float(x.split()[0]) * 33.8639 if 'in' in x and x != 'NA' else None)),
        ('barometer', df['barometer'].round(2)),
        (['dewpoint_f', 'dewpoint_c'], df['dewpoint'].str.extract('(\d+).*?(\d+)', expand=True).astype(int)),
        ('vis_miles', df['vis_miles'].str.extract('(\d+\.\d+|\d+)', expand=False).astype(float).round(2)),
        (['wind_chill_f', 'wind_chill_c'], df['wind_chill'].str.extract('(\d+).*?(\d+)', expand=True).astype(float)),
        ('last_update', df['last_update'].apply(lambda x: dateutil.parser.parse(x, tzinfos={'CST': dateutil.tz.tzoffset(None, -21600)}))),
        ('last_update', df['last_update'].apply(lambda x: x.astimezone(dateutil.tz.tzutc()))),
    ]
    for col, transform in transformations:
        if isinstance(col, list):
            df[col] = transform
        else:
            df[col] = transform

    # Drop columns that were split into two values
    df = df.drop(['temperature', 'dewpoint', 'wind_chill'], axis=1)

    return df
df

Unnamed: 0,location,lat,lon,elev_ft,humidity,wind_speed,barometer,vis_miles,last_update,city,state,temp_f,temp_c,dewpoint_f,dewpoint_c,wind_chill_f,wind_chill_c
0,"Portland, OR",45.59578,122.60917,20,0.73,4,1022.69,10.0,2023-02-14 21:53:00+00:00,Portland,OR,41,5,33,1,38.0,3.0
1,"San Diego, CA",32.73361,117.18306,13,0.61,18,1013.55,10.0,2023-02-14 21:51:00+00:00,San Diego,CA,56,13,43,6,,
2,"Duluth, MN",46.72,92.04,607,1.0,22,994.58,1.75,2023-02-14 22:15:00+00:00,Duluth,MN,34,1,34,1,22.0,6.0
3,"Minneapolis, MN",44.88,93.23,840,0.89,14,990.18,4.0,2023-02-14 21:53:00+00:00,Minneapolis,MN,38,3,35,2,30.0,1.0
4,"Salt Lake City, UT",40.77069,111.96503,4226,0.58,11,1002.71,9.0,2023-02-14 22:54:00+00:00,Salt Lake City,UT,32,0,19,7,23.0,5.0
5,"Denver, CO",39.71,104.76,5577,0.49,17,993.91,10.0,2023-02-14 22:58:00+00:00,Denver,CO,36,2,19,7,26.0,3.0
6,"San Francisco, CA",37.77056,122.42694,150,0.68,0,,,2023-02-14 21:43:00+00:00,San Francisco,CA,47,8,37,3,,
7,"New York City, NY",40.78,73.97,154,0.25,6,1020.32,10.0,2023-02-15 00:51:00+00:00,New York City,NY,53,12,18,8,51.0,11.0
8,"Portland, ME",43.64,70.3,72,0.55,7,1018.29,10.0,2023-02-15 00:51:00+00:00,Portland,ME,41,5,26,3,36.0,2.0
9,"Seattle, WA",47.54548,122.3147,20,0.62,6,1023.37,10.0,2023-02-14 21:53:00+00:00,Seattle,WA,42,6,30,1,38.0,3.0


In [17]:
def write_weather_data_to_bigquery(df):


    client = bigquery.Client()
    project_id = "deb-dev-dw"
    dataset_id = "weather"
    table_id = "daily"

    schema = [
        bigquery.SchemaField("location", "STRING"),
        bigquery.SchemaField("lat", "FLOAT"),
        bigquery.SchemaField("lon", "FLOAT"),
        bigquery.SchemaField("elev_ft", "INTEGER"),
        bigquery.SchemaField("humidity", "FLOAT"),
        bigquery.SchemaField("wind_speed", "INTEGER"),
        bigquery.SchemaField("barometer", "FLOAT"),
        bigquery.SchemaField("vis_miles", "FLOAT"),
        bigquery.SchemaField("dewpoint_f", "INTEGER"),
        bigquery.SchemaField("dewpoint_c", "INTEGER"),
        bigquery.SchemaField("wind_chill_f", "FLOAT"),
        bigquery.SchemaField("wind_chill_c", "FLOAT"),
        bigquery.SchemaField("city", "STRING"),
        bigquery.SchemaField("state", "STRING"),
        bigquery.SchemaField("temp_f", "INTEGER"),
        bigquery.SchemaField("temp_c", "INTEGER"),
        bigquery.SchemaField("last_update", "TIMESTAMP"),
    ]

    try:
        dataset_ref = client.dataset(dataset_id)
        dataset = client.get_dataset(dataset_ref)
    except NotFound:
        dataset_ref = client.dataset(dataset_id)
        dataset = bigquery.Dataset(dataset_ref)
        dataset.location = "US"
        dataset = client.create_dataset(dataset)

    table_ref = dataset.table(table_id)

    try:
        client.get_table(table_ref)
    except NotFound:
        table = bigquery.Table(table_ref, schema=schema)
        table = client.create_table(table)

    job_config = bigquery.LoadJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_APPEND)
    job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
    job.result()

write_weather_data_to_bigquery(df)

In [None]:
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime
import dateutil
import json
from google.cloud.exceptions import NotFound
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.empty import EmptyOperator
from google.cloud import bigquery

city_file = "/Users/drewwhite/Desktop/Epicodus/team-week3/drew-work/cities.json"

def scrape_weather_data(city_file):
    with open(city_file) as f:
        cities = json.load(f)

    data = []

    for city in cities:
        response = requests.get(city['NWS_URL'])
        soup = BeautifulSoup(response.content, 'html.parser')

        location = soup.find('h2', {'class': 'panel-title'})
        lat_lon_elev = soup.find('span', {'class': 'smallTxt'}).text.strip()
        lat, lon, elev = re.findall(r'[-+]?\d*\.\d+|\d+', lat_lon_elev)
        temperature = soup.find('p', {'class': 'myforecast-current-lrg'})
        humidity_elem = soup.find('td', text='Humidity')
        humidity = humidity_elem.find_next('td').text.strip() if humidity_elem else 'NA'
        wind_speed_elem = soup.find('td', text='Wind Speed')
        wind_speed = wind_speed_elem.find_next('td').text.strip() if wind_speed_elem else 'NA'
        barometer_elem = soup.find('td', text='Barometer')
        barometer = barometer_elem.find_next('td').text.strip() if barometer_elem else 'NA'
        dewpoint_elem = soup.find('td', text='Dewpoint')
        dewpoint = dewpoint_elem.find_next('td').text.strip() if dewpoint_elem else 'NA'
        visibility_elem = soup.find('td', text='Visibility')
        visibility = visibility_elem.find_next('td').text.strip() if visibility_elem else 'NA'
        wind_chill_elem = soup.find('td', text='Wind Chill')
        wind_chill = wind_chill_elem.find_next('td').text.strip() if wind_chill_elem else 'NA'
        last_update_elem = soup.find('td', text='Last update')
        last_update = last_update_elem.find_next('td').text.strip() if last_update_elem else 'NA'

        data.append({
            'location': city['Name'],
            'lat': lat,
            'lon': lon,
            'elev_ft': elev,
            'temperature': temperature.text if temperature else 'NA',
            'humidity': humidity,
            'wind_speed': wind_speed,
            'barometer': barometer,
            'dewpoint': dewpoint,
            'vis_miles': visibility,
            'wind_chill': wind_chill,
            'last_update': last_update
        })

    df = pd.DataFrame(data)

    return df

def transform_weather_data(df):
    transformations = [
        (['city', 'state'], df['location'].str.split(', ', expand=True)),
        (['lat', 'lon'], df[['lat', 'lon']].astype(float)),
        ('elev_ft', df['elev_ft'].astype(int)),
        ('temp_f', df['temperature'].str.extract('(\d+)').astype(int)),
        ('temp_c', (df['temp_f'] - 32) * 5/9),
        ('temp_c', df['temp_c'].round().astype(int)),
        ('humidity', df['humidity'].str.extract('(\d+)', expand=False).astype(float) / 100),
        ('wind_speed', df['wind_speed'].str.extract('(\d+)', expand=False).fillna(0).astype(int)),
        ('wind_speed', df['wind_speed'].replace('Calm', 0)),
        ('barometer', df['barometer'].apply(lambda x: float(x.split()[0]) * 33.8639 if 'in' in x and x != 'NA' else None)),
        ('barometer', df['barometer'].round(2)),
        (['dewpoint_f', 'dewpoint_c'], df['dewpoint'].str.extract('(\d+).*?(\d+)', expand=True).astype(int)),
        ('vis_miles', df['vis_miles'].str.extract('(\d+\.\d+|\d+)', expand=False).astype(float).round(2)),
        (['wind_chill_f', 'wind_chill_c'], df['wind_chill'].str.extract('(\d+).*?(\d+)', expand=True).astype(float)),
        ('last_update', df['last_update'].apply(lambda x: dateutil.parser.parse(x, tzinfos={'CST': dateutil.tz.tzoffset(None, -21600)}))),
        ('last_update', df['last_update'].apply(lambda x: x.astimezone(dateutil.tz.tzutc()))),
    ]
    for col, transform in transformations:
        if isinstance(col, list):
            df[col] = transform
        else:
            df[col] = transform

    # Drop columns that were split into two values
    df = df.drop(['temperature', 'dewpoint', 'wind_chill'], axis=1)

    return df

def write_weather_data_to_bigquery():

    client = bigquery.Client()
    project_id = "deb-dev-dw"
    dataset_id = "weather"
    table_id = "daily"

    schema = [
        bigquery.SchemaField("location", "STRING"),
        bigquery.SchemaField("lat", "FLOAT"),
        bigquery.SchemaField("lon", "FLOAT"),
        bigquery.SchemaField("elev_ft", "INTEGER"),
        bigquery.SchemaField("humidity", "FLOAT"),
        bigquery.SchemaField("wind_speed", "INTEGER"),
        bigquery.SchemaField("barometer", "FLOAT"),
        bigquery.SchemaField("vis_miles", "FLOAT"),
        bigquery.SchemaField("dewpoint_f", "INTEGER"),
        bigquery.SchemaField("dewpoint_c", "INTEGER"),
        bigquery.SchemaField("wind_chill_f", "FLOAT"),
        bigquery.SchemaField("wind_chill_c", "FLOAT"),
        bigquery.SchemaField("city", "STRING"),
        bigquery.SchemaField("state", "STRING"),
        bigquery.SchemaField("temp_f", "INTEGER"),
        bigquery.SchemaField("temp_c", "INTEGER"),
        bigquery.SchemaField("last_update", "TIMESTAMP"),
    ]

    try:
        dataset_ref = client.dataset(dataset_id)
        dataset = client.get_dataset(dataset_ref)
    except NotFound:
        dataset_ref = client.dataset(dataset_id)
        dataset = bigquery.Dataset(dataset_ref)
        dataset.location = "US"
        dataset = client.create_dataset(dataset)

    table_ref = dataset.table(table_id)

    try:
        client.get_table(table_ref)
    except NotFound:
        table = bigquery.Table(table_ref, schema=schema)
        table = client.create_table(table)

    job_config = bigquery.LoadJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_APPEND)
    job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
    job.result()

default_args = {
    'owner': 'Drew White',
    'depends_on_past': False,
    'start_date': datetime(2023, 2, 14),
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

with DAG(
    'weather_data_pipeline',
    description='Scrapes National Weather Service website every 12 hours, transforms data and loads to bigquery'
    default_args=default_args,
    schedule_interval='0 0,12 * * *',
) as dag:

scrape_weather_data_task = PythonOperator(
    task_id='scrape_weather_data',
    python_callable=scrape_weather_data
)

transform_weather_data_task = PythonOperator(
    task_id='transform_weather_data',
    python_callable=transform_weather_data
)

write_weather_data_to_bigquery_task = PythonOperator(
    task_id='write_weather_data_to_bigquery',
    python_callable=write_weather_data_to_bigquery
)

done = EmptyOperator(task_id='done')

scrape_weather_data_task >> transform_weather_data_task >> write_weather_data_to_bigquery_task >> done

In [16]:
PROJECT_ID = "deb-dev-dw"
DATASET_ID = "weather"
DAILY_TABLE_ID = "daily"
WEEKLY_TABLE_ID = "weekly_avg"


def calculate_weekly_averages():
    client = bigquery.Client()
    query = f"""
    SELECT location, city, state, lat, lon,
        ROUND(AVG(temp_f), 1) AS temp_f_avg,
        ROUND(AVG(temp_c), 1) AS temp_c_avg,
        ROUND(AVG(humidity), 1) AS humidity_avg,
        ROUND(AVG(barometer), 1) AS barometer_avg,
        ROUND(AVG(dewpoint_f), 1) AS dewpoint_f_avg,
        ROUND(AVG(dewpoint_c), 1) AS dewpoint_c_avg,
        CURRENT_TIMESTAMP() AS modified_at
    FROM `{PROJECT_ID}.{DATASET_ID}.{DAILY_TABLE_ID}`
    WHERE DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY) <= DATE(last_update)
    GROUP BY location, city, state, lat, lon
"""
    weekly_df = client.query(query).to_dataframe()
    return weekly_df

print(calculate_weekly_averages())


              location            city state       lat        lon  temp_f_avg  \
0   Salt Lake City, UT  Salt Lake City    UT  40.77069  111.96503        32.3   
1           Duluth, MN          Duluth    MN  46.72000   92.04000        34.0   
2           Denver, CO          Denver    CO  39.71000  104.76000        36.7   
3         Portland, ME        Portland    ME  43.64000   70.30000        40.3   
4      Minneapolis, MN     Minneapolis    MN  44.88000   93.23000        37.7   
5         Portland, OR        Portland    OR  45.59578  122.60917        42.3   
6          Seattle, WA         Seattle    WA  47.54548  122.31470        42.5   
7    San Francisco, CA   San Francisco    CA  37.77056  122.42694        48.0   
8        Baltimore, MD       Baltimore    MD  39.28000   76.62000        52.3   
9    New York City, NY   New York City    NY  40.78000   73.97000        52.7   
10       San Diego, CA       San Diego    CA  32.73361  117.18306        55.7   

    temp_c_avg  humidity_av

In [15]:

def write_weekly_averages_to_bigquery(df):
    client = bigquery.Client()
    project_id = "deb-dev-dw"
    dataset_id = "weather"
    table_id = "weekly_avg"

    schema = [
        bigquery.SchemaField("location", "STRING"),
        bigquery.SchemaField("city", "STRING"),
        bigquery.SchemaField("state", "STRING"),
        bigquery.SchemaField("lat", "FLOAT"),
        bigquery.SchemaField("lon", "FLOAT"),
        bigquery.SchemaField("temp_f_avg", "FLOAT"),
        bigquery.SchemaField("temp_c_avg", "FLOAT"),
        bigquery.SchemaField("humidity_avg", "FLOAT"),
        bigquery.SchemaField("barometer_avg", "FLOAT"),
        bigquery.SchemaField("dewpoint_f_avg", "FLOAT"),
        bigquery.SchemaField("dewpoint_c_avg", "FLOAT"),
        bigquery.SchemaField("modified_at", "TIMESTAMP")
    ]

    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)

    try:
        client.get_table(table_ref)
    except NotFound:
        table = bigquery.Table(table_ref, schema=schema)
        table = client.create_table(table)

    job_config = bigquery.LoadJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_APPEND)
    job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
    job.result()

write_weekly_averages_to_bigquery(calculate_weekly_averages())


In [26]:
import datetime as dt
from google.cloud import bigquery
from airflow import DAG
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
from airflow.decorators import dag, task
from airflow.operators.python_operator import EmptyOperator

PROJECT_ID = "deb-dev-dw"
DATASET_ID = "weather"
DAILY_TABLE_ID = "daily"
WEEKLY_TABLE_ID = "weekly_avg"

SCHEMA = [
    bigquery.SchemaField("location", "STRING"),
    bigquery.SchemaField("city", "STRING"),
    bigquery.SchemaField("state", "STRING"),
    bigquery.SchemaField("lat", "FLOAT"),
    bigquery.SchemaField("lon", "FLOAT"),
    bigquery.SchemaField("temp_f_avg", "FLOAT"),
    bigquery.SchemaField("temp_c_avg", "FLOAT"),
    bigquery.SchemaField("humidity_avg", "FLOAT"),
    bigquery.SchemaField("barometer_avg", "FLOAT"),
    bigquery.SchemaField("dewpoint_f_avg", "FLOAT"),
    bigquery.SchemaField("dewpoint_c_avg", "FLOAT"),
    bigquery.SchemaField("modified_at", "TIMESTAMP")
]

@task
def calculate_weekly_averages():
    client = bigquery.Client()
    query = f"""
    SELECT location, city, state, lat, lon,
        ROUND(AVG(temp_f), 1) AS temp_f_avg,
        ROUND(AVG(temp_c), 1) AS temp_c_avg,
        ROUND(AVG(humidity), 1) AS humidity_avg,
        ROUND(AVG(barometer), 1) AS barometer_avg,
        ROUND(AVG(dewpoint_f), 1) AS dewpoint_f_avg,
        ROUND(AVG(dewpoint_c), 1) AS dewpoint_c_avg,
        CURRENT_TIMESTAMP() AS modified_at
    FROM `{PROJECT_ID}.{DATASET_ID}.{DAILY_TABLE_ID}`
    WHERE DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY) <= DATE(last_update)
    GROUP BY location, city, state, lat, lon
"""
    weekly_df = client.query(query).to_dataframe()
    return weekly_df

@task
def write_weekly_avg_to_bq(df, **context):
    client = bigquery.Client()

    dataset_ref = client.dataset(DATASET_ID)
    table_ref = dataset_ref.table(WEEKLY_TABLE_ID)

    try:
        client.get_table(table_ref)
    except NotFound:
        table = bigquery.Table(table_ref, schema=SCHEMA)
        table = client.create_table(table)

    job_config = bigquery.LoadJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_APPEND)
    job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
    job.result()
    # Create the BigQuery insert job operator
    insert_job = BigQueryInsertJobOperator(
        task_id='insert_weekly_avg_to_bq',
        project_id=PROJECT_ID,
        dataset_id=DATASET_ID,
        table_id=WEEKLY_TABLE_ID,
        schema_fields=SCHEMA,
        bigquery_conn_id='google_cloud_default',
        time_partitioning={'type': 'DAY'},
        use_legacy_sql=False,
        template_suffix=f"_{dt.datetime.now().strftime('%Y%m%d_%H%M%S')}",
        task_concurrency=1,
        dag=dag
    )

    # Execute the BigQuery insert job operator
    insert_job.execute(context={
        'ti': context['ti'],
        'weekly_averages_df': df
    })

@dag(
    schedule_interval="@weekly",
    description='Calculates weekly averages of weather data in bigquery',
    start_date=datetime.utcnow(),
    catchup=False,
    default_view='graph',
    is_paused_upon_creation=True,
    tags=['averages', 'weekly averages'],
)

def weekly_avg():

    calculate_weekly_avg_task = calculate_weekly_averages()

    write_weekly_avg_task = write_weekly_avg_to_bq(calculate_weekly_averages)

    insert_weekly_avg_to_bq_task = BigQueryInsertJobOperator(
        task_id='insert_weekly_avg_to_bq',
        project_id=PROJECT_ID,
        dataset_id=DATASET_ID,
        table_id=WEEKLY_TABLE_ID,
        schema_fields=SCHEMA,
        bigquery_conn_id='google_cloud_default',
        time_partitioning={'type': 'DAY'},
        use_legacy_sql=False,
        template_suffix=f"_{dt.datetime.now().strftime('%Y%m%d_%H%M%S')}",
        task_concurrency=1,
        dag=dag
    )

    done = EmptyOperator(task_id='done')

    calculate_weekly_avg_task >> write_weekly_avg_task >> insert_weekly_avg_to_bq_task >> done


ModuleNotFoundError: No module named 'airflow'