**PreRequisites:**
1. Install the following in your dev environment:<br>
    a. google-cloud-bigquery: pip.exe install google-cloud-bigquery<br>
    b. db-types: pip install db-dtypes<br>
2. Install gcloud CLI <br>
    a. Install directions (with download link): https://cloud.google.com/sdk/docs/install<br>
    > i. pay attention to where it installs!<br>
    > ii. It says to leave all the shortcut, open terminal options checked. I received errors when it ran "gcloud info --run-diagnostics" and I ignored them for now...<br>
    
    b. Add this to your PATH environmental variables (for me this was C:\Users\vt_be\AppData\Local\Google\Cloud SDK\google-cloud-sdk)<br>
    c. reboot!<br>
    d. open git bash, switch to dev environment<br>
    > i. "gcloud info --run-diagnostics" now ran without issue<br>
    ii. add authentication (this opens browser to connect your google account):  gcloud auth application-default login<br>
    
    e. I also needed to set up a Big Query Project: mostly followed https://cloud.google.com/bigquery/docs/sandbox<br>
    > i. I didn't see the stuff mentioned in #3 but otherwise worked<br>
    > ii. Note that when you create the project, an id is generated that is project name - #### (for me BootCamp-Weather:  bootcamp-weather-400118<br>
    
    f. Add the project to default - back to gitbash: gcloud auth application-default set-quota-project <project-id><br>
    g. In the downloaded notebook, add the project id to the client = bigquery.Client("project-id") in the first cell<br>
    

**Credit:**
* Big Query calls adapted from https://www.kaggle.com/code/crained/noaa-dataset-with-google-bigquery
* SQL calls adapted from GitHub BigQuery documentation: https://github.com/googleapis/python-bigquery

In [1]:
# My project name (don't think can be shared across people) is stored in a config.py file as "google_project"
# Since this is unique to user, I added config.py to the gitignore. You must create your own config.py file with project name
from config import google_project
# bigquery and pandas work well together for dataframes!
import pandas as pd
import os
# Follow the prerequisite instructions to get bigquery going
from google.cloud import bigquery
# Create a "Client" object reference a google project for which your system has been authenticated
client = bigquery.Client(google_project)

## Query for the Stations static info ##

In [2]:
# Station IDs (usaf) remained unique over time, but names changed. 
# This query will sort by begin date descending so we can use the remove duplicates function to keep the latest name

QUERY = (
    'SELECT usaf, name, state, lat, lon '
    'FROM `bigquery-public-data.noaa_gsod.stations` '
    'WHERE country = "US" AND state <> "None" '
    'ORDER BY begin DESC'
    )
# API request
stations_result = client.query(QUERY)  

# Waits for query to finish
stations_data = stations_result.result()  

# Put the last query into a dataframe
stations_data_df = stations_data.to_dataframe()

# Remove duplicate stations by usaf
stations_data_df = stations_data_df.drop_duplicates("usaf")

# and export
stations_data_df.to_json("GEOJSON_data/Stations.json", orient="records")
stations_data_df.to_csv("GEOJSON_data/Stations.csv")

print(len(stations_data_df))
stations_data_df.head()

3790


Unnamed: 0,usaf,name,state,lat,lon
0,720511,BRITTON MUNI,SD,45.815,-97.743
1,723062,PINEY ISLAND,NC,35.02,-76.46
2,A00030,CONNELLSVILLE AIRPORT,PA,39.959,-79.657
3,720844,SPANISH PEAKS,CO,37.697,-104.785
4,724916,MARINA MUNI,CA,36.682,-121.762


## Query for the statistics data ##
### Query for temperature stats (similar filtering) ###

In [3]:
aggregate_query = (
    'SELECT s.usaf, '
    'MIN(g.min) AS min_temp, '
    'AVG(g.temp) AS mean_temp, '
    'MAX(g.max) AS max_temp, '
    'FROM `bigquery-public-data.noaa_gsod.gsod2022` AS g '
    'INNER JOIN `bigquery-public-data.noaa_gsod.stations` AS s ON g.stn = s.usaf '
    'WHERE s.country = "US" AND s.state <> "None" '
    # This line below removes the 'not a reading' so we can run stats on those columns
    'AND g.min <> 9999.9 AND g.max <> 9999.9 '
    'GROUP BY s.usaf '
    )

# API request
station_temp_result = client.query(aggregate_query)  

# Waits for query to finish
station_temp_data = station_temp_result.result()  

# Put the last query into a dataframe
station_temp_df = station_temp_result.to_dataframe()

# # and export
# state_temp_station.to_json("GEOJSON_data/Station_temp_sample.json", orient="records")
# state_temp_station.to_csv("GEOJSON_data/Station_temp_sample.csv")

print(len(station_temp_df))
station_temp_df

2522


Unnamed: 0,usaf,min_temp,mean_temp,max_temp
0,701043,-27.9,21.075649,54.0
1,702040,-5.1,30.590909,55.9
2,702490,-27.4,32.636119,77.0
3,702606,-7.6,24.352830,55.4
4,703406,-22.0,42.212132,82.4
...,...,...,...,...
2517,A07053,17.6,34.047619,57.2
2518,720641,12.2,38.080000,62.6
2519,997742,46.9,65.450000,81.3
2520,997177,40.1,46.860000,54.9


### Query for total snow in a year ###

In [4]:
# Perform a query 
aggregate_query = (
    'SELECT s.usaf, '
    'SUM(g.sndp) AS total_snow '
    'FROM `bigquery-public-data.noaa_gsod.gsod2022` AS g '
    'INNER JOIN `bigquery-public-data.noaa_gsod.stations` AS s ON g.stn = s.usaf '
    'WHERE s.country = "US" AND s.state <> "None" '
    'AND g.sndp <> 999.9 '
    'GROUP BY s.usaf '
    'ORDER BY total_snow DESC')
station_snow_result = client.query(aggregate_query)  # API request
station_snow_data = station_snow_result.result()  # Waits for query to finish

# Put the last query into a dataframe
station_snow_df = station_snow_result.to_dataframe()

station_snow_df

Unnamed: 0,usaf,total_snow
0,702606,14721.2
1,702650,8622.6
2,702490,7887.4
3,702615,6871.8
4,701740,5199.6
...,...,...
276,726980,1.2
277,722630,1.2
278,723320,1.2
279,722680,1.2


### Query for total precipitation in a year ###

In [5]:
# Perform a query 
aggregate_query = (
    'SELECT s.usaf, '
    'SUM(g.prcp) AS total_precipitation '
    'FROM `bigquery-public-data.noaa_gsod.gsod2022` AS g '
    'INNER JOIN `bigquery-public-data.noaa_gsod.stations` AS s ON g.stn = s.usaf '
    'WHERE s.country = "US" AND s.state <> "None" '
    'AND g.prcp <> 99.9 '
    'GROUP BY s.usaf')
station_prcp_result = client.query(aggregate_query)  # API request
station_prcp_data = station_prcp_result.result()  # Waits for query to finish

# Put the last query into a dataframe
station_prcp_df = station_prcp_result.to_dataframe()

station_prcp_df

Unnamed: 0,usaf,total_precipitation
0,700860,12400.22
1,702040,8000.06
2,702223,1229.90
3,702490,10613.66
4,702606,8199.18
...,...,...
2518,998166,0.00
2519,720641,0.00
2520,702084,199.98
2521,701190,99.99


### Query for count of days with tornadoes in a year ###
SchemaField('tornado_funnel_cloud', 'STRING', 'NULLABLE', None, 'Indicators (1 = yes, 0 = no/not reported) for the occurrence during the day', (), None)

In [6]:
# Perform a query 
# Include month and day so can remove duplicate "yes" on the same day
QUERY = (
    'SELECT s.usaf, g.mo, g.da, '
    'g.tornado_funnel_cloud '
    'FROM `bigquery-public-data.noaa_gsod.gsod2022` AS g '
    'INNER JOIN `bigquery-public-data.noaa_gsod.stations` AS s ON g.stn = s.usaf '
    'WHERE s.country = "US" AND s.state <> "None" '
    'AND tornado_funnel_cloud = "1" '
    )

station_tornado_result = client.query(QUERY)  # API request
station_tornado_data = station_tornado_result.result()  # Waits for query to finish

# Put the last query into a dataframe
station_tornado_df = station_tornado_result.to_dataframe()

# Remove duplicate station day reports
station_tornado_nodup = station_tornado_df.drop_duplicates()
# Drop month and day
station_tornado_nodup = station_tornado_nodup[["usaf", "tornado_funnel_cloud"]]
# Count how many in the year for each station
station_tornado_count = station_tornado_nodup.groupby("usaf").count()

print(len(station_tornado_count))
station_tornado_count.head()

33


Unnamed: 0_level_0,tornado_funnel_cloud
usaf,Unnamed: 1_level_1
720381,1
722010,4
722015,8
722020,2
722030,1


### Query for count of days with hail in a year ###
SchemaField('hail', 'STRING', 'NULLABLE', None, 'Indicators (1 = yes, 0 = no/not reported) for the occurrence during the day', (), None)

In [7]:
# Perform a query 
# Include month and day so can remove duplicate "yes" on the same day
QUERY = (
    'SELECT s.usaf, g.mo, g.da, '
    'g.hail '
    'FROM `bigquery-public-data.noaa_gsod.gsod2022` AS g '
    'INNER JOIN `bigquery-public-data.noaa_gsod.stations` AS s ON g.stn = s.usaf '
    'WHERE s.country = "US" AND s.state <> "None" '
    'AND g.hail = "1" '
    )

station_hail_result = client.query(QUERY)  # API request
station_hail_data = station_hail_result.result()  # Waits for query to finish

# Put the last query into a dataframe
station_hail_df = station_hail_result.to_dataframe()

# Remove duplicate station day reports
station_hail_nodup = station_hail_df.drop_duplicates()
# Drop month and day
station_hail_nodup = station_hail_nodup[["usaf", "hail"]]
# Count how many in the year for each station
station_hail_count = station_hail_nodup.groupby("usaf").count()

print(len(station_hail_count))
station_hail_count.head()

113


Unnamed: 0_level_0,hail
usaf,Unnamed: 1_level_1
702650,2
702730,1
702910,1
704140,12
720619,1


## Merge all the data frames together ##

In [11]:
station_temp_merge = pd.merge(stations_data_df, station_temp_df, how ="inner", on = "usaf")

st_temp_snow_merge = pd.merge(station_temp_merge, station_snow_df, how ="outer", on = "usaf")

st_temp_snow_prcp = pd.merge(st_temp_snow_merge, station_prcp_df, how ="outer", on = "usaf")

station_stats_torn = pd.merge(st_temp_snow_prcp, station_tornado_count, how ="outer", on = "usaf")

station_all = pd.merge(station_stats_torn, station_hail_count, how ="outer", on = "usaf")

# Looking at the csv file, the total snow = 0 brought in a 0 where temperatures and hail are NaN
# Removing those rows
station_all = station_all.dropna(subset=["min_temp", "mean_temp", "max_temp"])

# Convert the remaining NaN to zero
station_all = station_all.fillna(0)

# Sort by states and name for easy dropdown menu population
station_all = station_all.sort_values(by=["state", "name"], ascending = [True, True])

# # and export
# station_all.to_json("GEOJSON_data/stations_all.json", orient="records")
# station_all.to_csv("GEOJSON_data/stations_all.csv")
# station_all.to_json("GEOJSON_data/stations_all.js", orient="records")

station_all.head(-30)

Unnamed: 0,usaf,name,state,lat,lon,min_temp,mean_temp,max_temp,total_snow,total_precipitation,tornado_funnel_cloud,hail
2000,704540,ADAK (NAS),AK,51.883,-176.650,12.9,41.957418,69.1,0.0,2.924200e+02,0.0,0.0
682,997380,ADAK ISLAND,AK,51.870,-176.630,18.7,42.015254,64.9,0.0,0.000000e+00,0.0,0.0
738,703926,AKHIOK,AK,56.933,-154.183,8.1,37.401887,71.1,0.0,3.092500e+02,0.0,0.0
547,702686,AKIAK,AK,60.903,-161.231,-20.0,34.427976,75.9,0.0,1.220168e+04,0.0,0.0
13,999999,ALEKNAGIK 1 NNE,AK,59.284,-158.615,-57.1,52.816638,122.4,0.0,4.902942e+08,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1931,724275,WHEELING OHIO CO,WV,40.167,-80.650,-4.0,52.852055,91.9,0.0,8.494000e+01,0.0,0.0
2225,724140,YEAGER AIRPORT,WV,38.379,-81.590,0.0,55.358082,95.0,67.3,6.073000e+01,0.0,1.0
105,720516,AFTON MUNI,WY,42.711,-110.942,-23.8,36.280387,93.2,0.0,1.069893e+04,0.0,0.0
135,720977,ALPHA (BURNS),WY,41.333,-104.267,-23.8,46.881159,124.7,0.0,4.061400e+02,0.0,0.0


### Add temporary severity rating ###

In [14]:
station_all["severity_rating"] = station_all["tornado_funnel_cloud"]*3 + station_all["hail"]

# and export
station_all.to_json("GEOJSON_data/stations_all.json", orient="records")
station_all.to_csv("GEOJSON_data/stations_all.csv")
station_all.to_json("GEOJSON_data/stations_all.js", orient="records")

station_all.head(-30)

Unnamed: 0,usaf,name,state,lat,lon,min_temp,mean_temp,max_temp,total_snow,total_precipitation,tornado_funnel_cloud,hail,severity_rating
2000,704540,ADAK (NAS),AK,51.883,-176.650,12.9,41.957418,69.1,0.0,2.924200e+02,0.0,0.0,0.0
682,997380,ADAK ISLAND,AK,51.870,-176.630,18.7,42.015254,64.9,0.0,0.000000e+00,0.0,0.0,0.0
738,703926,AKHIOK,AK,56.933,-154.183,8.1,37.401887,71.1,0.0,3.092500e+02,0.0,0.0,0.0
547,702686,AKIAK,AK,60.903,-161.231,-20.0,34.427976,75.9,0.0,1.220168e+04,0.0,0.0,0.0
13,999999,ALEKNAGIK 1 NNE,AK,59.284,-158.615,-57.1,52.816638,122.4,0.0,4.902942e+08,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1931,724275,WHEELING OHIO CO,WV,40.167,-80.650,-4.0,52.852055,91.9,0.0,8.494000e+01,0.0,0.0,0.0
2225,724140,YEAGER AIRPORT,WV,38.379,-81.590,0.0,55.358082,95.0,67.3,6.073000e+01,0.0,1.0,1.0
105,720516,AFTON MUNI,WY,42.711,-110.942,-23.8,36.280387,93.2,0.0,1.069893e+04,0.0,0.0,0.0
135,720977,ALPHA (BURNS),WY,41.333,-104.267,-23.8,46.881159,124.7,0.0,4.061400e+02,0.0,0.0,0.0


In [15]:
# Add "let stations_all = " to the beginning of the file
with open("GEOJSON_data/stations_all.js", 'r+') as file:
        content = file.read()
        file.seek(0, 0)
        file.write("let stations_all = " + content)
        file.close()