In [27]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
import requests
from bs4 import BeautifulSoup
import re
import dateutil
import sys
import logging
from typing import List
from google.cloud import bigquery
from google.cloud import bigquery
from google.oauth2 import service_account

In [28]:
#---------------------Data Cleaning/Transformation-----------------------------------------
#read csv to dataframe
weather_df = pd.read_csv('./data/noaa_weather_data.csv', index_col='DATE')
#rename and filter columns which can be useful
core_weather_df = weather_df[['NAME','PRCP', 'SNOW','SNWD', 'TMAX','TMIN']].copy()
core_weather_df.columns = ['name','precip', 'snow', 'snow_depth', 'temp_max','temp_min']
#fill missing data points
core_weather_df['precip'] = core_weather_df['precip'].fillna(0)
core_weather_df['snow'] = core_weather_df['snow'].fillna(0)
core_weather_df['snow_depth'] = core_weather_df['snow_depth'].fillna(0)
#map the name to a number variable to be used as a predictor
core_weather_df['name_num'] = pd.factorize(core_weather_df['name'])[0]
#'ffill' ('forward fill') fills the value in with the value from the previous date this will apply to the temp_min/max columns since the others have already been filled in
core_weather_df = core_weather_df.fillna(method='ffill')
#change date index to a datetime data type
core_weather_df.index = pd.to_datetime(core_weather_df.index)
#make a visual representation of the data and remove cities that have large chunks of missing data
core_weather_df = core_weather_df[core_weather_df.name.isin(['BEMIDJI, MN US', 'SHARJAH INTER. AIRP, AE']) == False]
# core_weather_df.groupby(by='name').plot()
#create a column 'target_temp_max' by shifting all values in the temp_max column back a day...creates a column based on temperatures from tomorrows temps
core_weather_df['target_temp_max_day_1'] = core_weather_df.groupby('name')['temp_max'].shift(-1)
core_weather_df['target_temp_max_day_2'] = core_weather_df.groupby('name')['target_temp_max_day_1'].shift(-1)
core_weather_df['target_temp_max_day_3'] = core_weather_df.groupby('name')['target_temp_max_day_2'].shift(-1)
core_weather_df['target_temp_max_day_4'] = core_weather_df.groupby('name')['target_temp_max_day_3'].shift(-1)
core_weather_df['target_temp_max_day_5'] = core_weather_df.groupby('name')['target_temp_max_day_4'].shift(-1)
core_weather_df['target_temp_max_day_6'] = core_weather_df.groupby('name')['target_temp_max_day_5'].shift(-1)
core_weather_df['target_temp_max_day_7'] = core_weather_df.groupby('name')['target_temp_max_day_6'].shift(-1)
core_weather_df['target_temp_max_day_8'] = core_weather_df.groupby('name')['target_temp_max_day_7'].shift(-1)
core_weather_df['target_temp_max_day_9'] = core_weather_df.groupby('name')['target_temp_max_day_8'].shift(-1)
core_weather_df['target_temp_max_day_10'] = core_weather_df.groupby('name')['target_temp_max_day_9'].shift(-1)
#remove the last row since its a NaN value cuz that value would be in the future
core_weather_df = core_weather_df.dropna()
#map each citiy to a numerical values to be used in the regression analysis
core_weather_df['name'] = core_weather_df['name'].astype('category')
core_weather_df['city_number'] = core_weather_df['name'].cat.codes
#training set includes all data before and including dec 12th 2021, this is teh set that is used to train the model to predict future data
training_set = core_weather_df.loc[:'2021-12-31']
#test set includes all data after and including jan 1st 2022
test_set = core_weather_df.loc['2022-01-01':]

In [29]:
#-----------------create/test data model--------------------------------
#using Ridge regression to minimize overfitting
regression = Ridge(alpha=.1)
predictors = ['precip','temp_max','temp_min','city_number']
#training set includes all data before and including dec 12th 2021, this is teh set that is used to train the model to predict future data
training_set = core_weather_df.loc[:'2021-12-31']
# #test set includes all data after and including jan 1st 2022
test_set = test_set.loc['2022-01-01':]

# test_set_shifted = test_set.shift(365, freq='D')
# test_set2= test_set_shifted.loc['2023-01-01':'2023-12-31']

#train the model based on the predictors
regression.fit(training_set[predictors], training_set['target_temp_max_day_1'])
predictions = regression.predict(test_set[predictors])

combined_df = pd.concat([test_set[['name','target_temp_max_day_1']], pd.Series(predictions, index = test_set.index)], axis=1)
combined_df.columns = ['name','actual','Predicted_temp_next_day']

display(combined_df.tail())

Unnamed: 0_level_0,name,actual,Predicted_temp_next_day
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-02-10,"ORLANDO EXECUTIVE AIRPORT, FL US",84.0,83.125554
2023-02-11,"ORLANDO EXECUTIVE AIRPORT, FL US",72.0,83.602435
2023-02-12,"ORLANDO EXECUTIVE AIRPORT, FL US",71.0,71.718667
2023-02-13,"ORLANDO EXECUTIVE AIRPORT, FL US",77.0,70.124692
2023-02-14,"ORLANDO EXECUTIVE AIRPORT, FL US",82.0,74.84006


In [30]:
#------------scrape current data an append to historical dataset---------------------------------
#-------------------Extract-------------------------
urls = ["https://forecast.weather.gov/MapClick.php?lat=57.0826&lon=-135.2692#.Y-vs_9LMJkg",
        'https://forecast.weather.gov/MapClick.php?lat=45.5118&lon=-122.6756#.Y-vtHNLMJkg',
        'https://forecast.weather.gov/MapClick.php?lat=35.0842&lon=-106.649#.ZAKIR9LMJhE',
        'https://forecast.weather.gov/MapClick.php?lat=44.8017&lon=-68.7708#.ZAKIY9LMJhE',
        'https://forecast.weather.gov/MapClick.php?lat=64.8453&lon=-147.7221#.ZAKId9LMJhE',
        'https://forecast.weather.gov/MapClick.php?lat=20.8986&lon=-156.4305#.ZAKIjtLMJhE',
        'https://forecast.weather.gov/MapClick.php?lat=48.1786&lon=-114.3037#.ZAKIqNLMJhE',
        'https://forecast.weather.gov/MapClick.php?lat=37.4142&lon=-79.143#.ZAKIwtLMJhE',
        'https://forecast.weather.gov/MapClick.php?lat=42.9467&lon=-87.8967#.ZAKI0tLMJhE',
        'https://forecast.weather.gov/MapClick.php?lat=28.4272&lon=-81.308#.ZAKI5NLMJhE',
        'https://forecast.weather.gov/MapClick.php?lat=39.8784&lon=-75.2402#.ZAKJAdLMJhE'
        ]
        
combined_df = pd.DataFrame()

for url in urls:
    r = requests.get(url)
    soup = BeautifulSoup(r.content,"html.parser")

    #various containers
    item1 = soup.find_all(id='current_conditions-summary')
    item2 = soup.find_all(id='current_conditions_detail')
    item4 = soup.find_all(id='tombstone-container')

    #raw data
    temp_f = [item.find(class_="myforecast-current-lrg").get_text() for item in item1]
    temp_min = soup.find('p', {'class': 'temp temp-low'}).text.strip()
    temp_max = soup.find('p', {'class': 'temp temp-high'}).text.strip()


    #df of temperatures
    df_temperature = pd.DataFrame({"temp" : temp_f,'tempmin': temp_min,'tempmax': temp_max})

    #df_2 is a df of current conditions in detail (Humidity, Wind Speed, Barometer, Dewpoint, Visibility, Last update)
    table = soup.find_all('table')
    df_2 = pd.read_html(str(table))[0]
    df_2 = df_2.rename(columns={'1':'metrics'})
    # df_2['1'] = df_2['1'].fillna(0)
    df_2 = df_2.pivot(columns=0, values=1).ffill().dropna().reset_index().drop(columns=['index'])

    #merge both dataframes
    temp_df=pd.concat([df_temperature,df_2],axis=1)

    #scrape lattitude, longitude, and elevation 
    lat_lon_elev = soup.find('span', {'class': 'smallTxt'}).text.strip()
    lat, lon, elev = re.findall(r'[-+]?\d*\.\d+|\d+', lat_lon_elev)

    #scrape name
    station = soup.find('h2', {'class': 'panel-title'}).text.strip()

    #add location, lat, long, and elev to source_df
    temp_df['elevation_ft'] = elev
    temp_df['latitude'] = lat
    temp_df['longitude'] = lon
    temp_df['weather_station'] = station

    combined_df = pd.concat([temp_df, combined_df], ignore_index=True, sort=False)

display(combined_df)

Unnamed: 0,temp,tempmin,tempmax,Barometer,Dewpoint,Humidity,Last update,Visibility,Wind Chill,Wind Speed,elevation_ft,latitude,longitude,weather_station,Heat Index
0,52°F,Low: 32 °F,High: 44 °F,30.14 in (1020.5 mb),26°F (-3°C),37%,9 Mar 3:54 pm EST,10.00 mi,47°F (8°C),N 18 G 24 mph,7,39.87,75.23,"Philadelphia, Philadelphia International Airpo...",
1,79°F,Low: 60 °F,High: 80 °F,30.10 in (1019.0 mb),52°F (11°C),39%,9 Mar 3:53 pm EST,10.00 mi,,Vrbl 6 mph,89,28.42,81.32,Orlando International Airport (KMCO),79°F (26°C)
2,38°F,Low: 32 °F,High: 39 °F,30.27 in (1025.6 mb),27°F (-3°C),65%,9 Mar 2:52 pm CST,10.00 mi,30°F (-1°C),NE 12 mph,722,42.96,87.9,"Milwaukee, General Mitchell International Airp...",
3,62°F,Low: 38 °F,High: 62 °F,30.12 in (1020.2 mb),20°F (-7°C),20%,9 Mar 3:54 pm EST,10.00 mi,,Calm,935,37.32,79.21,"Lynchburg, Lynchburg Regional Airport (KLYH)",
4,28°F,Low: 17 °F,High: 31 °F,30.1 in (1019.3 mb),10°F (-12°C),46%,09 Mar 01:35 PM MST,10.00 mi,,N 3 MPH,2972,48.30417,114.26361,"Kalispell, Glacier Park International Airport ...",
5,80°F,Low: 60 °F,High: 79 °F,29.92 in (1013.5 mb),57°F (14°C),45%,9 Mar 10:54 am HST,10.00 mi,,W 12 G 23 mph,52,20.89,156.44,"Kahului, Kahului Airport (PHOG)",80°F (27°C)
6,24°F,Low: -1 °F,High: 29 °F,30.38 in (1029.9 mb),19°F (-7°C),81%,9 Mar 11:53 am AKST,1.25 mi,13°F (-11°C),SW 12 mph,433,64.8,147.88,"Fairbanks, Fairbanks International Airport (PAFA)",
7,41°F,Low: 23 °F,High: 41 °F,30.08 in (1018.6 mb),23°F (-5°C),49%,9 Mar 3:53 pm EST,10.00 mi,34°F (1°C),N 12 mph,190,44.8,68.82,"Bangor, Bangor International Airport (KBGR)",
8,58°F,Low: 34 °F,High: 62 °F,30.14 in (1016.4 mb),-2°F (-19°C),8%,9 Mar 12:52 pm MST,10.00 mi,,W 8 G 18 mph,5351,35.04,106.61,"Albuquerque, Albuquerque International Airport...",
9,43°F,Low: 37 °F,High: 45 °F,29.78 in (1008.47 mb),36°F (2°C),75%,09 Mar 12:20 PM PST,10.00 mi,35°F (2°C),E 19 MPH,20,45.59578,122.60917,"Portland, Portland International Airport (KPDX)",


In [31]:
urls = [
        'https://www.localconditions.com/weather-philadelphia-pennsylvania/19019/past.php',
        'https://www.localconditions.com/weather-orlando-florida/32801/past.php',
        'https://www.localconditions.com/weather-milwaukee-wisconsin/53201/past.php',
        'https://www.localconditions.com/weather-lynchburg-virginia/24501/past.php',
        'https://www.localconditions.com/weather-kalispell-montana/59901/past.php',
        'https://www.localconditions.com/weather-kahului-hawaii/96732/past.php',
        'https://www.localconditions.com/weather-fairbanks-alaska/99701/past.php',
        'https://www.localconditions.com/weather-bangor-maine/04401/past.php',
        'https://www.localconditions.com/weather-albuquerque-new-mexico/87101/past.php',
        'https://www.localconditions.com/weather-portland-oregon/97201/past.php',
        "https://www.localconditions.com/weather-sitka-alaska/99835/past.php"
        ]

precip_df = pd.DataFrame()

for url in urls:
        r = requests.get(url)
        soup = BeautifulSoup(r.content,"html.parser")
        details = soup.select_one(".past_weather_express")
        # Find all div elements with class="panel"
        panel_divs = soup.find_all('div', {'class': 'panel'})
        # Extract the text content of each div element and store it in a list
        panel_texts = [panel_div.text.strip() for panel_div in panel_divs]
        # Print the list of extracted text content
        data = panel_texts[1]
        data = [item.strip() for item in data]
        data = [item for item in data if item]
        data = data[77:90]
        data="".join(data)
        df = pd.DataFrame([data], columns=['precip'])
        precip_df = pd.concat([precip_df, df], ignore_index=True, sort=False)
        
precip_df['precip'] = precip_df['precip'].str.extract(pat='(\d+\.?\d*)').astype(float)
precip_df = precip_df.fillna(0)

source_df =pd.concat([combined_df,precip_df],axis=1)
display(source_df.head(11))

Unnamed: 0,temp,tempmin,tempmax,Barometer,Dewpoint,Humidity,Last update,Visibility,Wind Chill,Wind Speed,elevation_ft,latitude,longitude,weather_station,Heat Index,precip
0,52°F,Low: 32 °F,High: 44 °F,30.14 in (1020.5 mb),26°F (-3°C),37%,9 Mar 3:54 pm EST,10.00 mi,47°F (8°C),N 18 G 24 mph,7,39.87,75.23,"Philadelphia, Philadelphia International Airpo...",,0.0
1,79°F,Low: 60 °F,High: 80 °F,30.10 in (1019.0 mb),52°F (11°C),39%,9 Mar 3:53 pm EST,10.00 mi,,Vrbl 6 mph,89,28.42,81.32,Orlando International Airport (KMCO),79°F (26°C),0.0
2,38°F,Low: 32 °F,High: 39 °F,30.27 in (1025.6 mb),27°F (-3°C),65%,9 Mar 2:52 pm CST,10.00 mi,30°F (-1°C),NE 12 mph,722,42.96,87.9,"Milwaukee, General Mitchell International Airp...",,0.0
3,62°F,Low: 38 °F,High: 62 °F,30.12 in (1020.2 mb),20°F (-7°C),20%,9 Mar 3:54 pm EST,10.00 mi,,Calm,935,37.32,79.21,"Lynchburg, Lynchburg Regional Airport (KLYH)",,0.0
4,28°F,Low: 17 °F,High: 31 °F,30.1 in (1019.3 mb),10°F (-12°C),46%,09 Mar 01:35 PM MST,10.00 mi,,N 3 MPH,2972,48.30417,114.26361,"Kalispell, Glacier Park International Airport ...",,0.0
5,80°F,Low: 60 °F,High: 79 °F,29.92 in (1013.5 mb),57°F (14°C),45%,9 Mar 10:54 am HST,10.00 mi,,W 12 G 23 mph,52,20.89,156.44,"Kahului, Kahului Airport (PHOG)",80°F (27°C),0.216
6,24°F,Low: -1 °F,High: 29 °F,30.38 in (1029.9 mb),19°F (-7°C),81%,9 Mar 11:53 am AKST,1.25 mi,13°F (-11°C),SW 12 mph,433,64.8,147.88,"Fairbanks, Fairbanks International Airport (PAFA)",,0.044
7,41°F,Low: 23 °F,High: 41 °F,30.08 in (1018.6 mb),23°F (-5°C),49%,9 Mar 3:53 pm EST,10.00 mi,34°F (1°C),N 12 mph,190,44.8,68.82,"Bangor, Bangor International Airport (KBGR)",,0.015
8,58°F,Low: 34 °F,High: 62 °F,30.14 in (1016.4 mb),-2°F (-19°C),8%,9 Mar 12:52 pm MST,10.00 mi,,W 8 G 18 mph,5351,35.04,106.61,"Albuquerque, Albuquerque International Airport...",,0.0
9,43°F,Low: 37 °F,High: 45 °F,29.78 in (1008.47 mb),36°F (2°C),75%,09 Mar 12:20 PM PST,10.00 mi,35°F (2°C),E 19 MPH,20,45.59578,122.60917,"Portland, Portland International Airport (KPDX)",,0.04


In [32]:
#-----------Data Transformations-----------------

# Convert 'lat' and 'lon' columns to float type
source_df['latitude'] = source_df['latitude'].astype(float)
source_df['longitude'] = source_df['longitude'].astype(float)

# Convert 'elev' column to int type
source_df['elevation_ft'] = source_df['elevation_ft'].astype(int)

# Extract the numeric part of the temperature string and convert it to int
source_df['temp'] = source_df['temp'].str.extract('(\d+)').astype(float)

# Extract the numeric part of the tempmin string and convert it to int
source_df['tempmin'] = source_df['tempmin'].str.extract('(\d+)').astype(float)

# Extract the numeric part of the temperature string and convert it to int
source_df['tempmax'] = source_df['tempmax'].str.extract('(\d+)').astype(float)

# Split wind speed values into components and convert speed to int type
source_df['Wind Speed'] = source_df['Wind Speed'].str.extract('(\d+)', expand=False).fillna(0).astype(float)

# Convert 'humidity' column to int type
source_df['Humidity'] = source_df['Humidity'].str.extract('(\d+)', expand=False).astype(float)

# Convert 'barometer' column to float type, and convert inches to millibars
source_df['Barometer'] = round(source_df['Barometer'].apply(lambda x: float(x.split()[0]) * 33.8639 if 'in' in x and x != 'NA' else None), 2)

# Convert 'Visibility' column to float type
source_df['Visibility'] = source_df['Visibility'].str.extract('(\d+\.\d+|\d+)', expand=False).astype(float).round(2)

#Convert 'last_update' column to UTC
source_df['Last update'] = source_df['Last update'].apply(lambda x: dateutil.parser.parse(x, tzinfos={"EST": -5 * 3600, "CST": -6 * 3600, "MST": -7 * 3600,"PST": -8 * 3600,"AKST": -9 * 3600,"HST": -10 * 3600}))
source_df['Last update'] = source_df['Last update'].apply(lambda x: x.astimezone(dateutil.tz.tzutc()))
source_df['datetime'] = source_df['Last update'].dt.strftime('%Y-%m-%d')
source_df['datetime'] = pd.to_datetime(source_df['datetime'])

# make wind chill a float if exists and only display degree F
try:
    source_df[['Wind Chill']] = source_df['Wind Chill'].str.extract('(\d+)', expand=True).astype(float)
except:
    None

# extract the numeric value of dewpoint and only display the degree n farenheit
source_df[['Dewpoint']] = source_df['Dewpoint'].str.extract('(\d+)', expand=True).astype(float)

#change precip data type to float
source_df['precip'] = source_df['precip'].astype(float)

#rename weather station column to the city
def rename_station(value):
    if value == 'Portland, Portland International Airport (KPDX)':
        return 'PORTLAND INTERNATIONAL AIRPORT, OR US'
    elif value == 'Sitka - Sitka Airport (PASI)':
        return 'SITKA AIRPORT, AK US'
    elif value == 'Philadelphia, Philadelphia International Airport (KPHL)':
        return 'PHILADELPHIA INTERNATIONAL AIRPORT, PA US'
    elif value == 'Orlando International Airport (KMCO)':
        return 'ORLANDO EXECUTIVE AIRPORT, FL US'
    elif value == 'Milwaukee, General Mitchell International Airport (KMKE)':
        return 'MILWAUKEE MITCHELL AIRPORT, WI US'
    elif value == 'Lynchburg, Lynchburg Regional Airport (KLYH)':
        return 'LYNCHBURG REGIONAL AIRPORT, VA US'
    elif value == 'Kalispell, Glacier Park International Airport (KGPI)':
        return 'KALISPELL GLACIER AIRPORT, MT US'
    elif value == 'Kahului, Kahului Airport (PHOG)':
        return 'KAHULUI AIRPORT, HI US'
    elif value == 'Fairbanks, Fairbanks International Airport (PAFA)':
        return 'FAIRBANKS INTERNATIONAL AIRPORT, AK US'
    elif value == 'Albuquerque, Albuquerque International Airport (KABQ)':
        return 'ALBUQUERQUE INTERNATIONAL AIRPORT, NM US'
    elif value == 'Bangor, Bangor International Airport (KBGR)':
        return 'BANGOR INTERNATIONAL AIRPORT, ME US'
source_df['name'] = source_df['weather_station'].map(rename_station)

#change the names and order of columns to better fit the historical data
source_df = source_df.rename({'Humidity': 'humidity', 'Wind Speed': 'windspeed', 'Visibility': 'visibility','Wind Chill': 'windchill','Dewpoint':'dewpoint','tempmax':'temp_max','tempmin':'temp_min'}, axis=1) 
#this line only includes necesarry columns
source_df = source_df[['name','datetime','precip','temp_max','temp_min','temp','windchill','dewpoint','humidity','windspeed','visibility']]
source_df = source_df.fillna(0)
source_df = source_df.set_index(['datetime'])
source_df.index.names = ['DATE']

#combine source df and test set so the test set had the most current info
test_set_forecast = pd.concat([test_set, source_df])
#map each citiy to a numerical values to be used in the regression analysis
test_set_forecast['name'] = test_set_forecast['name'].astype('category')
test_set_forecast['city_number'] = test_set_forecast['name'].cat.codes
test_set_forecast = test_set_forecast[['name','precip','temp_max','temp_min','city_number']]
display(source_df.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 11 entries, 2023-03-09 to 2023-03-09
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        11 non-null     object 
 1   precip      11 non-null     float64
 2   temp_max    11 non-null     float64
 3   temp_min    11 non-null     float64
 4   temp        11 non-null     float64
 5   windchill   11 non-null     float64
 6   dewpoint    11 non-null     float64
 7   humidity    11 non-null     float64
 8   windspeed   11 non-null     float64
 9   visibility  11 non-null     float64
dtypes: float64(9), object(1)
memory usage: 968.0+ bytes


None

In [33]:
display(source_df.head(11))

Unnamed: 0_level_0,name,precip,temp_max,temp_min,temp,windchill,dewpoint,humidity,windspeed,visibility
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-03-09,"PHILADELPHIA INTERNATIONAL AIRPORT, PA US",0.0,44.0,32.0,52.0,47.0,26.0,37.0,18.0,10.0
2023-03-09,"ORLANDO EXECUTIVE AIRPORT, FL US",0.0,80.0,60.0,79.0,0.0,52.0,39.0,6.0,10.0
2023-03-09,"MILWAUKEE MITCHELL AIRPORT, WI US",0.0,39.0,32.0,38.0,30.0,27.0,65.0,12.0,10.0
2023-03-09,"LYNCHBURG REGIONAL AIRPORT, VA US",0.0,62.0,38.0,62.0,0.0,20.0,20.0,0.0,10.0
2023-03-09,"KALISPELL GLACIER AIRPORT, MT US",0.0,31.0,17.0,28.0,0.0,10.0,46.0,3.0,10.0
2023-03-09,"KAHULUI AIRPORT, HI US",0.216,79.0,60.0,80.0,0.0,57.0,45.0,12.0,10.0
2023-03-09,"FAIRBANKS INTERNATIONAL AIRPORT, AK US",0.044,29.0,1.0,24.0,13.0,19.0,81.0,12.0,1.25
2023-03-09,"BANGOR INTERNATIONAL AIRPORT, ME US",0.015,41.0,23.0,41.0,34.0,23.0,49.0,12.0,10.0
2023-03-09,"ALBUQUERQUE INTERNATIONAL AIRPORT, NM US",0.0,62.0,34.0,58.0,0.0,2.0,8.0,8.0,10.0
2023-03-09,"PORTLAND INTERNATIONAL AIRPORT, OR US",0.04,45.0,37.0,43.0,35.0,36.0,75.0,19.0,10.0


In [34]:
#-----------------create/test data model--------------------------------
#using Ridge regression to minimize overfitting
regression = Ridge(alpha=.1)
predictors = ['precip','temp_max','temp_min']
#training set includes all data before and including dec 12th 2021, this is teh set that is used to train the model to predict future data
training_set = core_weather_df.loc[:'2021-12-31']
# #test set includes all data after and including jan 1st 2022
#test_set = test_set.loc['2022-01-01':]

# test_set_shifted = test_set.shift(365, freq='D')
# test_set2= test_set_shifted.loc['2023-01-01':'2023-12-31']

#train the model based on the predictors
regression.fit(training_set[predictors], training_set['target_temp_max_day_1'])
predictions = regression.predict(test_set_forecast[predictors])

combined_df = pd.concat([test_set_forecast[['name','temp_max']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_1_day']
combined_df = combined_df.astype({"Predicted_high_temp_1_day": int, "current_day_max_temp": int})

core_weather_df.name.unique()
display(combined_df.tail(15))


Unnamed: 0_level_0,name,current_day_max_temp,Predicted_high_temp_1_day
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-02-11,"ORLANDO EXECUTIVE AIRPORT, FL US",84,83
2023-02-12,"ORLANDO EXECUTIVE AIRPORT, FL US",72,71
2023-02-13,"ORLANDO EXECUTIVE AIRPORT, FL US",71,70
2023-02-14,"ORLANDO EXECUTIVE AIRPORT, FL US",77,75
2023-03-09,"PHILADELPHIA INTERNATIONAL AIRPORT, PA US",44,45
2023-03-09,"ORLANDO EXECUTIVE AIRPORT, FL US",80,79
2023-03-09,"MILWAUKEE MITCHELL AIRPORT, WI US",39,41
2023-03-09,"LYNCHBURG REGIONAL AIRPORT, VA US",62,61
2023-03-09,"KALISPELL GLACIER AIRPORT, MT US",31,32
2023-03-09,"KAHULUI AIRPORT, HI US",79,78


In [35]:
display(test_set_forecast.head())
display(test_set)

Unnamed: 0_level_0,name,precip,temp_max,temp_min,city_number
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-01,"FAIRBANKS INTERNATIONAL AIRPORT, AK US",0.0,-7.0,-17.0,2
2022-01-02,"FAIRBANKS INTERNATIONAL AIRPORT, AK US",0.0,-16.0,-47.0,2
2022-01-03,"FAIRBANKS INTERNATIONAL AIRPORT, AK US",0.0,-14.0,-45.0,2
2022-01-04,"FAIRBANKS INTERNATIONAL AIRPORT, AK US",0.0,-19.0,-37.0,2
2022-01-05,"FAIRBANKS INTERNATIONAL AIRPORT, AK US",0.0,-26.0,-38.0,2


Unnamed: 0_level_0,name,precip,snow,snow_depth,temp_max,temp_min,name_num,target_temp_max_day_1,target_temp_max_day_2,target_temp_max_day_3,target_temp_max_day_4,target_temp_max_day_5,target_temp_max_day_6,target_temp_max_day_7,target_temp_max_day_8,target_temp_max_day_9,target_temp_max_day_10,city_number
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2022-01-01,"FAIRBANKS INTERNATIONAL AIRPORT, AK US",0.00,0.1,33.9,-7.0,-17.0,0,-16.0,-14.0,-19.0,-26.0,-6.0,-26.0,-30.0,2.0,3.0,7.0,2
2022-01-02,"FAIRBANKS INTERNATIONAL AIRPORT, AK US",0.00,0.0,33.1,-16.0,-47.0,0,-14.0,-19.0,-26.0,-6.0,-26.0,-30.0,2.0,3.0,7.0,12.0,2
2022-01-03,"FAIRBANKS INTERNATIONAL AIRPORT, AK US",0.00,0.0,33.1,-14.0,-45.0,0,-19.0,-26.0,-6.0,-26.0,-30.0,2.0,3.0,7.0,12.0,21.0,2
2022-01-04,"FAIRBANKS INTERNATIONAL AIRPORT, AK US",0.00,0.0,33.1,-19.0,-37.0,0,-26.0,-6.0,-26.0,-30.0,2.0,3.0,7.0,12.0,21.0,20.0,2
2022-01-05,"FAIRBANKS INTERNATIONAL AIRPORT, AK US",0.00,0.0,33.1,-26.0,-38.0,0,-6.0,-26.0,-30.0,2.0,3.0,7.0,12.0,21.0,20.0,7.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-10,"ORLANDO EXECUTIVE AIRPORT, FL US",0.21,0.0,0.0,84.0,66.0,12,84.0,72.0,71.0,77.0,82.0,85.0,87.0,71.0,84.0,84.0,7
2023-02-11,"ORLANDO EXECUTIVE AIRPORT, FL US",0.01,0.0,0.0,84.0,67.0,12,72.0,71.0,77.0,82.0,85.0,87.0,71.0,84.0,84.0,85.0,7
2023-02-12,"ORLANDO EXECUTIVE AIRPORT, FL US",0.02,0.0,0.0,72.0,54.0,12,71.0,77.0,82.0,85.0,87.0,71.0,84.0,84.0,85.0,87.0,7
2023-02-13,"ORLANDO EXECUTIVE AIRPORT, FL US",0.00,0.0,0.0,71.0,49.0,12,77.0,82.0,85.0,87.0,71.0,84.0,84.0,85.0,87.0,88.0,7


In [36]:
def max_temp_10_day_forecast(predictors, core_weather_df,test_set_forecast, regression):
    
    training_set = core_weather_df.loc[:'2021-12-31']
    test_set_forecast = test_set_forecast[['name','precip','temp_max','temp_min','city_number']]
    
    regression.fit(training_set[predictors], training_set['target_temp_max_day_1'])
    predictions = regression.predict(test_set_forecast[predictors])
    combined_df = pd.concat([test_set_forecast[['name','temp_max']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
    combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1']

    regression.fit(training_set[predictors], training_set['target_temp_max_day_2'])
    predictions = regression.predict(test_set_forecast[predictors])
    combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
    combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2']

    regression.fit(training_set[predictors], training_set['target_temp_max_day_3'])
    predictions = regression.predict(test_set_forecast[predictors])
    combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
    combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3']

    regression.fit(training_set[predictors], training_set['target_temp_max_day_4'])
    predictions = regression.predict(test_set_forecast[predictors])
    combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
    combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4']
    
    regression.fit(training_set[predictors], training_set['target_temp_max_day_5'])
    predictions = regression.predict(test_set_forecast[predictors])
    combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
    combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5']
    
    regression.fit(training_set[predictors], training_set['target_temp_max_day_6'])
    predictions = regression.predict(test_set_forecast[predictors])
    combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
    combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6']
    
    regression.fit(training_set[predictors], training_set['target_temp_max_day_7'])
    predictions = regression.predict(test_set_forecast[predictors])
    combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
    combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6','Predicted_high_temp_day_7']
    
    regression.fit(training_set[predictors], training_set['target_temp_max_day_8'])
    predictions = regression.predict(test_set_forecast[predictors])
    combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6','Predicted_high_temp_day_7']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
    combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6','Predicted_high_temp_day_7','Predicted_high_temp_day_8']
    
    regression.fit(training_set[predictors], training_set['target_temp_max_day_9'])
    predictions = regression.predict(test_set_forecast[predictors])
    combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6','Predicted_high_temp_day_7','Predicted_high_temp_day_8']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
    combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6','Predicted_high_temp_day_7','Predicted_high_temp_day_8','Predicted_high_temp_day_9']
    
    regression.fit(training_set[predictors], training_set['target_temp_max_day_10'])
    predictions = regression.predict(test_set_forecast[predictors])
    combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6','Predicted_high_temp_day_7','Predicted_high_temp_day_8','Predicted_high_temp_day_9']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
    combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6','Predicted_high_temp_day_7','Predicted_high_temp_day_8','Predicted_high_temp_day_9','Predicted_high_temp_day_10']
    
    combined_df['date'] = combined_df.index

    combined_df = combined_df.astype({'current_day_max_temp':int,'Predicted_high_temp_day_1':int,'Predicted_high_temp_day_2':int,'Predicted_high_temp_day_3':int,'Predicted_high_temp_day_4':int,'Predicted_high_temp_day_5':int,'Predicted_high_temp_day_6':int,'Predicted_high_temp_day_7':int,'Predicted_high_temp_day_8':int,'Predicted_high_temp_day_9':int,'Predicted_high_temp_day_10':int})

    combined_df = combined_df.iloc[-11:]

    return combined_df






In [37]:
display(max_temp_10_day_forecast(predictors, core_weather_df,test_set_forecast, regression))

Unnamed: 0_level_0,name,current_day_max_temp,Predicted_high_temp_day_1,Predicted_high_temp_day_2,Predicted_high_temp_day_3,Predicted_high_temp_day_4,Predicted_high_temp_day_5,Predicted_high_temp_day_6,Predicted_high_temp_day_7,Predicted_high_temp_day_8,Predicted_high_temp_day_9,Predicted_high_temp_day_10,date
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2023-03-09,"PHILADELPHIA INTERNATIONAL AIRPORT, PA US",44,45,46,47,47,47,47,47,47,47,47,2023-03-09
2023-03-09,"ORLANDO EXECUTIVE AIRPORT, FL US",80,79,78,78,78,78,78,78,78,77,77,2023-03-09
2023-03-09,"MILWAUKEE MITCHELL AIRPORT, WI US",39,41,42,43,43,44,44,44,44,44,44,2023-03-09
2023-03-09,"LYNCHBURG REGIONAL AIRPORT, VA US",62,61,61,60,60,60,60,60,60,60,60,2023-03-09
2023-03-09,"KALISPELL GLACIER AIRPORT, MT US",31,32,33,34,34,34,35,35,35,35,35,2023-03-09
2023-03-09,"KAHULUI AIRPORT, HI US",79,78,77,77,77,77,77,77,77,77,77,2023-03-09
2023-03-09,"FAIRBANKS INTERNATIONAL AIRPORT, AK US",29,28,28,29,29,29,29,29,29,29,30,2023-03-09
2023-03-09,"BANGOR INTERNATIONAL AIRPORT, ME US",41,41,42,42,42,43,43,43,43,43,43,2023-03-09
2023-03-09,"ALBUQUERQUE INTERNATIONAL AIRPORT, NM US",62,60,60,59,59,59,59,59,59,59,59,2023-03-09
2023-03-09,"PORTLAND INTERNATIONAL AIRPORT, OR US",45,47,48,48,49,49,49,49,49,49,49,2023-03-09


In [38]:
training_set = core_weather_df.loc[:'2021-12-31']
test_set_forecast = test_set_forecast[['name','precip','temp_max','temp_min','city_number']]

regression.fit(training_set[predictors], training_set['target_temp_max_day_1'])
predictions = regression.predict(test_set_forecast[predictors])
combined_df = pd.concat([test_set_forecast[['name','temp_max']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1']

regression.fit(training_set[predictors], training_set['target_temp_max_day_2'])
predictions = regression.predict(test_set_forecast[predictors])
combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2']

regression.fit(training_set[predictors], training_set['target_temp_max_day_3'])
predictions = regression.predict(test_set_forecast[predictors])
combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3']

regression.fit(training_set[predictors], training_set['target_temp_max_day_4'])
predictions = regression.predict(test_set_forecast[predictors])
combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4']

regression.fit(training_set[predictors], training_set['target_temp_max_day_5'])
predictions = regression.predict(test_set_forecast[predictors])
combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5']

regression.fit(training_set[predictors], training_set['target_temp_max_day_6'])
predictions = regression.predict(test_set_forecast[predictors])
combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6']

regression.fit(training_set[predictors], training_set['target_temp_max_day_7'])
predictions = regression.predict(test_set_forecast[predictors])
combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6','Predicted_high_temp_day_7']

regression.fit(training_set[predictors], training_set['target_temp_max_day_8'])
predictions = regression.predict(test_set_forecast[predictors])
combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6','Predicted_high_temp_day_7']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6','Predicted_high_temp_day_7','Predicted_high_temp_day_8']

regression.fit(training_set[predictors], training_set['target_temp_max_day_9'])
predictions = regression.predict(test_set_forecast[predictors])
combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6','Predicted_high_temp_day_7','Predicted_high_temp_day_8']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6','Predicted_high_temp_day_7','Predicted_high_temp_day_8','Predicted_high_temp_day_9']

regression.fit(training_set[predictors], training_set['target_temp_max_day_10'])
predictions = regression.predict(test_set_forecast[predictors])
combined_df = pd.concat([combined_df[['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6','Predicted_high_temp_day_7','Predicted_high_temp_day_8','Predicted_high_temp_day_9']], pd.Series(predictions, index = test_set_forecast.index)], axis=1)
combined_df.columns = ['name','current_day_max_temp','Predicted_high_temp_day_1','Predicted_high_temp_day_2','Predicted_high_temp_day_3','Predicted_high_temp_day_4','Predicted_high_temp_day_5','Predicted_high_temp_day_6','Predicted_high_temp_day_7','Predicted_high_temp_day_8','Predicted_high_temp_day_9','Predicted_high_temp_day_10']

combined_df['date'] = combined_df.index

combined_df = combined_df.astype({'current_day_max_temp':int,'Predicted_high_temp_day_1':int,'Predicted_high_temp_day_2':int,'Predicted_high_temp_day_3':int,'Predicted_high_temp_day_4':int,'Predicted_high_temp_day_5':int,'Predicted_high_temp_day_6':int,'Predicted_high_temp_day_7':int,'Predicted_high_temp_day_8':int,'Predicted_high_temp_day_9':int,'Predicted_high_temp_day_10':int})

combined_df = combined_df.iloc[-11:]



In [39]:
PROJECT_ID = "deb-01-372116"
DATASET_ID = "Weather_Forecaster"
DAILY_TABLE_ID = "ten_day_forecast"

SCHEMA = [
            # indexes are written if only named in the schema
            bigquery.SchemaField('name', 'STRING', mode='REQUIRED'),
            bigquery.SchemaField('current_day_max_temp', 'INT64', mode='NULLABLE'),
            bigquery.SchemaField('Predicted_high_temp_day_1', 'INT64', mode='NULLABLE'),
            bigquery.SchemaField('Predicted_high_temp_day_2', 'INT64', mode='NULLABLE'),
            bigquery.SchemaField('Predicted_high_temp_day_3', 'INT64', mode='NULLABLE'),
            bigquery.SchemaField('Predicted_high_temp_day_4', 'INT64', mode='NULLABLE'),
            bigquery.SchemaField('Predicted_high_temp_day_5', 'INT64', mode='NULLABLE'),
            bigquery.SchemaField('Predicted_high_temp_day_6', 'INT64', mode='NULLABLE'),
            bigquery.SchemaField('Predicted_high_temp_day_7', 'INT64', mode='NULLABLE'),
            bigquery.SchemaField('Predicted_high_temp_day_8', 'INT64', mode='NULLABLE'),
            bigquery.SchemaField('Predicted_high_temp_day_9', 'INT64', mode='NULLABLE'),
            bigquery.SchemaField('Predicted_high_temp_day_10', 'INT64', mode='REQUIRED'),
            bigquery.SchemaField('date', 'DATETIME', mode='NULLABLE'),
        ]

#change the date column back to a datetime dtype
# df = pd.DataFrame(data)

client = bigquery.Client()

try:
    dataset_ref = client.dataset(DATASET_ID)
    dataset = client.get_dataset(dataset_ref)
except:
    dataset_ref = client.dataset(DATASET_ID)
    dataset = bigquery.Dataset(dataset_ref)
    # dataset.location = "US"
    dataset = client.create_dataset(dataset)

table_ref = dataset.table(DAILY_TABLE_ID)

try:
    client.get_table(table_ref)
except:
    table = bigquery.Table(table_ref, schema=SCHEMA)
    table = client.create_table(table)

job_config = bigquery.LoadJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE)
job = client.load_table_from_dataframe(combined_df, table_ref, job_config=job_config)
job.result()

  ", ".join(field.name for field in unknown_type_fields)


LoadJob<project=deb-01-372116, location=US, id=2b942840-abb8-4a09-8356-9bbe3c7980a4>