# Download weather data

For documentation of the available data see https://www.visualcrossing.com/resources/documentation/weather-data/what-historical-weather-measures-are-available/

In [1]:
import codecs
import csv
import datetime
import math
import os
import sys
import urllib.parse
import urllib.request

import pandas as pd

Define variables

In [2]:
# Get dataframe of all countries
df = pd.read_csv("countries.csv")

# Tags for naming the resulting csv-files, keyed by the year of the data
data_tags = {"2021": '_weather_data.csv', "2020": '_weather_data_2020.csv'}

# API keys. 1000 records per day are possible for free per key.
# A record is a data set for one date and one location.
# SECURITY: credentials should not be stored in the notebook. Each key is taken
# from an environment variable when that variable is set and non-empty; the
# literals are kept only as a backward-compatible fallback and should be
# rotated and removed.
query_key_1 = os.environ.get('VISUALCROSSING_KEY_1') or 'SVLNNSJNRK2D57FMK77EUUFY5' # Emil's key
query_key_2 = os.environ.get('VISUALCROSSING_KEY_2') or 'MTUW5B3YVNE4N4N6V3WVQX4X9' # Sebastian's key

Define function to format a location string for the weather data retrieval function

In [3]:
# Cities that are not recognized by the API when given by name, mapped to the
# "latitude,longitude" string that is sent instead
unavailable_cities = {
    'Baku': '40.379610,49.843607',
    'Riga': '56.949650,24.105186',
    'Kiev': '50.450100,30.523399'
}

def format_city_string_for_request(city):
    """Return `city` in the form that is inserted into the request URL.

    Parameters
    ----------
    city : str
        City name as it appears in the countries table.

    Returns
    -------
    str
        The coordinate string from `unavailable_cities` if the city is listed
        there (returned unchanged), otherwise the city name percent-encoded.
    """
    # if city unavailable in API, replace by coordinates
    if city in unavailable_cities:
        return unavailable_cities[city]

    # else, percent-encode the name. Spaces become "%20" as before; non-ASCII
    # and reserved characters (e.g. "&", "'") are now encoded as well, which
    # previously produced an invalid or ambiguous URL.
    return urllib.parse.quote(city)

Define function to get weather data of one city from a start to an end date as CSV file
* country: Row of the countries dataframe (loaded from our "countries.csv" file), i.e., with keys "iso2" and "capital"
* start_date and end_date: strings in YYYY-MM-DD format

In [4]:
def download_weather_data(country, start_date, end_date, query_key, filename, query_type='HISTORY'):
    """Request weather data for a country's capital and save the response to a file.

    Parameters
    ----------
    country : mapping (e.g. one row of the countries dataframe)
        Must provide the keys "capital" (used as query location) and "iso2"
        (used in the progress message).
    start_date, end_date : str
        Dates in YYYY-MM-DD format. Only used for history requests.
    query_key : str
        API key that is sent with the request.
    filename : str
        Path the response is written to.
    query_type : str, optional
        'FORECAST' requests forecast data; any other value (default 'HISTORY')
        requests history data for the given date range.

    Notes
    -----
    The response is written to `filename` as received, without validation.
    The cells that append to existing files inspect the downloaded file
    before using it.
    """
    ## Setting up the weather data input parameters
    # This is the core of our weather query URL
    base_url = 'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/'

    # Location and key parameters of the query
    location_param = '&location=' + format_city_string_for_request(country["capital"])
    key_param = '&key=' + query_key

    ## Construct requests to form a single URL
    # Set up the specific parameters based on the type of query
    if query_type == 'FORECAST':
        type_params = 'forecast?&aggregateHours=24&unitGroup=us&shortColumnNames=false'
    else:
        # History requests require a date range; both ends are sent as midnight timestamps
        date_params = '&startDateTime=' + start_date + 'T00:00:00&endDateTime=' + end_date + 'T00:00:00'
        type_params = 'history?&aggregateHours=24&unitGroup=us&dayStartTime=0:0:00&dayEndTime=0:0:00' + date_params

    # Build the entire query
    url = base_url + type_params + location_param + key_param

    ## Download as CSV file
    urllib.request.urlretrieve(url, filename)

    print(' - Weather data downloaded for: ', country["iso2"])

## For downloading additional data to existing data sets

### Newer data
Iterate through all countries in dataframe to download newer weather data and append to existing csv files

In [9]:
filename = 'weather_data\\temp.csv'
# Year of new data
year = "2021"
# End date for request in 'YYYY-MM-DD' format
end_date = year + '-07-07'
# Parsed once, outside the loop, for the date comparison below
end_date_dt = datetime.datetime.strptime(end_date, '%Y-%m-%d')

for index, row in df.iterrows():
    path = 'weather_data\\' + row['iso2'] + data_tags[year]

    # get latest date in current file (last row)
    df_country_weather = pd.read_csv(path)
    latest_date = df_country_weather['Date time'].iloc[-1]
    latest_date = datetime.datetime.strptime(latest_date, '%m/%d/%Y')

    # determine start date for new data
    start_date = latest_date + datetime.timedelta(days=1)
    start_date_str = start_date.strftime('%Y-%m-%d')

    # download weather data only for days that have not been downloaded yet
    # (<= so that a single missing day, i.e. start_date == end_date, is fetched
    # too, matching the cell that downloads older data)
    if start_date <= end_date_dt:

        download_weather_data(row, start_date_str, end_date, query_key_2, filename)

        # The downloaded file is only usable if it is not a message starting
        # with "You", has a 'Date time' column, has at least one row, and the
        # first date is a string (not a missing value).
        df_new_data = pd.read_csv(filename)
        new_data_is_valid = (
            not df_new_data.columns[0].startswith("You")
            and 'Date time' in df_new_data.columns
            and not df_new_data.empty
            and isinstance(df_new_data['Date time'].iloc[0], str)
        )

        # append new data to existing file
        if new_data_is_valid:
            # pd.concat instead of DataFrame.append, which pandas 2.0 removed
            df_country_weather = pd.concat([df_country_weather, df_new_data], ignore_index=True)
            df_country_weather.to_csv(path, index=False)
        else:
            print("!Data limit exceed for key!")

 - Fetching history for date:  2021-07-05 - 2021-07-07
 - Running query URL:  https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=24&unitGroup=us&dayStartTime=0:0:00&dayEndTime=0:0:00&startDateTime=2021-07-05T00:00:00&endDateTime=2021-07-07T00:00:00&location=Tirana&key=MTUW5B3YVNE4N4N6V3WVQX4X9
 - Weather data downloaded for:  AL

 - Fetching history for date:  2021-07-05 - 2021-07-07
 - Running query URL:  https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=24&unitGroup=us&dayStartTime=0:0:00&dayEndTime=0:0:00&startDateTime=2021-07-05T00:00:00&endDateTime=2021-07-07T00:00:00&location=Andorra%20la%20Vella&key=MTUW5B3YVNE4N4N6V3WVQX4X9
 - Weather data downloaded for:  AD

 - Fetching history for date:  2021-07-05 - 2021-07-07
 - Running query URL:  https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=24&unitGroup=

### Older data
Iterate through all countries in dataframe to download older weather data and append to existing csv files

In [35]:
filename = 'weather_data\\temp.csv'

# Year of new data
year = "2020"

# Start date for request in 'YYYY-MM-DD' format
start_date = year + '-05-16'
# Parsed once, outside the loop, for the date comparison below
start_date_dt = datetime.datetime.strptime(start_date, '%Y-%m-%d')

for index, row in df.iterrows():
    path = 'weather_data\\' + row['iso2'] + data_tags[year]

    # get oldest date in current file (first row)
    df_country_weather = pd.read_csv(path)
    oldest_date = df_country_weather['Date time'].iloc[0]
    oldest_date = datetime.datetime.strptime(oldest_date, '%m/%d/%Y')

    # download weather data only for days that have not been downloaded yet
    if oldest_date > start_date_dt:

        # determine end date for new data
        end_date = oldest_date - datetime.timedelta(days=1)
        end_date_str = end_date.strftime('%Y-%m-%d')

        download_weather_data(row, start_date, end_date_str, query_key_1, filename)

        # Same validation as in the cell that appends newer data: without it a
        # response that lacks a 'Date time' column or has no usable rows would
        # be merged into the data file.
        df_new_data = pd.read_csv(filename)
        new_data_is_valid = (
            not df_new_data.columns[0].startswith("You")
            and 'Date time' in df_new_data.columns
            and not df_new_data.empty
            and isinstance(df_new_data['Date time'].iloc[0], str)
        )

        # append new data at the top of existing file
        if new_data_is_valid:
            df_country_weather = pd.concat([df_new_data, df_country_weather], ignore_index=True)
            df_country_weather.to_csv(path, index=False)
        else:
            print("!Data limit exceed for key!")

 - Fetching history for date:  2020-05-16 - 2020-05-31
 - Running query URL:  https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=24&unitGroup=us&dayStartTime=0:0:00&dayEndTime=0:0:00&startDateTime=2020-05-16T00:00:00&endDateTime=2020-05-31T00:00:00&location=Tallinn&key=SVLNNSJNRK2D57FMK77EUUFY5
 - Weather data downloaded for:  EE

 - Fetching history for date:  2020-05-16 - 2020-05-31
 - Running query URL:  https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=24&unitGroup=us&dayStartTime=0:0:00&dayEndTime=0:0:00&startDateTime=2020-05-16T00:00:00&endDateTime=2020-05-31T00:00:00&location=Helsinki&key=SVLNNSJNRK2D57FMK77EUUFY5
 - Weather data downloaded for:  FI

 - Fetching history for date:  2020-05-16 - 2020-05-31
 - Running query URL:  https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=24&unitGroup=us&dayStart

## For downloading whole new data sets

In [11]:
# Download the full date range for every country. The target is the country's
# '<iso2>_weather_data.csv' file, so an existing file of that name is replaced.
start_date = '2021-06-01'
end_date = '2021-06-29'
for index, row in df.iterrows():
    # Alternative naming scheme: row["iso2"] + '_' + start_date + 'to' + end_date + '.csv'
    filename = 'weather_data\\' + row["iso2"] + '_weather_data.csv'
    # query_key_1 is passed explicitly: the notebook defines only query_key_1
    # and query_key_2, so the bare name `query_key` fails on a fresh kernel.
    download_weather_data(row, start_date, end_date, query_key_1, filename)

 - Fetching history for date:  2021-06-01 - 2021-06-29
 - Running query URL:  https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=24&unitGroup=us&dayStartTime=0:0:00&dayEndTime=0:0:00&startDateTime=2021-06-01T00:00:00&endDateTime=2021-06-29T00:00:00&location=Tirana&key=SVLNNSJNRK2D57FMK77EUUFY5
 - Weather data downloaded for:  AL

 - Fetching history for date:  2021-06-01 - 2021-06-29
 - Running query URL:  https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=24&unitGroup=us&dayStartTime=0:0:00&dayEndTime=0:0:00&startDateTime=2021-06-01T00:00:00&endDateTime=2021-06-29T00:00:00&location=Andorra%20la%20Vella&key=SVLNNSJNRK2D57FMK77EUUFY5
 - Weather data downloaded for:  AD

 - Fetching history for date:  2021-06-01 - 2021-06-29
 - Running query URL:  https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=24&unitGroup=

Only for specific countries:

In [7]:
# Download one date range for a selection of countries into separate files
# named '<iso2>_<start_date>to<end_date>.csv'.
countries = ["GB", "UA", "CH", "TR", "ES", "SE", "SI", "SK", "RS", "SM", "RO", "RU", "PL", "PT", "NO", "MK", "NL"]
start_date = '2020-06-01'
end_date = '2020-07-01'
filename_strings = ['_weather_data.csv']  # not used in this cell
# Filter the rows up front instead of testing every row inside the loop
for index, row in df[df["iso2"].isin(countries)].iterrows():
    filename = 'weather_data\\' + row["iso2"] + '_' + start_date + 'to' + end_date + '.csv'
    # query_key_1 is passed explicitly: the notebook defines only query_key_1
    # and query_key_2, so the bare name `query_key` fails on a fresh kernel.
    download_weather_data(row, start_date, end_date, query_key_1, filename)

 - Fetching history for date:  2020-06-01 - 2020-07-01
 - Running query URL:  https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=24&unitGroup=us&dayStartTime=0:0:00&dayEndTime=0:0:00&startDateTime=2020-06-01T00:00:00&endDateTime=2020-07-01T00:00:00&location=Amsterdam&key=SVLNNSJNRK2D57FMK77EUUFY5
 - Weather data downloaded for:  NL

 - Fetching history for date:  2020-06-01 - 2020-07-01
 - Running query URL:  https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=24&unitGroup=us&dayStartTime=0:0:00&dayEndTime=0:0:00&startDateTime=2020-06-01T00:00:00&endDateTime=2020-07-01T00:00:00&location=Skopje&key=SVLNNSJNRK2D57FMK77EUUFY5
 - Weather data downloaded for:  MK

 - Fetching history for date:  2020-06-01 - 2020-07-01
 - Running query URL:  https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=24&unitGroup=us&dayStart

## Old

In [58]:
# Disabled legacy snippet, kept as a bare string literal: none of the code
# inside the quotes is executed.
# NOTE(review): it would not run as written - download_weather_data is called
# with four arguments and no `filename`, and `end_date` has to be defined by
# another cell. It also uses DataFrame.append, which newer pandas versions no
# longer provide.
""" # Test for one location
row = df.loc[5]
path = 'weather_data\\' + row['iso2'] + '_weather_data.csv'
df_country_weather = pd.read_csv(path)
last_index = df_country_weather.index[-1]
latest_date = df_country_weather['Date time'].loc[last_index]
# reformat
start_date = latest_date[-4:] + '-' + latest_date[:2] + '-' + latest_date[3:5]
if datetime.datetime.strptime(start_date, '%Y-%m-%d') < datetime.datetime.strptime(end_date, '%Y-%m-%d'):                          
    download_weather_data(row, start_date, end_date, query_key)
    df_new_data = pd.read_csv('weather_data\\temp.csv')
    df_country_weather = df_country_weather.append(df_new_data, ignore_index=True)
    df_country_weather.to_csv(path, index=False)  
"""

 - Fetching history for date:  2021-06-22 - 2021-06-27
 - Running query URL:  https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?&aggregateHours=24&unitGroup=us&dayStartTime=0:0:00&dayEndTime=0:0:00&startDateTime=2021-06-22T00:00:00&endDateTime=2021-06-27T00:00:00&location=Minsk&key=SVLNNSJNRK2D57FMK77EUUFY5
 - Weather data downloaded for:  BY



In [16]:
# Rename every file in the data folder whose name ends in "1.csv" to
# "<first two characters of the old name>_weather_data_2020.csv"
import os
path = 'weather_data'
files = os.listdir(path)

for file in files:
    # leave all other files untouched
    if not file.endswith("1.csv"):
        continue
    new_filename = file[:2] + "_weather_data_2020.csv"
    old_location = os.path.join(path, file)
    new_location = os.path.join(path, new_filename)
    os.rename(old_location, new_location)

Drop first column of data csv if incorrect label.

In [29]:
# For each country's data file: if the first column is not "Address", remove
# that column and write the file back.
# NOTE: `year` is not set in this cell; it keeps the value assigned by the
# download cell that was run last.
for index, row in df.iterrows():
    path = 'weather_data\\' + row['iso2'] + data_tags[year]

    df_country_weather = pd.read_csv(path)
    first_column = df_country_weather.columns[0]

    # file already starts with the expected column: nothing to do
    if first_column == "Address":
        continue

    df_country_weather = df_country_weather.drop(columns=[first_column])
    df_country_weather.to_csv(path, index=False)