In [7]:
import pandas as pd
import os
from urllib import request
import time

<h1>01 - Wetterdaten: Herunterladen der Dateien für die definierten Station-IDs</h1>
<hr>
<p><b>Hinweis:</b> Dateipfade sind absolut angegeben und müssen entsprechend der eigenen Verzeichnisstruktur angepasst werden!</p>
<hr>

In [8]:
# Weather station IDs for each city (looked up manually)
# https://www.ncei.noaa.gov/maps/hourly/

stations_id_berlin = ["10382099999", "10385099999"]

stations_id_tokyo = ["47671099999", "47662099999", "47687099999"]

stations_id_london = ["03768399999"]

stations_id_newyork = ["72505394728", "72502014734"]

stations_id_chicago = ["72534014819"]

In [9]:
# Download the individual files: one file per year and station ID
def download_file(src, tgt, year, station_ids, delay=5):
    """Download one CSV file per station ID for the given year.

    Every file is fetched from ``<src>/<year>/<station_id>.csv`` and stored
    as ``<tgt>/<station_id>_<year>.csv``. Stations for which the server
    answers with an HTTP error are reported on stdout and skipped.

    Parameters
    ----------
    src : str
        Base URL of the data source; a trailing slash is optional.
    tgt : str
        Local target directory. It is created if it does not exist yet.
        An empty string means the current working directory.
    year : int or str
        Year used both in the source URL and in the local file name.
    station_ids : iterable of str, or str
        Station IDs to download; a single ID may be passed as a plain string.
    delay : float, optional
        Seconds to wait after every request so the server is not hit too
        frequently. Defaults to 5.

    Returns
    -------
    None
    """
    # Local import: HTTPError officially lives in urllib.error.
    from urllib.error import HTTPError

    # A bare string would otherwise be iterated character by character.
    if isinstance(station_ids, str):
        station_ids = [station_ids]

    # Source path including the year; tolerate a missing trailing slash.
    year_url = src.rstrip('/') + '/' + str(year) + '/'

    # Make sure the target folder exists before the first download.
    if tgt:
        os.makedirs(tgt, exist_ok=True)

    # Iterate over the stations and download their data
    for station_id in station_ids:
        full_url = year_url + station_id + '.csv'  # source URL
        full_local_file = os.path.join(tgt, station_id + '_' + str(year) + '.csv')  # target file
        try:
            request.urlretrieve(full_url, full_local_file)  # download the file
        except HTTPError:
            # File is not available on the server: skip it
            print('Nicht gefunden: ' + full_url)
        time.sleep(delay)  # throttle to reduce the request rate on the server

In [10]:
# Define the years as well as the source URL and the local target path
years = range(2007, 2019 + 1)  # 2007 through 2019 inclusive
main_url = "https://www.ncei.noaa.gov/data/global-hourly/access/"
download_path = "/home/paul/python_projects/masterthesis/data/wetter/downloaded/"

In [11]:
# Download data: Berlin (one call per year, covering all Berlin stations)
for year in years:
    download_file(
        src=main_url,
        tgt=download_path,
        year=year,
        station_ids=stations_id_berlin,
    )


In [12]:
# Download data: Tokyo (one call per year, covering all Tokyo stations)
for year in years:
    download_file(
        src=main_url,
        tgt=download_path,
        year=year,
        station_ids=stations_id_tokyo,
    )


In [13]:
# Download data: London (one call per year, covering all London stations)
for year in years:
    download_file(
        src=main_url,
        tgt=download_path,
        year=year,
        station_ids=stations_id_london,
    )


In [14]:
# Download data: New York (one call per year, covering all New York stations)
for year in years:
    download_file(
        src=main_url,
        tgt=download_path,
        year=year,
        station_ids=stations_id_newyork,
    )


In [16]:
# Download data: Chicago (one call per year, covering all Chicago stations)
for year in years:
    download_file(
        src=main_url,
        tgt=download_path,
        year=year,
        station_ids=stations_id_chicago,
    )


In [17]:
# Determine the number of records across the downloaded CSV files
dir_list = os.listdir(download_path)
print(dir_list)
total_lines = 0
total_lines_over_all = 0
for file in dir_list:
    file_path = os.path.join(download_path, file)
    # Only the downloaded station files hold records: skip other entries in
    # the folder (e.g. README.txt) and anything that is not a regular file.
    if not (file.lower().endswith('.csv') and os.path.isfile(file_path)):
        continue
    # Explicit encoding keeps the count independent of the system locale;
    # errors='replace' stops a stray undecodable byte from aborting the count.
    with open(file_path, encoding='utf-8', errors='replace') as myfile:
        total_lines = sum(1 for line in myfile)
    total_lines_over_all = total_lines_over_all + total_lines

print(total_lines_over_all) # includes the header line of every file

['47671099999_2012.csv', '10382099999_2011.csv', '72502014734_2010.csv', '47671099999_2009.csv', '47687099999_2013.csv', '10385099999_2014.csv', 'README.txt', '47662099999_2011.csv', '10385099999_2017.csv', '47687099999_2008.csv', '72502014734_2015.csv', '03768399999_2010.csv', '03768399999_2008.csv', '47687099999_2007.csv', '47687099999_2012.csv', '72534014819_2008.csv', '47671099999_2008.csv', '03768399999_2014.csv', '10385099999_2007.csv', '10382099999_2015.csv', '03768399999_2012.csv', '72534014819_2007.csv', '72505394728_2015.csv', '47662099999_2017.csv', '72502014734_2012.csv', '72534014819_2011.csv', '47662099999_2010.csv', '03768399999_2018.csv', '47687099999_2018.csv', '47687099999_2019.csv', '10385099999_2010.csv', '72534014819_2013.csv', '47687099999_2017.csv', '03768399999_2016.csv', '10385099999_2016.csv', '72534014819_2010.csv', '10382099999_2012.csv', '72505394728_2016.csv', '10382099999_2013.csv', '47662099999_2015.csv', '10385099999_2018.csv', '72534014819_2016.csv', '