In [2]:
import pandas as pd
import numpy as np
import statsmodels as sm
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from dateutil.parser import parse
import time
import os
import pickle

<h1># 04 - Wetterdaten: Vorbereitung - Schritt 2 </h1>
<hr>
<p><b>Hinweis:</b> Dateipfade sind absolut angegeben und müssen entsprechend der eigenen Verzeichnisstruktur angepasst werden!</p>
<hr>
<p>Hier wird das Temperatur-Attribut auf mehrere Spalten verteilt, die Datentypen werden angepasst und eine Qualitätsprüfung der Temperatur wird durchgeführt.</p>
<hr>

In [3]:
# Configure how much of a DataFrame the notebook renders:
# up to 1000 rows and up to 1000 columns.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

In [4]:
# Pickle file locations.
# File name of the pickled result of preparation step 1 ...
pkl_file_wetter_step_1 = "daten_wetter_step_1.p"
# ... and the directory holding the working weather files
# (absolute path - adjust to the local directory layout).
path_to_working_weather_files = "/home/paul/python_projects/masterthesis/data/wetter/"

In [5]:
# Read the pickled step-1 weather data.
# The file is opened in a context manager so the handle is closed as soon as
# loading has finished (a bare open() call would leave it open).
# NOTE: pickle.load can execute arbitrary code while unpickling - only load
# files that come from a trusted source.
with open(path_to_working_weather_files + pkl_file_wetter_step_1, 'rb') as pkl_in:
    df_wetter_pkl = pickle.load(pkl_in)
df_wetter_pkl.head()

Unnamed: 0,Jahr,Ort,Datum,Datum_Startzeit_UTC,STATION,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,QUALITY_CONTROL,TMP,DATE_TIME
0,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,4,52.559686,13.287711,37.18,"TEGEL, GM",FM-15,V020,1301,2007-09-30 07:20:00
1,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,4,52.559686,13.287711,37.18,"TEGEL, GM",FM-15,V020,1401,2007-09-30 07:50:00
2,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,4,52.559686,13.287711,37.18,"TEGEL, GM",FM-15,V020,1401,2007-09-30 08:20:00
3,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,4,52.559686,13.287711,37.18,"TEGEL, GM",FM-15,V020,1401,2007-09-30 08:50:00
4,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,4,52.559686,13.287711,37.18,"TEGEL, GM",FM-15,V020,1501,2007-09-30 09:20:00


In [6]:
# Check the records: how many weather rows are available per location ('Ort')
# and year ('Jahr').
records_per_ort_jahr = df_wetter_pkl.groupby(['Ort', 'Jahr'], as_index=True)
records_per_ort_jahr.agg({'Ort': ['count']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Ort
Unnamed: 0_level_1,Unnamed: 1_level_1,count
Ort,Jahr,Unnamed: 2_level_2
Berlin,2007,12
Berlin,2008,12
Berlin,2009,12
Berlin,2010,12
Berlin,2011,12
Berlin,2012,12
Berlin,2013,12
Berlin,2014,12
Berlin,2015,12
Berlin,2016,12


In [7]:
# Remove the attributes that are not needed and continue with an independent copy.
unused_columns = ['SOURCE', 'LATITUDE', 'LONGITUDE', 'REPORT_TYPE', 'QUALITY_CONTROL']
df_wetter_1 = df_wetter_pkl.drop(columns=unused_columns).copy()
df_wetter_1.head()

Unnamed: 0,Jahr,Ort,Datum,Datum_Startzeit_UTC,STATION,ELEVATION,NAME,TMP,DATE_TIME
0,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1301,2007-09-30 07:20:00
1,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1401,2007-09-30 07:50:00
2,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1401,2007-09-30 08:20:00
3,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1401,2007-09-30 08:50:00
4,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1501,2007-09-30 09:20:00


In [8]:
# The TMP column bundles several pieces of information that have to be
# separated into individual columns. The target columns are created here as
# empty placeholders:
#   TMP_AT    - AIR-TEMPERATURE-OBSERVATION air temperature
#   TMP_AT_QC - AIR-TEMPERATURE-OBSERVATION air temperature quality code
for new_column in ('TMP_AT', 'TMP_AT_QC'):
    df_wetter_1[new_column] = None

df_wetter_1.head()

Unnamed: 0,Jahr,Ort,Datum,Datum_Startzeit_UTC,STATION,ELEVATION,NAME,TMP,DATE_TIME,TMP_AT,TMP_AT_QC
0,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1301,2007-09-30 07:20:00,,
1,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1401,2007-09-30 07:50:00,,
2,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1401,2007-09-30 08:20:00,,
3,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1401,2007-09-30 08:50:00,,
4,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1501,2007-09-30 09:20:00,,


In [9]:
# Fill the new columns from TMP, whose text is split at the commas:
# the first part goes to TMP_AT, the second part to TMP_AT_QC.
#
# This is done with one vectorised string split instead of an iterrows() loop
# with per-row .loc writes: it avoids the per-row Python overhead and does not
# depend on the index labels being unique (a label-based .loc write addresses
# every row that shares the label).
tmp_parts = df_wetter_1['TMP'].str.split(",")
df_wetter_1['TMP_AT'] = tmp_parts.str[0]
df_wetter_1['TMP_AT_QC'] = tmp_parts.str[1]

# A TMP value without a comma has no second part; fail loudly instead of
# silently carrying a missing quality code forward.
assert df_wetter_1['TMP_AT_QC'].notna().all(), "TMP value without quality code part"

df_wetter_1.head()

Unnamed: 0,Jahr,Ort,Datum,Datum_Startzeit_UTC,STATION,ELEVATION,NAME,TMP,DATE_TIME,TMP_AT,TMP_AT_QC
0,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1301,2007-09-30 07:20:00,130,1
1,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1401,2007-09-30 07:50:00,140,1
2,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1401,2007-09-30 08:20:00,140,1
3,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1401,2007-09-30 08:50:00,140,1
4,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",1501,2007-09-30 09:20:00,150,1


In [10]:
# Check the TMP data quality.
#
# Quality codes (see TMP_AT_QC):
# 0 = Passed gross limits check
# 1 = Passed all quality control checks
# 2 = Suspect
# 3 = Erroneous
# 4 = Passed gross limits check, data originate from an NCEI data source
# 5 = Passed all quality control checks, data originate from an NCEI data source
# 6 = Suspect, data originate from an NCEI data source
# 7 = Erroneous, data originate from an NCEI data source
# 9 = Passed gross limits check if element is present
# A = Data value flagged as suspect, but accepted as a good value
# C = Temperature and dew point received from Automated Weather Observing System (AWOS) are reported in
#     whole degrees Celsius. Automated QC flags these values, but they are accepted as valid.
# I = Data value not originally in data, but inserted by validator
# M = Manual changes made to value based on information provided by NWS or FAA
# P = Data value not originally flagged as suspect, but replaced by validator
# R = Data value replaced with value computed by NCEI software
# U = Data value replaced with edited value
#
# Value domain: a general domain comprised of the numeric characters (0-9),
# a plus sign (+), and a minus sign (-). +9999 = Missing.
#
# List every row whose temperature is the "missing" sentinel. The comparison is
# numeric so that the signed text form "+9999" is recognised; a string
# comparison against '99999' cannot match that sentinel and would always
# report "no missing values".
tmp_at_numeric = df_wetter_1['TMP_AT'].astype(float)
df_wetter_1.loc[tmp_at_numeric == 9999]


Unnamed: 0,Jahr,Ort,Datum,Datum_Startzeit_UTC,STATION,ELEVATION,NAME,TMP,DATE_TIME,TMP_AT,TMP_AT_QC


In [11]:
# Number of rows per temperature quality code.
qc_groups = df_wetter_1.groupby(['TMP_AT_QC'])
qc_groups.agg({'TMP_AT_QC': 'count'})

Unnamed: 0_level_0,TMP_AT_QC
TMP_AT_QC,Unnamed: 1_level_1
1,516
5,102


In [12]:
# Copy the frame for the next stage and leave out the TMP column, which is no
# longer needed now that its parts live in TMP_AT / TMP_AT_QC.
df_wetter_2 = df_wetter_1.drop(['TMP'], axis='columns').copy()
df_wetter_2.head()

Unnamed: 0,Jahr,Ort,Datum,Datum_Startzeit_UTC,STATION,ELEVATION,NAME,DATE_TIME,TMP_AT,TMP_AT_QC
0,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",2007-09-30 07:20:00,130,1
1,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",2007-09-30 07:50:00,140,1
2,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",2007-09-30 08:20:00,140,1
3,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",2007-09-30 08:50:00,140,1
4,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",2007-09-30 09:20:00,150,1


In [13]:
# Set the data types, column by column (TMP_AT, ELEVATION, then Jahr).
target_dtypes = {'TMP_AT': float, 'ELEVATION': float, 'Jahr': int}
for column_name, column_dtype in target_dtypes.items():
    df_wetter_2[column_name] = df_wetter_2[column_name].astype(column_dtype)

In [14]:
# Check: first 30 rows for Tokyo in 2008.
is_tokyo = df_wetter_2.Ort == 'Tokyo'
is_2008 = df_wetter_2.Jahr == 2008
df_wetter_2.loc[is_tokyo & is_2008].head(30)

Unnamed: 0,Jahr,Ort,Datum,Datum_Startzeit_UTC,STATION,ELEVATION,NAME,DATE_TIME,TMP_AT,TMP_AT_QC
427,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 00:00:00,40.0,1
428,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 00:30:00,50.0,1
429,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 01:00:00,50.0,1
430,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 01:30:00,60.0,1
431,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 02:00:00,50.0,1
432,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 02:30:00,60.0,1
433,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 02:42:00,70.0,1
434,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 03:00:00,60.0,1
435,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47662099999,36.0,"TOKYO, JA",2008-02-17 00:00:00,36.0,1
436,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47662099999,36.0,"TOKYO, JA",2008-02-17 01:00:00,48.0,1


In [15]:
# Adjust the scaling according to the documentation: TMP_AT is divided by 10.
#
# The scaled column is derived from the unscaled values kept in df_wetter_1
# rather than from df_wetter_2['TMP_AT'] itself, so re-running this cell yields
# the same result instead of dividing by 10 a second time.
tmp_at_unscaled = df_wetter_1['TMP_AT'].astype(float)

# 9999 is the "missing" sentinel of the source format; turn it into NaN so it
# does not end up as a bogus temperature of 999.9.
df_wetter_2['TMP_AT'] = tmp_at_unscaled.mask(tmp_at_unscaled == 9999) / 10


In [16]:
# Check: first 30 rows for Tokyo in 2008.
tokyo_2008 = df_wetter_2['Ort'].eq('Tokyo') & df_wetter_2['Jahr'].eq(2008)
df_wetter_2[tokyo_2008].head(30)

Unnamed: 0,Jahr,Ort,Datum,Datum_Startzeit_UTC,STATION,ELEVATION,NAME,DATE_TIME,TMP_AT,TMP_AT_QC
427,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 00:00:00,4.0,1
428,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 00:30:00,5.0,1
429,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 01:00:00,5.0,1
430,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 01:30:00,6.0,1
431,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 02:00:00,5.0,1
432,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 02:30:00,6.0,1
433,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 02:42:00,7.0,1
434,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47671099999,10.66,"TOKYO INTERNATIONAL, JA",2008-02-17 03:00:00,6.0,1
435,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47662099999,36.0,"TOKYO, JA",2008-02-17 00:00:00,3.6,1
436,2008,Tokyo,2008-02-17,2008-02-17 00:10:00,47662099999,36.0,"TOKYO, JA",2008-02-17 01:00:00,4.8,1


In [17]:
# Copy the frame once more; df_wetter_3 is an independent (deep) copy.
df_wetter_3 = df_wetter_2.copy(deep=True)
df_wetter_3.head()

Unnamed: 0,Jahr,Ort,Datum,Datum_Startzeit_UTC,STATION,ELEVATION,NAME,DATE_TIME,TMP_AT,TMP_AT_QC
0,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",2007-09-30 07:20:00,13.0,1
1,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",2007-09-30 07:50:00,14.0,1
2,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",2007-09-30 08:20:00,14.0,1
3,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",2007-09-30 08:50:00,14.0,1
4,2007,Berlin,2007-09-30,2007-09-30 07:15:00,10382099999,37.18,"TEGEL, GM",2007-09-30 09:20:00,15.0,1


In [18]:
# Check: all rows recorded for Chicago.
chicago_rows = df_wetter_3['Ort'] == 'Chicago'
df_wetter_3.loc[chicago_rows]

Unnamed: 0,Jahr,Ort,Datum,Datum_Startzeit_UTC,STATION,ELEVATION,NAME,DATE_TIME,TMP_AT,TMP_AT_QC
156,2007,Chicago,2007-10-07,2007-10-07 12:30:00,72534014819,186.5,"CHICAGO MIDWAY AIRPORT, IL US",2007-10-07 12:51:00,22.2,5
157,2007,Chicago,2007-10-07,2007-10-07 12:30:00,72534014819,186.5,"CHICAGO MIDWAY AIRPORT, IL US",2007-10-07 13:51:00,23.9,5
158,2007,Chicago,2007-10-07,2007-10-07 12:30:00,72534014819,186.5,"CHICAGO MIDWAY AIRPORT, IL US",2007-10-07 14:51:00,26.1,5
159,2007,Chicago,2007-10-07,2007-10-07 12:30:00,72534014819,186.5,"CHICAGO MIDWAY AIRPORT, IL US",2007-10-07 15:00:00,26.1,1
160,2008,Chicago,2008-10-12,2008-10-12 12:30:00,72534014819,186.5,"CHICAGO MIDWAY AIRPORT, IL US",2008-10-12 12:51:00,17.8,5
161,2008,Chicago,2008-10-12,2008-10-12 12:30:00,72534014819,186.5,"CHICAGO MIDWAY AIRPORT, IL US",2008-10-12 13:51:00,18.9,5
162,2008,Chicago,2008-10-12,2008-10-12 12:30:00,72534014819,186.5,"CHICAGO MIDWAY AIRPORT, IL US",2008-10-12 14:51:00,21.1,5
163,2008,Chicago,2008-10-12,2008-10-12 12:30:00,72534014819,186.5,"CHICAGO MIDWAY AIRPORT, IL US",2008-10-12 15:00:00,21.1,1
164,2009,Chicago,2009-10-11,2009-10-11 12:30:00,72534014819,186.5,"CHICAGO MIDWAY AIRPORT, IL US",2009-10-11 12:51:00,0.0,5
165,2009,Chicago,2009-10-11,2009-10-11 12:30:00,72534014819,186.5,"CHICAGO MIDWAY AIRPORT, IL US",2009-10-11 13:51:00,1.1,5


In [19]:
# Serialise the DataFrame (pickle) as the step-2 result.
# The context manager guarantees that the file is flushed and closed once the
# dump has finished; a bare open() call leaves that to garbage collection.
with open(path_to_working_weather_files + 'daten_wetter_step_2.p', "wb") as pkl_out:
    pickle.dump(df_wetter_3, pkl_out)