## Load and unzip weather data

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pandas import read_csv

import features_engineering as fe
from data_loader import load_weather_data



In [2]:
# Load the base weather observations through the project's data_loader helper.
weather_data_raw = load_weather_data()

In [3]:
# Column dtypes and non-null counts of the freshly loaded data.
weather_data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5175 entries, 0 to 5174
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   timestamp    5175 non-null   object 
 1   temp         5175 non-null   float64
 2   windspeed    4036 non-null   float64
 3   humidity     5175 non-null   float64
 4   precip       5175 non-null   float64
 5   pressure     5041 non-null   float64
 6   conditions   5175 non-null   object 
 7   dailyprecip  5175 non-null   object 
 8   dailysnow    5175 non-null   object 
 9   fog          5175 non-null   int64  
 10  rain         5175 non-null   int64  
 11  snow         5175 non-null   int64  
dtypes: float64(5), int64(3), object(4)
memory usage: 485.3+ KB


In [None]:
# Read the additional weather observations that are appended to the base data below.
weather_data_new_rows = pd.read_csv('gitData/weather_data_new_rows.csv')

In [None]:
# Column dtypes and non-null counts of the additional rows.
weather_data_new_rows.info()

In [None]:
# Every new row gets fog = 0 (presumably the new file carries no fog
# information - TODO confirm). The scalar assignment already yields an integer
# column, so no extra cast is needed.
weather_data_new_rows['fog'] = 0

# Derive the rain / snow flags from the free-text condition label
# (case-insensitive substring match). na=False treats a missing condition as
# "no match"; without it the match returns NaN and the integer cast raises.
condition_text = weather_data_new_rows['conditions']
weather_data_new_rows['rain'] = (
  condition_text.str.contains('Rain', case=False, na=False).astype(int)
)
weather_data_new_rows['snow'] = (
  condition_text.str.contains('Snow', case=False, na=False).astype(int)
)

In [None]:
# The daily totals may contain the marker 'T' (presumably "trace" - TODO
# confirm); map it to a small positive number so the columns can be cast to float.
for daily_col in ['dailyprecip', 'dailysnow']:
  weather_data_new_rows[daily_col] = (
    weather_data_new_rows[daily_col].replace('T', '0.001').astype(float)
  )

# Make sure the binary flags contain no gaps and are stored as integers.
for col in ['fog', 'rain', 'snow']:
  weather_data_new_rows[col] = weather_data_new_rows[col].fillna(0).astype(int)

# Merge weather data.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the appended rows reuse labels that already exist in the base frame, which
# makes label-based access and index-aligned assignment ambiguous.
# NOTE: this cell rebinds weather_data_raw, so running it twice appends the new
# rows twice - re-run from the load cell instead.
weather_data_raw = pd.concat([weather_data_raw, weather_data_new_rows], ignore_index=True)
weather_data_raw = fe.add_weather_time_features(weather_data_raw)

# EDA  weather_data_raw

## General EDA

In [None]:
# Column dtypes and non-null counts after merging and adding the time features.
weather_data_raw.info()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
weather_data_raw.duplicated().sum()

- timestamp into datetime
- temp into celsius
- windspeed into kph
- precip into mm
- pressure into hpa
- dailyprecip into mm
- dailysnow into mm

In [None]:
# Summary statistics of the numeric columns.
weather_data_raw.describe()

In [None]:
# Summary (count, unique, top, freq) of the object-typed columns.
weather_data_raw.describe(include='object')

- timestamp has one duplicated value

In [None]:
# Number of distinct values per column.
weather_data_raw.nunique()

## EDA timestamp (datetime)

In [None]:
# Verify that every hourly timestamp renders as 'YYYY-MM-DD HH:MM:SS'.
timestamp_pattern = r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$'
hour_as_text = weather_data_raw['datetime_hour'].astype(str)
hour_as_text.str.match(timestamp_pattern).all()

In [None]:
# How many observations fall into each hourly bucket, in chronological order.
counts_per_hour = weather_data_raw['datetime_hour'].value_counts().sort_index()

# Line plot of the observation frequency over time.
fig, ax = plt.subplots(figsize=(12, 4))
counts_per_hour.plot(ax=ax)
ax.set_title("Weather Observations per Hour")
ax.set_xlabel("Time")
ax.set_ylabel("Observations per Hour")
ax.grid(True)
plt.show()

- The timestamps are predominantly distributed hourly, with slight fluctuations.
- There are some dips in frequency; potentially missing hours.
- The data does not appear to be exactly on the hour (e.g. xx:51).

In [None]:
# Minute-of-hour at which each observation was taken.
weather_data_raw['datetime_minute'] = weather_data_raw['datetime'].dt.minute

# Observation count per minute of the hour, ordered by minute.
minute_distribution = weather_data_raw['datetime_minute'].value_counts().sort_index()

# Bar chart of the measurement minutes.
fig, ax = plt.subplots(figsize=(10, 4))
minute_distribution.plot.bar(ax=ax)
ax.set_title("Distribution of Measurement Minutes")
ax.set_xlabel("Minute of the Hour")
ax.set_ylabel("Number of Observations")
ax.grid(True)
fig.tight_layout()
plt.show()

- Minute 51 is by far the most frequent minute of measurement (over 4000 entries).
- All other minutes are rare or sporadic → probably exceptions, manual additions or other sources.

In [None]:
# Keep only the observations taken at minute 51 (independent copy).
is_minute_51 = weather_data_raw['datetime_minute'] == 51
weather_51 = weather_data_raw.loc[is_minute_51].copy()

# Number of timestamps that repeat an earlier one within the filtered data.
duplicate_timestamps = weather_51['datetime'].duplicated().sum()

# All rows that share their timestamp with at least one other row.
shares_timestamp = weather_51['datetime'].duplicated(keep=False)
duplicated_rows = weather_51[shares_timestamp]

duplicate_timestamps, duplicated_rows.sort_values('datetime')

In [None]:
def most_frequent_condition(values):
  """Return the most frequent entry of `values`, or its first entry if no mode exists."""
  modes = values.mode()
  return values.iloc[0] if modes.empty else modes.iloc[0]


# One aggregation rule per column: mean for the continuous measurements, sum for
# the hourly precipitation, first value for the daily totals, max for the binary
# flags and the most frequent label for the condition text.
hourly_aggregations = {
  'temp': 'mean',
  'windspeed': 'mean',
  'humidity': 'mean',
  'precip': 'sum',
  'pressure': 'mean',
  'dailyprecip': 'first',
  'dailysnow': 'first',
  'fog': 'max',
  'rain': 'max',
  'snow': 'max',
  'conditions': most_frequent_condition,
}

# Collapse the minute-51 observations to one row per hour.
hourly_groups = weather_51.groupby('datetime_hour')
weather_data_raw = hourly_groups.agg(hourly_aggregations).reset_index()

In [None]:
# Standardizes units of measurement & classifies all weather phenomena
# (per the original note; the helper lives in features_engineering and is not
# visible here). Later cells rely on columns such as temp_c, windspeed_kph and
# rain_class being present after this call.
weather_data_raw = fe.add_weather_features(weather_data_raw)


- Exactly one timestamp is duplicated: 2016-03-13 00:51:00. It has two different entries (Clear vs. Light
Rain), probably from two weather sources or measurements.

## EDA temp

In [None]:
# Summary statistics of the temperature column.
weather_data_raw['temp_c'].describe()

In [None]:
# Histogram of the temperature values (40 bins).
fig, ax = plt.subplots(figsize=(8, 4))
weather_data_raw['temp_c'].plot.hist(ax=ax, bins=40, edgecolor='black')
ax.set_title('Distribution of Temperature (°C)')
ax.set_xlabel("Temperature [°C]")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Boxplot of the non-missing temperature values.
fig, ax = plt.subplots(figsize=(4, 6))
ax.boxplot(weather_data_raw["temp_c"].dropna())
ax.set_title("Boxplot of temp_c")
ax.set_xlabel("temp_c")
fig.tight_layout()
plt.show()

## EDA Windspeed

In [None]:
# Summary statistics of the wind speed column.
weather_data_raw['windspeed_kph'].describe()

In [None]:
# Number of hours without a wind speed value.
weather_data_raw['windspeed_kph'].isna().sum()

In [None]:
# Histogram of the wind speed values (40 bins).
fig, ax = plt.subplots(figsize=(8, 4))
weather_data_raw['windspeed_kph'].plot.hist(ax=ax, bins=40, edgecolor='black')
ax.set_title("Distribution of Wind Speed (km/h)")
ax.set_xlabel("Wind Speed [km/h]")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Boxplot of the non-missing wind speed values.
fig, ax = plt.subplots(figsize=(4, 6))
ax.boxplot(weather_data_raw["windspeed_kph"].dropna())
ax.set_title("Boxplot of windspeed_kph")
ax.set_xlabel("windspeed_kph")
fig.tight_layout()
plt.show()

- Most values lie between 5 and 25 km/h, which is typical for city-level weather.
- One extremely high value (137.12 km/h) may be an outlier.
- The variable appears well-behaved and usable without further transformation.

## EDA humidity

In [None]:
# Summary statistics of the humidity column.
weather_data_raw['humidity'].describe()

In [None]:
# Histogram of the humidity values (40 bins).
fig, ax = plt.subplots(figsize=(8, 4))
weather_data_raw['humidity'].plot.hist(ax=ax, bins=40, edgecolor='black')
ax.set_title("Distribution of Humidity (%)")
ax.set_xlabel("Humidity [%]")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Boxplot of the non-missing humidity values.
fig, ax = plt.subplots(figsize=(4, 6))
ax.boxplot(weather_data_raw["humidity"].dropna())
ax.set_title("Boxplot of humidity")
ax.set_xlabel("humidity")
fig.tight_layout()
plt.show()

- Values range from 9% to 97%.
- Most values lie between 40% and 70%.
- No anomalies or unrealistic entries.

## EDA precip

In [None]:
# Summary statistics of the hourly precipitation column.
weather_data_raw['precip_mm'].describe()

In [None]:
# Histogram of the hourly precipitation values (40 bins).
fig, ax = plt.subplots(figsize=(8, 4))
weather_data_raw['precip_mm'].plot.hist(ax=ax, bins=40, edgecolor='black')
ax.set_title("Distribution of Hourly Precipitation (mm)")
ax.set_xlabel("Hourly Precipitation [mm]")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Boxplot of the non-missing hourly precipitation values.
fig, ax = plt.subplots(figsize=(4, 6))
ax.boxplot(weather_data_raw["precip_mm"].dropna())
ax.set_title("Boxplot of precip_mm")
ax.set_xlabel("precip_mm")
fig.tight_layout()
plt.show()

Findings of the Exploration
- Most values are 0.0 (dry hours).
- Occasional rain events up to 18.5 mm.

Required Arrangements
- None. Consider binary flag creation (e.g. had_rain = precip_mm > 0).

## EDA pressure

In [None]:
# Summary statistics of the pressure column.
weather_data_raw['pressure_hPa'].describe()

In [None]:
# Number of hours without a pressure value.
weather_data_raw['pressure_hPa'].isna().sum()

In [None]:
# Histogram of the pressure values (40 bins).
fig, ax = plt.subplots(figsize=(8, 4))
weather_data_raw['pressure_hPa'].plot.hist(ax=ax, bins=40, edgecolor='black')
ax.set_title("Distribution of pressure_hPa")
ax.set_xlabel("pressure_hPa")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Boxplot of the non-missing pressure values.
fig, ax = plt.subplots(figsize=(4, 6))
ax.boxplot(weather_data_raw["pressure_hPa"].dropna())
ax.set_title("Boxplot of pressure_hPa")
ax.set_xlabel("pressure_hPa")
fig.tight_layout()
plt.show()

## EDA dailyprecip

In [None]:
# Frequency of each distinct daily precipitation entry.
weather_data_raw['dailyprecip'].value_counts()

In [None]:
# Replace 'T' with a minimal value.
# NOTE(review): the cells below analyze precip_daily_mm rather than dailyprecip -
# confirm whether this conversion is still needed after fe.add_weather_features.
weather_data_raw['dailyprecip'] = weather_data_raw['dailyprecip'].replace('T', '0.001')

In [None]:
# Store the daily precipitation entries as floats.
weather_data_raw['dailyprecip'] = weather_data_raw['dailyprecip'].astype(float)

In [None]:
# Number of rows without a daily precipitation value.
weather_data_raw['precip_daily_mm'].isna().sum()

In [None]:
# Summary statistics of the daily precipitation column.
weather_data_raw['precip_daily_mm'].describe()

In [None]:
# Histogram of the daily precipitation values (40 bins).
fig, ax = plt.subplots(figsize=(8, 4))
weather_data_raw['precip_daily_mm'].plot.hist(ax=ax, bins=40, edgecolor='black')
ax.set_title("Distribution of precip_daily_mm")
ax.set_xlabel("precip_daily_mm")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Boxplot of the non-missing daily precipitation values.
fig, ax = plt.subplots(figsize=(4, 6))
ax.boxplot(weather_data_raw["precip_daily_mm"].dropna())
ax.set_title("Boxplot of precip_daily_mm")
ax.set_xlabel("precip_daily_mm")
fig.tight_layout()
plt.show()

- ...

## EDA dailysnow

In [None]:
# Frequency of each distinct daily snow entry.
weather_data_raw['dailysnow'].value_counts()

In [None]:
# Replace the 'T' marker with a minimal value, then store the column as floats.
weather_data_raw['dailysnow'] = (
  weather_data_raw['dailysnow'].replace('T', '0.001').astype(float)
)

In [None]:

# Summary statistics of the daily snow column.
weather_data_raw['daily_snow_mm'].describe()

In [None]:
# Number of rows without a daily snow value.
weather_data_raw['daily_snow_mm'].isna().sum()

In [None]:
# Histogram of the daily snow values (40 bins).
fig, ax = plt.subplots(figsize=(8, 4))
weather_data_raw['daily_snow_mm'].plot.hist(ax=ax, bins=40, edgecolor='black')
ax.set_title("Distribution of daily_snow_mm")
ax.set_xlabel("daily_snow_mm")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Boxplot of the non-missing daily snow values.
fig, ax = plt.subplots(figsize=(4, 6))
ax.boxplot(weather_data_raw["daily_snow_mm"].dropna())
ax.set_title("Boxplot of daily_snow_mm")
ax.set_xlabel("daily_snow_mm")
fig.tight_layout()
plt.show()

## EDA fog, rain, snow

In [None]:
# Frequency of each fog flag value.
weather_data_raw['fog'].value_counts()

In [None]:
# Frequency of each rain flag value.
weather_data_raw['rain'].value_counts()

In [None]:
# Frequency of each snow flag value.
weather_data_raw['snow'].value_counts()

## EDA Conditions

In [None]:
# Frequency of each weather condition label, most common first.
condition_counts = weather_data_raw['conditions'].value_counts()

fig, ax = plt.subplots(figsize=(10, 4))
condition_counts.plot.bar(ax=ax)
ax.set_title("Distribution of Weather Conditions")
ax.set_xlabel("Condition")
ax.set_ylabel("Frequency")
# Slanted, right-aligned tick labels keep the condition names readable.
plt.xticks(rotation=45, ha='right')
ax.grid(True)
fig.tight_layout()
plt.show()

- Most common condition is Clear (2215 times), followed by Overcast (1000).
- Some conditions occur only a few times, e.g., Heavy Snow, Light Freezing Fog.
- Unknown appears 83 times and may indicate missing or invalid sensor data.

# Classify / EDA  Weather Conditions

## Classify / EDA –  rain intensity

In [None]:
# Rain classes in the order they are shown on the x-axis.
rain_class_order = [
  'no_rain', 'light_rain', 'moderate_rain', 'heavy_rain', 'very_heavy_rain', 'extreme_rain'
]
rain_counts = weather_data_raw['rain_class'].value_counts().reindex(rain_class_order)

ax = rain_counts.plot.bar(edgecolor='black', figsize=(8, 4))
ax.set_title("Frequency of Rain Intensity Classes")
ax.set_xlabel("Rain Class")
ax.set_ylabel('number_of_hours')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Rain class code indexed by hour, drawn as a step line.
rain_time_series = weather_data_raw.set_index('datetime_hour')['rain_code']

fig, ax = plt.subplots(figsize=(12, 4))
rain_time_series.plot(ax=ax, drawstyle='steps-post')
ax.set_title("Rain Intensity Over Time")
ax.set_ylabel("Rain Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Humidity distribution within each rain class.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=weather_data_raw, x='rain_class', y='humidity', ax=ax)
ax.set_title("Humidity Distribution by Rain Class")
ax.set_xlabel("Rain Class")
ax.set_ylabel("Humidity (%)")
plt.xticks(rotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA – snow intensity

In [None]:
# Snow classes in the order they are shown on the x-axis.
snow_class_order = ['no_snow', 'light_snow', "snow", 'heavy_snow']
snow_counts = weather_data_raw['snow_class'].value_counts().reindex(snow_class_order)

ax = snow_counts.plot.bar(edgecolor='black')
ax.set_title("Frequency of Snow Intensity Classes")
ax.set_xlabel("Snow Class")
ax.set_ylabel('number_of_hours')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Snow class code indexed by hour, drawn as a step line.
snow_time_series = weather_data_raw.set_index('datetime_hour')['snow_code']

fig, ax = plt.subplots(figsize=(12, 4))
snow_time_series.plot(ax=ax, drawstyle='steps-post')
ax.set_title("Snow Intensity Over Time")
ax.set_ylabel("Snow Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA –  cloud intensity

In [None]:
# Cloud classes in the order they are shown on the x-axis.
cloud_class_order = [
  "clear", 'scattered_clouds', 'partly_cloudy', 'mostly_cloudy', "overcast", "unknown"
]
cloud_counts = weather_data_raw['cloud_class'].value_counts().reindex(cloud_class_order)

ax = cloud_counts.plot.bar(edgecolor='black', figsize=(8, 4))
ax.set_title("Frequency of Cloud Intensity Classes")
ax.set_xlabel("Cloud Class")
ax.set_ylabel('number_of_hours')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Cloud class code indexed by hour, drawn as a step line.
cloud_time_series = weather_data_raw.set_index('datetime_hour')['cloud_code']

fig, ax = plt.subplots(figsize=(12, 4))
cloud_time_series.plot(ax=ax, drawstyle='steps-post')
ax.set_title("Cloud Intensity Over Time")
ax.set_ylabel("Cloud Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Humidity distribution within each cloud class.
# Finished like the other boxplot cells in this notebook: explicit figure size,
# axis labels, grid, and plt.show() so the cell renders only the figure instead
# of also echoing the Text object returned by plt.title.
plt.figure(figsize=(8, 4))
sns.boxplot(data=weather_data_raw, x='cloud_class', y='humidity')
plt.title("Humidity by Cloud Class")
plt.xlabel("Cloud Class")
plt.ylabel("Humidity (%)")
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

## Classify / EDA –  haze intensity

In [None]:
# Haze classes in the order they are shown on the x-axis.
hazy_class_order = ['no_haze', "haze"]
hazy_counts = weather_data_raw['hazy_class'].value_counts().reindex(hazy_class_order)

ax = hazy_counts.plot.bar(edgecolor='black', figsize=(6, 4))
ax.set_title("Frequency of Haze")
ax.set_xlabel("Hazy Class")
ax.set_ylabel('number_of_hours')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Haze class code indexed by hour, drawn as a step line.
hazy_time_series = weather_data_raw.set_index('datetime_hour')['hazy_code']

fig, ax = plt.subplots(figsize=(12, 4))
hazy_time_series.plot(ax=ax, drawstyle='steps-post')
ax.set_title("Haze Intensity Over Time")
ax.set_ylabel("Hazy Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA –  freezing fog or rain

In [None]:
# Freezing classes in the order they are shown on the x-axis.
freezing_class_order = ["none", 'light_freezing_rain', 'light_freezing_fog']
freezing_counts = (
  weather_data_raw['freezing_class'].value_counts().reindex(freezing_class_order)
)

ax = freezing_counts.plot.bar(edgecolor='black', figsize=(8, 4))
ax.set_title("Frequency of Freezing Fog or Rain")
ax.set_xlabel("Freezing Class")
ax.set_ylabel('number_of_hours')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Freezing class code indexed by hour, drawn as a step line.
freezing_time_series = weather_data_raw.set_index('datetime_hour')['freezing_code']

fig, ax = plt.subplots(figsize=(12, 4))
freezing_time_series.plot(ax=ax, drawstyle='steps-post')
ax.set_title("Freezing Fog or Rain Intensity Over Time")
ax.set_ylabel("Freezing Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA –  fog

In [None]:
# Fog classes in the order they are shown on the x-axis.
fog_class_order = ["no_fog", "fog"]
fog_counts = weather_data_raw['fog_class'].value_counts().reindex(fog_class_order)

ax = fog_counts.plot.bar(edgecolor='black', figsize=(6, 4))
ax.set_title("Frequency of Fog Classes")
ax.set_xlabel("Fog Class")
ax.set_ylabel('number_of_hours')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Fog class code indexed by hour, drawn as a step line.
fog_time_series = weather_data_raw.set_index('datetime_hour')['fog_code']

fig, ax = plt.subplots(figsize=(12, 4))
fog_time_series.plot(ax=ax, drawstyle='steps-post')
ax.set_title("Fog Intensity Over Time")
ax.set_ylabel("Fog Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA –  temp intensity

In [None]:
# Temperature classes in the order they are shown on the x-axis.
temp_class_order = ['very_cold', "cold", "cool", "mild", "warm", "hot"]
temp_counts = weather_data_raw['temp_class'].value_counts().reindex(temp_class_order)

ax = temp_counts.plot.bar(edgecolor='black', figsize=(8, 4))
ax.set_title("Frequency of Temperature Classes")
ax.set_xlabel('Temperature_Class')
ax.set_ylabel('number_of_hours')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Wind speed distribution within each temperature class.
ax = sns.boxplot(data=weather_data_raw, x='temp_class', y='windspeed_kph')
ax.set_title("Windspeed by Temperature Class")
ax.set_xlabel('Temperature_Class')
ax.set_ylabel("Windspeed (km/h)")
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Pressure distribution within each temperature class.
ax = sns.boxplot(data=weather_data_raw, x='temp_class', y='pressure_hPa')
ax.set_title("Pressure by Temperature Class")
ax.set_xlabel('Temperature_Class')
ax.set_ylabel("Pressure (hPa)")
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Temperature class code indexed by hour, drawn as a step line.
temp_time_series = weather_data_raw.set_index('datetime_hour')['temp_code']

fig, ax = plt.subplots(figsize=(12, 4))
temp_time_series.plot(ax=ax, drawstyle='steps-post')
ax.set_title("Temperature Intensity Over Time")
ax.set_ylabel("Temperature Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA –  windspeed intensity

In [None]:
# Wind speed classes in the order they are shown on the x-axis.
windspeed_class_order = [
  "calm", 'light_air', 'light_breeze', 'moderate_breeze', 'strong_breeze', "stormy"
]
windspeed_counts = (
  weather_data_raw['windspeed_class'].value_counts().reindex(windspeed_class_order)
)

ax = windspeed_counts.plot.bar(edgecolor='black', figsize=(8, 4))
ax.set_title("Frequency of Windspeed Classes")
ax.set_xlabel('Windspeed_Class')
ax.set_ylabel('number_of_hours')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Temperature distribution within each wind speed class.
ax = sns.boxplot(data=weather_data_raw, x='windspeed_class', y='temp_c')
ax.set_title("Temperature by Windspeed Class")
ax.set_xlabel('Windspeed_Class')
ax.set_ylabel('Temperature (°C)')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Pressure distribution within each wind speed class.
ax = sns.boxplot(data=weather_data_raw, x='windspeed_class', y='pressure_hPa')
ax.set_title("Pressure by Windspeed Class")
ax.set_xlabel('Windspeed_Class')
ax.set_ylabel("Pressure (hPa)")
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Wind speed class code indexed by hour, drawn as a step line.
windspeed_time_series = weather_data_raw.set_index('datetime_hour')['windspeed_code']

fig, ax = plt.subplots(figsize=(12, 4))
windspeed_time_series.plot(ax=ax, drawstyle='steps-post')
ax.set_title("Windspeed Intensity Over Time")
ax.set_ylabel("Windspeed Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA –  humidity intensity

In [None]:
# Humidity classes in the order they are shown on the x-axis.
humidity_class_order = ['very_dry', 'dry', 'normal', 'wet', 'very_wet']
humidity_counts = (
  weather_data_raw['humidity_class'].value_counts().reindex(humidity_class_order)
)

ax = humidity_counts.plot.bar(edgecolor='black', figsize=(8, 4))
ax.set_title("Frequency of Humidity Classes")
ax.set_xlabel("Humidity Class")
ax.set_ylabel('number_of_hours')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:

# Temperature distribution within each humidity class.
ax = sns.boxplot(data=weather_data_raw, x='humidity_class', y='temp_c')
ax.set_title("Temperature by Humidity Class")
ax.set_xlabel("Humidity Class")
ax.set_ylabel('Temperature (°C)')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:

# Humidity class code indexed by hour, drawn as a step line.
humidity_time_series = weather_data_raw.set_index('datetime_hour')['humidity_code']

fig, ax = plt.subplots(figsize=(12, 4))
humidity_time_series.plot(ax=ax, drawstyle='steps-post')
ax.set_title("Humidity Intensity Over Time")
ax.set_ylabel("Humidity Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA –  pressure intensity

In [None]:
# Pressure classes in the order they are shown on the x-axis.
pressure_class_order = ['very_low', "low", "normal", "high", 'very_high']
pressure_counts = (
  weather_data_raw['pressure_class'].value_counts().reindex(pressure_class_order)
)

ax = pressure_counts.plot.bar(edgecolor='black', figsize=(8, 4))
ax.set_title("Frequency of Pressure Classes")
ax.set_xlabel("Pressure Class")
ax.set_ylabel('number_of_hours')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:

# Temperature distribution within each pressure class.
ax = sns.boxplot(data=weather_data_raw, x='pressure_class', y='temp_c')
ax.set_title("Temperature by Pressure Class")
ax.set_xlabel("Pressure Class")
ax.set_ylabel('Temperature (°C)')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:

# Pressure class code indexed by hour, drawn as a step line.
pressure_time_series = weather_data_raw.set_index('datetime_hour')['pressure_code']

fig, ax = plt.subplots(figsize=(12, 4))
pressure_time_series.plot(ax=ax, drawstyle='steps-post')
ax.set_title("Pressure Intensity Over Time")
ax.set_ylabel("Pressure Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

# Imputation

In [None]:
# Keep an independent snapshot of the data before imputing. A plain assignment
# would only create a second name for the same frame, so the "before" values
# would silently change together with the imputed ones.
weather_data_before_imputation = weather_data_raw.copy()

# Time-based interpolation needs the timestamps as the index; the index is
# restored to a regular column afterwards.
weather_data_raw.set_index('datetime_hour', inplace=True)
weather_data_raw['windspeed_kph'] = weather_data_raw['windspeed_kph'].interpolate(method='time')
weather_data_raw.reset_index(inplace=True)

# Row order is unchanged by the index round trip, so a positional index on the
# snapshot lines both series up on the x-axis.
before = weather_data_before_imputation['windspeed_kph'].reset_index(drop=True)
after = weather_data_raw['windspeed_kph']

# Imputed series as a line, originally observed values as markers. The markers
# come from the snapshot, so gaps that were filled show up as line-only stretches.
plt.figure(figsize=(12, 4))
after.plot(label='Imputiert', alpha=0.8)
before.plot(style='o', label='Original (Nicht-NaN)', markersize=2)
plt.title('Windspeed: Imputation sichtbar machen')
plt.ylabel('km/h')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Compare basic statistics to see how much the imputation shifted the distribution.
print("mean before:", before.mean())
print("mean after:", after.mean())
print("std deviation before:", before.std())
print("std deviation after:", after.std())

## pressure

In [None]:
# Keep an independent snapshot of the data before imputing. A plain assignment
# would only create a second name for the same frame, so the "before" values
# would silently change together with the imputed ones.
weather_data_before_imputation = weather_data_raw.copy()

# Time-based interpolation needs the timestamps as the index; the index is
# restored to a regular column afterwards.
weather_data_raw.set_index('datetime_hour', inplace=True)
weather_data_raw['pressure_hPa'] = weather_data_raw['pressure_hPa'].interpolate(method='time')
weather_data_raw.reset_index(inplace=True)

# Row order is unchanged by the index round trip, so a positional index on the
# snapshot lines both series up on the x-axis.
before = weather_data_before_imputation['pressure_hPa'].reset_index(drop=True)
after = weather_data_raw['pressure_hPa']

# Imputed series as a line, originally observed values as markers. The markers
# come from the snapshot, so gaps that were filled show up as line-only stretches.
plt.figure(figsize=(12, 4))
after.plot(label='Imputiert', alpha=0.8)
before.plot(style='o', label='Original (Nicht-NaN)', markersize=2)
plt.title('pressure_hPa: Imputation sichtbar machen')
plt.ylabel('hpa')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Compare basic statistics to see how much the imputation shifted the distribution.
print("mean before:", before.mean())
print("mean after:", after.mean())
print("std deviation before:", before.std())
print("std deviation after:", after.std())

In [None]:
# Remaining missing values per column after the imputation steps.
weather_data_raw.isna().sum()

In [None]:
# Final column dtypes and non-null counts before saving.
weather_data_raw.info()

# Save Weather Data

In [None]:
# Write the cleaned hourly data to CSV without the index column.
# NOTE(review): the input above is read from 'gitData/' while this writes to
# 'data/' - confirm the target directory is intended and exists.
weather_data_raw.to_csv("data/weather_data_clean.csv", index=False)