## Load and unzip weatherdata

In [None]:
import pickle
from pathlib import Path
from zipfile import ZipFile

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from kaggle.api.kaggle_api_extended import KaggleApi


def load_weather_data():
  """Return the weather observations as a DataFrame, reusing local caches.

  Resolution order:
    0. If ``data/weatherdata.pkl`` exists, unpickle and return it.
    1. Download the Kaggle archive into ``data_packages/`` if it is missing.
    2. Extract ``weatherdata.csv`` into ``data/`` if it is missing.
    3. Parse the CSV with pandas, write the pickle cache, return the frame.

  Returns:
    The DataFrame loaded from the pickle cache or parsed from the CSV.

  Raises:
    FileNotFoundError: If the CSV is still absent after the extraction step.
  """
  csv_name = "weatherdata.csv"
  package_dir = Path("data_packages")
  data_dir = Path("data")

  archive_file = package_dir / "nyc-taxi-wunderground-weather.zip"
  csv_file = data_dir / csv_name
  cache_file = data_dir / "weatherdata.pkl"

  # Step 0: use the pickle cache when it exists.
  # NOTE(review): unpickling runs code embedded in the file; this is only
  # appropriate for a cache file produced locally by this function.
  if cache_file.is_file():
    with cache_file.open("rb") as cache_handle:
      return pickle.load(cache_handle)

  # Step 1: download the ZIP archive unless it is already on disk.
  if not archive_file.is_file():
    package_dir.mkdir(parents=True, exist_ok=True)
    kaggle_client = KaggleApi()
    kaggle_client.authenticate()
    kaggle_client.dataset_download_files(
        "pschale/nyc-taxi-wunderground-weather", path=str(package_dir), unzip=False)

  # Step 2: extract the CSV unless it is already on disk.
  if not csv_file.is_file():
    data_dir.mkdir(parents=True, exist_ok=True)
    with ZipFile(archive_file, "r") as archive:
      archive.extract(csv_name, path=data_dir)

  # Step 3: load the CSV and write the cache.
  if not csv_file.is_file():
    raise FileNotFoundError(f"{csv_file} wurde nicht gefunden – Entpackung fehlgeschlagen.")

  frame = pd.read_csv(csv_file)
  with cache_file.open("wb") as cache_handle:
    pickle.dump(frame, cache_handle)

  return frame


# Usage: load the weather data (from cache, CSV or a fresh download)
weather_data_raw = load_weather_data()

# EDA  weather_data_raw

## General EDA

In [None]:
weather_data_raw.info()

- timestamp into datetime
- temp into celsius
- windspeed into kph
- precip into mm
- pressure into hpa
- dailyprecip into mm
- dailysnow into mm

In [None]:
weather_data_raw.describe()

In [None]:
weather_data_raw.describe(include='object')

- One timestamp value occurs twice (a single duplicate).

In [None]:
weather_data_raw.nunique()

In [None]:
weather_data_raw.isna().sum()

## EDA timestamp (datetime)

In [None]:
# Parse the raw timestamp strings (unparseable values become NaT) and derive
# the full hour each observation falls into (rounded down)
parsed_timestamps = pd.to_datetime(weather_data_raw['timestamp'], errors='coerce')
weather_data_raw['datetime'] = parsed_timestamps
weather_data_raw['datetime_hour'] = parsed_timestamps.dt.floor('h')

In [None]:
# Check that every hour bucket renders as "YYYY-MM-DD HH:MM:SS"
timestamp_pattern = r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$'
hour_strings = weather_data_raw['datetime_hour'].astype(str)
hour_strings.str.match(timestamp_pattern).all()

In [None]:
# Number of observations per hour bucket, in chronological order
counts_per_hour = weather_data_raw['datetime_hour'].value_counts().sort_index()

# Plot time series frequency
_, ax = plt.subplots(figsize=(12, 4))
counts_per_hour.plot(ax=ax)
ax.set_title("Weather Observations per Hour")
ax.set_xlabel("Time")
ax.set_ylabel("Observations per Hour")
ax.grid(True)
plt.show()

- The timestamps are predominantly distributed hourly, with slight fluctuations.
- There are some dips in frequency; potentially missing hours.
- The data does not appear to be exactly on the hour (e.g. xx:51).

In [None]:
# Minute of the hour at which each observation was recorded
weather_data_raw['datetime_minute'] = weather_data_raw['datetime'].dt.minute

# Number of records per minute value, ordered by minute
minute_distribution = weather_data_raw['datetime_minute'].value_counts().sort_index()

# Bar chart of the measurement minutes
fig, ax = plt.subplots(figsize=(10, 4))
minute_distribution.plot.bar(ax=ax)
ax.set_title("Distribution of Measurement Minutes")
ax.set_xlabel("Minute of the Hour")
ax.set_ylabel("Number of Observations")
ax.grid(True)
fig.tight_layout()
plt.show()

- Minute 51 is by far the most frequent minute of measurement (over 4000 entries).
- All other minutes are rare or sporadic → probably exceptions, manual additions or other sources.

In [None]:
# Keep only the rows measured at minute 51 (as an independent copy)
measured_at_51 = weather_data_raw['datetime_minute'] == 51
weather_51 = weather_data_raw.loc[measured_at_51].copy()

# Number of repeated timestamps (the first occurrence is not counted)
duplicate_timestamps = weather_51['datetime'].duplicated().sum()

# Every row whose timestamp occurs more than once
shares_timestamp = weather_51['datetime'].duplicated(keep=False)
duplicated_rows = weather_51[shares_timestamp]

duplicate_timestamps, duplicated_rows.sort_values('datetime')

In [None]:
# Aggregate to one row per hour: mean for continuous values, sum for precip,
# first value for the daily totals, max for the binary flags
def _most_frequent(values):
  """Return the most frequent value, or the first value if no mode exists."""
  modes = values.mode()
  return values.iloc[0] if modes.empty else modes.iloc[0]


hourly_aggregations = {
  'temp': 'mean',
  'windspeed': 'mean',
  'humidity': 'mean',
  'precip': 'sum',
  'pressure': 'mean',
  'dailyprecip': 'first',  # treated as constant within a day
  'dailysnow': 'first',
  'fog': 'max',
  'rain': 'max',
  'snow': 'max',
  'conditions': _most_frequent,
}

hourly_groups = weather_51.groupby('datetime_hour')
weather_data_raw = hourly_groups.agg(hourly_aggregations).reset_index()
weather_data_raw


- Exactly one timestamp is duplicated: 2016-03-13 00:51:00. It has two different entries (Clear vs. Light Rain), probably from two weather sources or measurements.

In [None]:
hour_parts = weather_data_raw['datetime_hour'].dt
weather_data_raw['hour_of_day'] = hour_parts.hour

# Day of year (combined with the hour below to merge with weather)
weather_data_raw['day_of_year'] = hour_parts.dayofyear

In [None]:
# Zero-based hour index within the year: 24 hours per elapsed day plus the hour of the day
elapsed_days = weather_data_raw['day_of_year'] - 1
weather_data_raw['hour_of_year'] = elapsed_days * 24 + weather_data_raw['hour_of_day']

## EDA temp

In [None]:
# Apply the Fahrenheit-to-Celsius formula (value - 32) * 5 / 9
weather_data_raw['temp_c'] = weather_data_raw['temp'].sub(32).mul(5).div(9)
weather_data_raw['temp_c'].describe()

In [None]:
# Histogram of the temperature values
fig, ax = plt.subplots(figsize=(8, 4))
weather_data_raw['temp_c'].plot.hist(bins=40, edgecolor='black', ax=ax)
ax.set_title('Distribution of Temperature (°C)')
ax.set_xlabel("Temperature [°C]")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Box plot of the non-missing temperature values
fig, ax = plt.subplots(figsize=(4, 6))
ax.boxplot(weather_data_raw["temp_c"].dropna())
ax.set_title("Boxplot of temp_c")
ax.set_xlabel("temp_c")
fig.tight_layout()
plt.show()

## EDA Windspeed

In [None]:
# Scale by 1.60934, the miles-to-kilometres factor
weather_data_raw['windspeed_kph'] = weather_data_raw['windspeed'].mul(1.60934)
weather_data_raw['windspeed_kph'].describe()

In [None]:
weather_data_raw['windspeed_kph'].isna().sum()

In [None]:
# Keep an independent snapshot of the data before imputation. A plain
# assignment would only alias the same DataFrame, so every later
# "before vs. after" comparison would compare the imputed data with itself.
weather_data_before_imputation = weather_data_raw.copy()

# Time-weighted interpolation needs a datetime index. Interpolate on an
# indexed view and write the values back by position, so weather_data_raw
# keeps its columns and index untouched.
windspeed_by_hour = weather_data_raw.set_index('datetime_hour')['windspeed_kph']
weather_data_raw['windspeed_kph'] = windspeed_by_hour.interpolate(method='time').to_numpy()

In [None]:
# Overlay the un-imputed measurements on the imputed series so that the
# filled gaps are visible. The original values are re-derived from the raw
# 'windspeed' column, which is never imputed; taking them from
# 'windspeed_kph' would only reproduce the already filled values.
original_windspeed_kph = weather_data_raw['windspeed'] * 1.60934

plt.figure(figsize=(12, 4))
weather_data_raw['windspeed_kph'].plot(label='Imputiert', alpha=0.8)
# NaN entries are not drawn, so markers appear only at real measurements
original_windspeed_kph.plot(style='o', label='Original (Nicht-NaN)', markersize=2)
plt.title('Windspeed: Imputation sichtbar machen')
plt.ylabel('km/h')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Compare mean and standard deviation before and after imputation.
# NOTE(review): only meaningful if weather_data_before_imputation is an
# independent snapshot rather than the same object as weather_data_raw — confirm.
before = weather_data_before_imputation['windspeed_kph']
after = weather_data_raw['windspeed_kph']

windspeed_report = (
  ("mean before:", before.mean),
  ("mean after:", after.mean),
  ("std deviation before:", before.std),
  ("std deviation after:", after.std),
)
for report_label, compute in windspeed_report:
  print(report_label, compute())

In [None]:
# Line chart of the hourly windspeed after imputation
weather_data_raw['windspeed_kph'].plot.line(
    figsize=(12, 4), title="Stündliche Windspeed nach Imputation", grid=True)

In [None]:
weather_data_raw['windspeed_kph'].isna().sum()

In [None]:
# Histogram of the wind speed values
fig, ax = plt.subplots(figsize=(8, 4))
weather_data_raw['windspeed_kph'].plot.hist(bins=40, edgecolor='black', ax=ax)
ax.set_title("Distribution of Wind Speed (km/h)")
ax.set_xlabel("Wind Speed [km/h]")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Box plot of the non-missing wind speed values
fig, ax = plt.subplots(figsize=(4, 6))
ax.boxplot(weather_data_raw["windspeed_kph"].dropna())
ax.set_title("Boxplot of windspeed_kph")
ax.set_xlabel("windspeed_kph")
fig.tight_layout()
plt.show()

- Most values lie between 5 and 25 km/h, which is typical for city-level weather.
- One extremely high value (137.12 km/h) may be an outlier.
- The variable appears well-behaved and usable without further transformation.

## EDA humidity

In [None]:
weather_data_raw['humidity'].describe()

In [None]:
# Histogram of the humidity values
fig, ax = plt.subplots(figsize=(8, 4))
weather_data_raw['humidity'].plot.hist(bins=40, edgecolor='black', ax=ax)
ax.set_title("Distribution of Humidity (%)")
ax.set_xlabel("Humidity [%]")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Box plot of the non-missing humidity values
fig, ax = plt.subplots(figsize=(4, 6))
ax.boxplot(weather_data_raw["humidity"].dropna())
ax.set_title("Boxplot of humidity")
ax.set_xlabel("humidity")
fig.tight_layout()
plt.show()

- Values range from 9% to 97%.
- Most values lie between 40% and 70%.
- No anomalies or unrealistic entries.

## EDA precip

In [None]:
# Scale by 25.4, the inches-to-millimetres factor
weather_data_raw['precip_mm'] = weather_data_raw['precip'].mul(25.4)
weather_data_raw['precip_mm'].describe()

In [None]:
# Histogram of the hourly precipitation values
fig, ax = plt.subplots(figsize=(8, 4))
weather_data_raw['precip_mm'].plot.hist(bins=40, edgecolor='black', ax=ax)
ax.set_title("Distribution of Hourly Precipitation (mm)")
ax.set_xlabel("Hourly Precipitation [mm]")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Box plot of the non-missing hourly precipitation values
fig, ax = plt.subplots(figsize=(4, 6))
ax.boxplot(weather_data_raw["precip_mm"].dropna())
ax.set_title("Boxplot of precip_mm")
ax.set_xlabel("precip_mm")
fig.tight_layout()
plt.show()

Findings of the Exploration
- Most values are 0.0 (dry hours).
- Occasional rain events up to 18.5 mm.

Required Arrangements
- None. Consider binary flag creation (e.g. had_rain = precip_mm > 0).

## EDA pressure

In [None]:
# Scale by 33.8639, the inches-of-mercury-to-hectopascal factor
weather_data_raw['pressure_hPa'] = weather_data_raw['pressure'].mul(33.8639)
weather_data_raw['pressure_hPa'].describe()

In [None]:
weather_data_raw['pressure_hPa'].isna().sum()

In [None]:
# Keep an independent snapshot of the data before imputation. A plain
# assignment would only alias the same DataFrame, so every later
# "before vs. after" comparison would compare the imputed data with itself.
weather_data_before_imputation = weather_data_raw.copy()

# Time-weighted interpolation needs a datetime index. Interpolate on an
# indexed view and write the values back by position, so weather_data_raw
# keeps its columns and index untouched.
pressure_by_hour = weather_data_raw.set_index('datetime_hour')['pressure_hPa']
weather_data_raw['pressure_hPa'] = pressure_by_hour.interpolate(method='time').to_numpy()

In [None]:
# Overlay the un-imputed measurements on the imputed series so that the
# filled gaps are visible. The original values are re-derived from the raw
# 'pressure' column, which is never imputed; taking them from
# 'pressure_hPa' would only reproduce the already filled values.
original_pressure_hpa = weather_data_raw['pressure'] * 33.8639

plt.figure(figsize=(12, 4))
weather_data_raw['pressure_hPa'].plot(label='Imputiert', alpha=0.8)
# NaN entries are not drawn, so markers appear only at real measurements
original_pressure_hpa.plot(style='o', label='Original (Nicht-NaN)', markersize=2)
plt.title('pressure_hPa: Imputation sichtbar machen')
plt.ylabel('hpa')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Compare mean and standard deviation before and after imputation.
# NOTE(review): only meaningful if weather_data_before_imputation is an
# independent snapshot rather than the same object as weather_data_raw — confirm.
before = weather_data_before_imputation['pressure_hPa']
after = weather_data_raw['pressure_hPa']

pressure_report = (
  ("mean before:", before.mean),
  ("mean after:", after.mean),
  ("std deviation before:", before.std),
  ("std deviation after:", after.std),
)
for report_label, compute in pressure_report:
  print(report_label, compute())

In [None]:
# Line chart of the hourly pressure after imputation
weather_data_raw['pressure_hPa'].plot.line(
    figsize=(12, 4), title="Stündliche pressure_hPa nach Imputation", grid=True)

In [None]:
weather_data_raw['pressure_hPa'].isna().sum()

In [None]:
# Histogram of the pressure values
fig, ax = plt.subplots(figsize=(8, 4))
weather_data_raw['pressure_hPa'].plot.hist(bins=40, edgecolor='black', ax=ax)
ax.set_title("Distribution of pressure_hPa")
ax.set_xlabel("pressure_hPa")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Box plot of the non-missing pressure values
fig, ax = plt.subplots(figsize=(4, 6))
ax.boxplot(weather_data_raw["pressure_hPa"].dropna())
ax.set_title("Boxplot of pressure_hPa")
ax.set_xlabel("pressure_hPa")
fig.tight_layout()
plt.show()

## EDA dailyprecip

In [None]:
weather_data_raw['dailyprecip'].value_counts()

In [None]:
# Replace 'T' with a minimal value so the column can be cast to float
# NOTE(review): 'T' presumably marks trace amounts — confirm against the data source
weather_data_raw['dailyprecip'] = weather_data_raw['dailyprecip'].replace({'T': '0.001'})

In [None]:
weather_data_raw['dailyprecip'] = weather_data_raw['dailyprecip'].astype(float)

In [None]:
weather_data_raw['precip_daily_mm'] = weather_data_raw['dailyprecip'] * 25.4

In [None]:
weather_data_raw['precip_daily_mm'].isna().sum()

In [None]:
weather_data_raw['precip_daily_mm'].describe()

In [None]:
# Histogram of the daily precipitation values
fig, ax = plt.subplots(figsize=(8, 4))
weather_data_raw['precip_daily_mm'].plot.hist(bins=40, edgecolor='black', ax=ax)
ax.set_title("Distribution of precip_daily_mm")
ax.set_xlabel("precip_daily_mm")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Box plot of the non-missing daily precipitation values
fig, ax = plt.subplots(figsize=(4, 6))
ax.boxplot(weather_data_raw["precip_daily_mm"].dropna())
ax.set_title("Boxplot of precip_daily_mm")
ax.set_xlabel("precip_daily_mm")
fig.tight_layout()
plt.show()

- ...

## EDA dailysnow

In [None]:
weather_data_raw['dailysnow'].value_counts()

In [None]:
# Replace 'T' with a minimal value, then cast the column to float
# NOTE(review): 'T' presumably marks trace amounts — confirm against the data source
dailysnow_without_marker = weather_data_raw['dailysnow'].replace({'T': '0.001'})
weather_data_raw['dailysnow'] = dailysnow_without_marker.astype(float)

In [None]:
# Scale by 25.4, the inches-to-millimetres factor
weather_data_raw['daily_snow_mm'] = weather_data_raw['dailysnow'].mul(25.4)
weather_data_raw['daily_snow_mm'].describe()

In [None]:
weather_data_raw['daily_snow_mm'].isna().sum()

In [None]:
# Histogram of the daily snow values
fig, ax = plt.subplots(figsize=(8, 4))
weather_data_raw['daily_snow_mm'].plot.hist(bins=40, edgecolor='black', ax=ax)
ax.set_title("Distribution of daily_snow_mm")
ax.set_xlabel("daily_snow_mm")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Box plot of the non-missing daily snow values
fig, ax = plt.subplots(figsize=(4, 6))
ax.boxplot(weather_data_raw["daily_snow_mm"].dropna())
ax.set_title("Boxplot of daily_snow_mm")
ax.set_xlabel("daily_snow_mm")
fig.tight_layout()
plt.show()

## EDA fog, rain, snow

In [None]:
weather_data_raw['fog'].value_counts()

In [None]:
weather_data_raw['rain'].value_counts()

In [None]:
weather_data_raw['snow'].value_counts()

## EDA Conditions

In [None]:
# Frequency of each weather condition, most frequent first
condition_counts = weather_data_raw['conditions'].value_counts()

fig, ax = plt.subplots(figsize=(10, 4))
condition_counts.plot.bar(ax=ax)
ax.set_title("Distribution of Weather Conditions")
ax.set_xlabel("Condition")
ax.set_ylabel("Frequency")
# Rotate the tick labels and right-align them
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True)
fig.tight_layout()
plt.show()

- Most common condition is Clear (2215 times), followed by Overcast (1000).
- Some conditions occur only a few times, e.g., Heavy Snow, Light Freezing Fog.
- Unknown appears 83 times and may indicate missing or invalid sensor data.

# Classify / EDA  Weather Conditions

## Classify / EDA –  rain intensity

In [None]:
EXTREME_RAIN = 'extreme_rain'
VERY_HEAVY_RAIN = 'very_heavy_rain'
LIGHT_RAIN = 'light_rain'
MODERATE_RAIN = 'moderate_rain'
HEAVY_RAIN = 'heavy_rain'
NO_RAIN = 'no_rain'
NUMBER_OF_HOURS = 'number_of_hours'

In [None]:
# 1. Human-readable rain classification
def classify_rain_label(x):
  """Map a precipitation amount to a rain-intensity label.

  The lower bounds are inclusive and checked from the highest band down:
  >= 30 extreme, >= 15 very heavy, >= 7.5 heavy, >= 2.5 moderate,
  > 0 light. Everything else (including NaN) is NO_RAIN.
  """
  lower_bounds = (
    (30, EXTREME_RAIN),
    (15, VERY_HEAVY_RAIN),
    (7.5, HEAVY_RAIN),
    (2.5, MODERATE_RAIN),
  )
  for lower_bound, label in lower_bounds:
    if x >= lower_bound:
      return label
  if x > 0:
    return LIGHT_RAIN
  return NO_RAIN


weather_data_raw['rain_class'] = weather_data_raw['precip_mm'].apply(classify_rain_label)

In [None]:

# 2. Ordinal encoding for machine learning: the position in the ascending
# intensity order is the code (no_rain = 0 ... extreme_rain = 5)
rain_levels = (NO_RAIN, LIGHT_RAIN, MODERATE_RAIN, HEAVY_RAIN, VERY_HEAVY_RAIN, EXTREME_RAIN)
rain_mapping = {label: code for code, label in enumerate(rain_levels)}

weather_data_raw['rain_code'] = weather_data_raw['rain_class'].map(rain_mapping)

In [None]:
# Hours per rain class, listed in ascending intensity (absent classes become NaN)
rain_order = [NO_RAIN, LIGHT_RAIN, MODERATE_RAIN, HEAVY_RAIN, VERY_HEAVY_RAIN, EXTREME_RAIN]
rain_counts = weather_data_raw['rain_class'].value_counts().reindex(rain_order)

ax = rain_counts.plot.bar(edgecolor='black', figsize=(8, 4))
ax.set_title("Frequency of Rain Intensity Classes")
ax.set_xlabel("Rain Class")
ax.set_ylabel(NUMBER_OF_HOURS)
ax.grid(True)
ax.figure.tight_layout()
plt.show()

In [None]:
# Rain class code indexed by hour, drawn as a step function
rain_time_series = weather_data_raw.set_index('datetime_hour')['rain_code']

fig, ax = plt.subplots(figsize=(12, 4))
rain_time_series.plot(drawstyle='steps-post', ax=ax)
ax.set_title("Rain Intensity Over Time")
ax.set_ylabel("Rain Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Box plot: humidity grouped by rain class
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=weather_data_raw, x='rain_class', y='humidity', ax=ax)
ax.set_title("Humidity Distribution by Rain Class")
ax.set_xlabel("Rain Class")
ax.set_ylabel("Humidity (%)")
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA – snow intensity

In [None]:
LIGHT_SNOW = 'light_snow'
HEAVY_SNOW = 'heavy_snow'
NO_SNOW = 'no_snow'

In [None]:
# 1. Human-readable snow classification based on conditions
def classify_snow_label(x):
  """Translate a conditions value into a snow-intensity label.

  Only the exact strings "Light Snow", "Snow" and "Heavy Snow" are
  recognised; every other value yields NO_SNOW.
  """
  snow_conditions = (
    ("Light Snow", LIGHT_SNOW),
    ("Snow", "snow"),
    ("Heavy Snow", HEAVY_SNOW),
  )
  for condition, label in snow_conditions:
    if x == condition:
      return label
  return NO_SNOW


weather_data_raw['snow_class'] = weather_data_raw['conditions'].apply(classify_snow_label)

In [None]:
# 2. Ordinal encoding for ML: the position in the ascending intensity order is the code
snow_levels = (NO_SNOW, LIGHT_SNOW, "snow", HEAVY_SNOW)
snow_mapping = dict(zip(snow_levels, range(len(snow_levels))))

weather_data_raw['snow_code'] = weather_data_raw['snow_class'].map(snow_mapping)

In [None]:
# Hours per snow class, listed in ascending intensity (absent classes become NaN)
snow_order = [NO_SNOW, LIGHT_SNOW, "snow", HEAVY_SNOW]
snow_class_counts = weather_data_raw['snow_class'].value_counts().reindex(snow_order)

ax = snow_class_counts.plot.bar(edgecolor='black')
ax.set_title("Frequency of Snow Intensity Classes")
ax.set_xlabel("Snow Class")
ax.set_ylabel(NUMBER_OF_HOURS)
ax.grid(True)
ax.figure.tight_layout()
plt.show()

In [None]:
# Time series: snow class code per hour, drawn as a step function
snow_code_by_hour = weather_data_raw.set_index('datetime_hour')['snow_code']

fig, ax = plt.subplots(figsize=(12, 4))
snow_code_by_hour.plot(drawstyle='steps-post', ax=ax)
ax.set_title("Snow Intensity Over Time")
ax.set_ylabel("Snow Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA –  cloud intensity

In [None]:
SCATTERED_CLOUDS = 'scattered_clouds'
MOSTLY_CLOUDY = 'mostly_cloudy'
PARTLY_CLOUDY = 'partly_cloudy'

In [None]:
# 1. Human-readable cloud classification based on conditions
def classify_cloud_label(x):
  """Translate a conditions value into a cloud-cover label.

  Only the five exact condition strings listed below are recognised;
  every other value yields "unknown". The percentages are the approximate
  cloud-cover ranges noted by the original author.
  """
  cloud_conditions = (
    ("Clear", "clear"),  # 0–10%
    ("Scattered Clouds", SCATTERED_CLOUDS),  # ~25–50%
    ("Partly Cloudy", PARTLY_CLOUDY),  # ~20–60%
    ("Mostly Cloudy", MOSTLY_CLOUDY),  # ~60–90%
    ("Overcast", "overcast"),  # >90%
  )
  for condition, label in cloud_conditions:
    if x == condition:
      return label
  return "unknown"


weather_data_raw['cloud_class'] = weather_data_raw['conditions'].apply(classify_cloud_label)

In [None]:

# 2. Ordinal encoding for ML: "unknown" is 0, then ascending cloud cover from 1 to 5
cloud_levels = ("unknown", "clear", SCATTERED_CLOUDS, PARTLY_CLOUDY, MOSTLY_CLOUDY, "overcast")
cloud_mapping = {label: code for code, label in enumerate(cloud_levels)}

weather_data_raw['cloud_code'] = weather_data_raw['cloud_class'].map(cloud_mapping)

In [None]:
# Bar plot: hours per cloud class, "unknown" listed last (absent classes become NaN)
cloud_order = ["clear", SCATTERED_CLOUDS, PARTLY_CLOUDY, MOSTLY_CLOUDY, "overcast", "unknown"]
cloud_counts = weather_data_raw['cloud_class'].value_counts().reindex(cloud_order)

ax = cloud_counts.plot.bar(edgecolor='black', figsize=(8, 4))
ax.set_title("Frequency of Cloud Intensity Classes")
ax.set_xlabel("Cloud Class")
ax.set_ylabel(NUMBER_OF_HOURS)
ax.grid(True)
ax.figure.tight_layout()
plt.show()

In [None]:
# Time series: cloud class code per hour, drawn as a step function
cloud_code_by_hour = weather_data_raw.set_index('datetime_hour')['cloud_code']

fig, ax = plt.subplots(figsize=(12, 4))
cloud_code_by_hour.plot(drawstyle='steps-post', ax=ax)
ax.set_title("Cloud Intensity Over Time")
ax.set_ylabel("Cloud Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Box plot: humidity grouped by cloud class
cloud_humidity_ax = sns.boxplot(data=weather_data_raw, x='cloud_class', y='humidity')
cloud_humidity_ax.set_title("Humidity by Cloud Class")

## Classify / EDA –  haze intensity

In [None]:
NO_HAZE = 'no_haze'

In [None]:
# 1. Human-readable haze classification
def classify_haze_label(x):
  """Return "haze" for the exact condition "Haze", otherwise NO_HAZE."""
  if x == "Haze":
    return "haze"
  return NO_HAZE


weather_data_raw['hazy_class'] = weather_data_raw['conditions'].apply(classify_haze_label)

In [None]:

# 2. Binary encoding for ML: no haze is 0, haze is 1
haze_levels = (NO_HAZE, "haze")
haze_mapping = {label: code for code, label in enumerate(haze_levels)}

weather_data_raw['hazy_code'] = weather_data_raw['hazy_class'].map(haze_mapping)

In [None]:
# Bar plot: hours with and without haze (an absent class becomes NaN)
haze_order = [NO_HAZE, "haze"]
hazy_counts = weather_data_raw['hazy_class'].value_counts().reindex(haze_order)

ax = hazy_counts.plot.bar(edgecolor='black', figsize=(6, 4))
ax.set_title("Frequency of Haze")
ax.set_xlabel("Hazy Class")
ax.set_ylabel(NUMBER_OF_HOURS)
ax.grid(True)
ax.figure.tight_layout()
plt.show()

In [None]:
# Time series: haze code per hour, drawn as a step function
hazy_code_by_hour = weather_data_raw.set_index('datetime_hour')['hazy_code']

fig, ax = plt.subplots(figsize=(12, 4))
hazy_code_by_hour.plot(drawstyle='steps-post', ax=ax)
ax.set_title("Haze Intensity Over Time")
ax.set_ylabel("Hazy Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA –  freezing fog or rain

In [None]:
LIGHT_FREEZING_FOG = 'light_freezing_fog'
LIGHT_FREEZING_RAIN = 'light_freezing_rain'

In [None]:
# 1. Human-readable freezing condition classification
def classify_freezing_label(x):
  """Translate a conditions value into a freezing-condition label.

  Only the exact strings "Light Freezing Fog" and "Light Freezing Rain"
  are recognised; every other value yields "none".
  """
  freezing_conditions = (
    ("Light Freezing Fog", LIGHT_FREEZING_FOG),
    ("Light Freezing Rain", LIGHT_FREEZING_RAIN),
  )
  for condition, label in freezing_conditions:
    if x == condition:
      return label
  return "none"


weather_data_raw['freezing_class'] = weather_data_raw['conditions'].apply(classify_freezing_label)

In [None]:


# 2. Ordinal encoding for ML: none is 0, light freezing rain is 1, light freezing fog is 2
freezing_levels = ("none", LIGHT_FREEZING_RAIN, LIGHT_FREEZING_FOG)
freezing_mapping = dict(zip(freezing_levels, range(len(freezing_levels))))

weather_data_raw['freezing_code'] = weather_data_raw['freezing_class'].map(freezing_mapping)

In [None]:
# Bar plot: hours per freezing class (absent classes become NaN)
freezing_order = ["none", LIGHT_FREEZING_RAIN, LIGHT_FREEZING_FOG]
freezing_counts = weather_data_raw['freezing_class'].value_counts().reindex(freezing_order)

ax = freezing_counts.plot.bar(edgecolor='black', figsize=(8, 4))
ax.set_title("Frequency of Freezing Fog or Rain")
ax.set_xlabel("Freezing Class")
ax.set_ylabel(NUMBER_OF_HOURS)
ax.grid(True)
ax.figure.tight_layout()
plt.show()

In [None]:
# Time series: freezing class code per hour, drawn as a step function
freezing_code_by_hour = weather_data_raw.set_index('datetime_hour')['freezing_code']

fig, ax = plt.subplots(figsize=(12, 4))
freezing_code_by_hour.plot(drawstyle='steps-post', ax=ax)
ax.set_title("Freezing Fog or Rain Intensity Over Time")
ax.set_ylabel("Freezing Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA –  fog

In [None]:
# Define fog labels as constants
FOG_LABEL = "fog"
NO_FOG_LABEL = "no_fog"

In [None]:
# 1. Human-readable fog classification
def classify_fog_label(x):
  """Return FOG_LABEL when the fog flag equals 1, otherwise NO_FOG_LABEL."""
  return FOG_LABEL if x == 1 else NO_FOG_LABEL


# Read the flag from the 'fog' column by its own name. FOG_LABEL is the
# output label and only happens to be spelled like the column; looking the
# column up through it would break as soon as the label is renamed.
weather_data_raw['fog_class'] = weather_data_raw['fog'].apply(classify_fog_label)

In [None]:
# 2. Binary encoding for ML
# The keys must be the exact labels produced by classify_fog_label. The
# former literal "no fog" (with a space) never matched "no_fog", so every
# fog-free hour was mapped to NaN.
fog_mapping = {
  NO_FOG_LABEL: 0,
  FOG_LABEL: 1
}

weather_data_raw['fog_code'] = weather_data_raw['fog_class'].map(fog_mapping)

In [None]:
# Bar plot: Frequency of Fog Classes
# Reindex with the label constants. The former literal "no fog" did not
# match the stored label "no_fog", which left the no-fog bar as NaN.
fog_counts = weather_data_raw['fog_class'].value_counts().reindex([NO_FOG_LABEL, FOG_LABEL])
fog_counts.plot(kind='bar', edgecolor='black', figsize=(6, 4))
plt.title("Frequency of Fog Classes")
plt.xlabel("Fog Class")
plt.ylabel(NUMBER_OF_HOURS)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Time series: fog code per hour, drawn as a step function
fog_code_by_hour = weather_data_raw.set_index('datetime_hour')['fog_code']

fig, ax = plt.subplots(figsize=(12, 4))
fog_code_by_hour.plot(drawstyle='steps-post', ax=ax)
ax.set_title("Fog Intensity Over Time")
ax.set_ylabel("Fog Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA – temp intensity

In [None]:
TEMPERATURE_CLASS ='Temperature_Class'
VERY_COLD = 'very_cold'

In [None]:
# 1. Human-readable temperature classification
def classify_temp_label(t):
  """Map a temperature (applied to the temp_c column) to a coarse label.

  Upper bounds are exclusive: < -5 very cold, < 5 cold, < 15 cool,
  < 20 mild, < 25 warm, otherwise hot.

  Args:
    t: Temperature value, or a missing value (NaN/None).

  Returns:
    The label string, or None for a missing value. Without this guard a NaN
    fails every comparison and would be labelled "hot". A None label maps
    to NaN in the ordinal encoding.
  """
  if pd.isna(t):
    return None
  if t < -5:
    return VERY_COLD
  elif t < 5:
    return "cold"
  elif t < 15:
    return "cool"
  elif t < 20:
    return "mild"
  elif t < 25:
    return "warm"
  else:
    return "hot"


weather_data_raw['temp_class'] = weather_data_raw['temp_c'].apply(classify_temp_label)

In [None]:


# 2. Ordinal encoding for ML: the position in the ascending temperature order is the code
temp_levels = (VERY_COLD, "cold", "cool", "mild", "warm", "hot")
temp_mapping = {label: code for code, label in enumerate(temp_levels)}

weather_data_raw['temp_code'] = weather_data_raw['temp_class'].map(temp_mapping)

In [None]:
# Bar plot: hours per temperature class, coldest first (absent classes become NaN)
temp_order = [VERY_COLD, "cold", "cool", "mild", "warm", "hot"]
temp_counts = weather_data_raw['temp_class'].value_counts().reindex(temp_order)

ax = temp_counts.plot.bar(edgecolor='black', figsize=(8, 4))
ax.set_title("Frequency of Temperature Classes")
ax.set_xlabel(TEMPERATURE_CLASS)
ax.set_ylabel(NUMBER_OF_HOURS)
ax.grid(True)
ax.figure.tight_layout()
plt.show()

In [None]:

# Box plot: windspeed grouped by temperature class
ax = sns.boxplot(data=weather_data_raw, x='temp_class', y='windspeed_kph')
ax.set_title("Windspeed by Temperature Class")
ax.set_xlabel(TEMPERATURE_CLASS)
ax.set_ylabel("Windspeed (km/h)")
ax.grid(True)
ax.figure.tight_layout()
plt.show()

In [None]:

# Box plot: pressure grouped by temperature class
ax = sns.boxplot(data=weather_data_raw, x='temp_class', y='pressure_hPa')
ax.set_title("Pressure by Temperature Class")
ax.set_xlabel(TEMPERATURE_CLASS)
ax.set_ylabel("Pressure (hPa)")
ax.grid(True)
ax.figure.tight_layout()
plt.show()

In [None]:

# Time series: temperature class code per hour, drawn as a step function
temp_code_by_hour = weather_data_raw.set_index('datetime_hour')['temp_code']

fig, ax = plt.subplots(figsize=(12, 4))
temp_code_by_hour.plot(drawstyle='steps-post', ax=ax)
ax.set_title("Temperature Intensity Over Time")
ax.set_ylabel("Temperature Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA – windspeed intensity

In [None]:
WINDSPEED_CLASS = 'Windspeed_Class'
LIGHT_AIR = 'light_air'
LIGHT_BREEZE = 'light_breeze'
MODERATE_BREEZE = 'moderate_breeze'
STRONG_BREEZE = 'strong_breeze'

In [None]:
# 1. Human-readable windspeed classification (Beaufort-based)
def classify_wind_label(speed):
  """Map a wind speed (applied to the windspeed_kph column) to a coarse label.

  Upper bounds are exclusive: < 1 calm, < 12 light air, < 29 light breeze,
  < 50 moderate breeze, < 75 strong breeze, otherwise stormy.
  NOTE(review): each band appears to group several Beaufort grades, so the
  labels do not match the official Beaufort names one-to-one — confirm intent.

  Args:
    speed: Wind speed value, or a missing value (NaN/None).

  Returns:
    The label string, or None for a missing value. Without this guard a NaN
    (e.g. a gap the interpolation could not fill) fails every comparison and
    would be labelled "stormy". A None label maps to NaN in the ordinal
    encoding.
  """
  if pd.isna(speed):
    return None
  if speed < 1:
    return "calm"
  elif speed < 12:
    return LIGHT_AIR
  elif speed < 29:
    return LIGHT_BREEZE
  elif speed < 50:
    return MODERATE_BREEZE
  elif speed < 75:
    return STRONG_BREEZE
  else:
    return "stormy"


weather_data_raw['windspeed_class'] = weather_data_raw['windspeed_kph'].apply(classify_wind_label)

In [None]:
# 2. Ordinal encoding for ML: the position in the ascending wind speed order is the code
windspeed_levels = ("calm", LIGHT_AIR, LIGHT_BREEZE, MODERATE_BREEZE, STRONG_BREEZE, "stormy")
windspeed_mapping = {label: code for code, label in enumerate(windspeed_levels)}

windspeed_classes = weather_data_raw['windspeed_class']
weather_data_raw['windspeed_code'] = windspeed_classes.map(windspeed_mapping)

In [None]:
# Bar plot: hours per windspeed class, calmest first (absent classes become NaN)
windspeed_order = ["calm", LIGHT_AIR, LIGHT_BREEZE, MODERATE_BREEZE, STRONG_BREEZE, "stormy"]
windspeed_counts = weather_data_raw['windspeed_class'].value_counts().reindex(windspeed_order)

ax = windspeed_counts.plot.bar(edgecolor='black', figsize=(8, 4))
ax.set_title("Frequency of Windspeed Classes")
ax.set_xlabel(WINDSPEED_CLASS)
ax.set_ylabel(NUMBER_OF_HOURS)
ax.grid(True)
ax.figure.tight_layout()
plt.show()

In [None]:
TEMPERATURE_C = "Temperature (°C)"

In [None]:
# Box plot: temperature grouped by windspeed class
ax = sns.boxplot(data=weather_data_raw, x='windspeed_class', y='temp_c')
ax.set_title("Temperature by Windspeed Class")
ax.set_xlabel(WINDSPEED_CLASS)
ax.set_ylabel(TEMPERATURE_C)
ax.grid(True)
ax.figure.tight_layout()
plt.show()

In [None]:
# Box plot: pressure grouped by windspeed class
ax = sns.boxplot(data=weather_data_raw, x='windspeed_class', y='pressure_hPa')
ax.set_title("Pressure by Windspeed Class")
ax.set_xlabel(WINDSPEED_CLASS)
ax.set_ylabel("Pressure (hPa)")
ax.grid(True)
ax.figure.tight_layout()
plt.show()

In [None]:
# Time series: windspeed class code per hour, drawn as a step function
windspeed_code_by_hour = weather_data_raw.set_index('datetime_hour')['windspeed_code']

fig, ax = plt.subplots(figsize=(12, 4))
windspeed_code_by_hour.plot(drawstyle='steps-post', ax=ax)
ax.set_title("Windspeed Intensity Over Time")
ax.set_ylabel("Windspeed Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA – humidity intensity

In [None]:
def classify_humidity(h):
  """Map a humidity value (applied to the humidity column) to a coarse label.

  Upper bounds are inclusive: <= 30 very_dry, <= 50 dry, <= 70 normal,
  <= 85 wet, otherwise very_wet.

  Args:
    h: Humidity value, or a missing value (NaN/None).

  Returns:
    The label string, or None for a missing value. Without this guard a NaN
    fails every comparison and would be labelled 'very_wet'. A None label
    maps to NaN in the ordinal encoding.
  """
  if pd.isna(h):
    return None
  if h <= 30:
    return 'very_dry'
  elif h <= 50:
    return 'dry'
  elif h <= 70:
    return 'normal'
  elif h <= 85:
    return 'wet'
  else:
    return 'very_wet'


weather_data_raw['humidity_class'] = weather_data_raw['humidity'].apply(classify_humidity)

In [None]:
humidity_mapping = {
  'very_dry': 0,
  'dry': 1,
  'normal': 2,
  'wet': 3,
  'very_wet': 4
}
weather_data_raw['humidity_code'] = weather_data_raw['humidity_class'].map(humidity_mapping)

In [None]:
# Bar plot: Frequency of Humidity Classes
# The order must use the exact labels returned by classify_humidity. The
# former misspelling 'normale' matched no label, which left the bar for the
# 'normal' class as NaN.
humidity_counts = weather_data_raw['humidity_class'].value_counts().reindex([
  'very_dry', 'dry', 'normal', 'wet', 'very_wet'
])

humidity_counts.plot(kind='bar', edgecolor='black', figsize=(8, 4))
plt.title("Frequency of Humidity Classes")
plt.xlabel("Humidity Class")
plt.ylabel(NUMBER_OF_HOURS)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:

# Box plot: temperature grouped by humidity class
ax = sns.boxplot(data=weather_data_raw, x='humidity_class', y='temp_c')
ax.set_title("Temperature by Humidity Class")
ax.set_xlabel("Humidity Class")
ax.set_ylabel(TEMPERATURE_C)
ax.grid(True)
ax.figure.tight_layout()
plt.show()

In [None]:

# Time series: humidity class code per hour, drawn as a step function
humidity_code_by_hour = weather_data_raw.set_index('datetime_hour')['humidity_code']

fig, ax = plt.subplots(figsize=(12, 4))
humidity_code_by_hour.plot(drawstyle='steps-post', ax=ax)
ax.set_title("Humidity Intensity Over Time")
ax.set_ylabel("Humidity Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

## Classify / EDA – pressure intensity

In [None]:
VERY_HIGH = 'very_high'
VERY_LOW = 'very_low'

In [None]:
def classify_pressure_label(p):
  """Map a pressure value (applied to the pressure_hPa column) to a coarse label.

  Upper bounds are exclusive: < 980 very low, < 1000 low, < 1020 normal,
  < 1030 high, otherwise very high.

  Args:
    p: Pressure value, or a missing value (NaN/None).

  Returns:
    The label string, or None for a missing value. Without this guard a NaN
    (e.g. a gap the interpolation could not fill) fails every comparison and
    would be labelled very high. A None label maps to NaN in the ordinal
    encoding.
  """
  if pd.isna(p):
    return None
  if p < 980:
    return VERY_LOW
  elif p < 1000:
    return "low"
  elif p < 1020:
    return "normal"
  elif p < 1030:
    return "high"
  else:
    return VERY_HIGH


weather_data_raw['pressure_class'] = weather_data_raw['pressure_hPa'].apply(classify_pressure_label)

In [None]:

pressure_mapping = {
  VERY_LOW: 0,
  "low": 1,
  "normal": 2,
  "high": 3,
  VERY_HIGH: 4
}

weather_data_raw['pressure_code'] = weather_data_raw['pressure_class'].map(pressure_mapping)

In [None]:
# Bar plot: hours per pressure class, lowest first (absent classes become NaN)
pressure_order = [VERY_LOW, "low", "normal", "high", VERY_HIGH]
pressure_counts = weather_data_raw['pressure_class'].value_counts().reindex(pressure_order)

ax = pressure_counts.plot.bar(edgecolor='black', figsize=(8, 4))
ax.set_title("Frequency of Pressure Classes")
ax.set_xlabel("Pressure Class")
ax.set_ylabel(NUMBER_OF_HOURS)
ax.grid(True)
ax.figure.tight_layout()
plt.show()

In [None]:

# Box plot: temperature grouped by pressure class
ax = sns.boxplot(data=weather_data_raw, x='pressure_class', y='temp_c')
ax.set_title("Temperature by Pressure Class")
ax.set_xlabel("Pressure Class")
ax.set_ylabel(TEMPERATURE_C)
ax.grid(True)
ax.figure.tight_layout()
plt.show()

In [None]:

# Time series: pressure class code per hour, drawn as a step function
pressure_code_by_hour = weather_data_raw.set_index('datetime_hour')['pressure_code']

fig, ax = plt.subplots(figsize=(12, 4))
pressure_code_by_hour.plot(drawstyle='steps-post', ax=ax)
ax.set_title("Pressure Intensity Over Time")
ax.set_ylabel("Pressure Class Code")
ax.grid(True)
fig.tight_layout()
plt.show()

# Save Weather Data

In [None]:
# Ensure no duplicate hour remains: sort chronologically, then keep the
# first row for each hour
chronological = weather_data_raw.sort_values('datetime_hour')
weather_data_raw = chronological.drop_duplicates('datetime_hour')

In [None]:
weather_data_raw.to_csv("data/weather_data_clean.csv", index=False)