In [27]:
import os
from metar import Metar
import pandas as pd
import numpy as np
import re
from datetime import datetime
import warnings

In [28]:
# Root folder that holds the METAR_train_part_<n> sub-directories.
base_dir = "./data/METAR_train"

# Accumulates one parsed record (dict) per METAR report.
data = []

# Silence warnings whose text starts with "Unparsed groups in body".
warnings.filterwarnings(action="ignore", message="Unparsed groups in body")

def extract_visibility(visibility_str):
    """Pull the metre count out of a textual visibility description.

    Looks for the first run of digits that is followed by whitespace and the
    word ``meters``.

    Args:
        visibility_str: Text to search, e.g. ``"10000 meters"``.

    Returns:
        The matched digits as an ``int``, or ``np.nan`` when the text holds
        no such pattern.
    """
    found = re.search(r"(\d+)\s+meters", visibility_str)
    return int(found.group(1)) if found else np.nan

def extract_knots(speed_str):
    """Pull the speed in knots out of a textual wind description.

    Looks for the first run of digits that starts on a word boundary and is
    followed by whitespace and the word ``knots``.

    Args:
        speed_str: Text to search, e.g. ``"WNW at 35 knots"``.

    Returns:
        The matched digits as an ``int``, or ``np.nan`` when the text holds
        no such pattern.
    """
    found = re.search(r"\b(\d+)\s+knots", speed_str)
    return int(found.group(1)) if found else np.nan

def process_weather_conditions(weather_data):
    """Summarise present-weather groups as per-phenomenon intensity levels.

    Args:
        weather_data: Iterable of 5-tuples
            ``(intensity, description, precipitation, obscuration, other)``.
            Each of the last four fields is a string of two-letter weather
            codes (possibly several concatenated, e.g. ``"RASN"``), an empty
            string, or ``None``.

    Returns:
        dict with the keys ``rain``, ``thunderstorm``, ``hail``, ``snow``,
        ``mist`` and ``fog``.  Each value is the strongest level seen for that
        phenomenon: 0 = not reported, 1 = light (``'-'``), 2 = moderate
        (``''``), 3 = heavy (``'+'``).  Groups carrying any other intensity
        marker (for example ``'VC'``) are ignored.
    """
    condition_map = {
        'RA': 'rain',
        'TS': 'thunderstorm',
        'GR': 'hail',
        'SN': 'snow',
        'BR': 'mist',
        'FG': 'fog'
    }
    intensity_levels = {'-': 1, '': 2, '+': 3}

    # Same keys, in the same order, as the phenomena listed in condition_map.
    conditions = dict.fromkeys(condition_map.values(), 0)

    for weather_tuple in weather_data:
        intensity, description, precipitation, obscuration, other = weather_tuple
        level = intensity_levels.get(intensity)
        if level is None:
            continue

        for element in (description, precipitation, obscuration, other):
            if not element:
                continue
            # A field may hold several concatenated two-letter codes
            # ("RASN" = rain and snow).  Walk it in two-character steps rather
            # than using a substring test, which would wrongly find "GR"
            # inside "SGRA".
            codes = element.replace('/', '')
            for start in range(0, len(codes), 2):
                key = condition_map.get(codes[start:start + 2])
                if key is not None:
                    conditions[key] = max(conditions[key], level)

    return conditions

def round_to_nearest_quarter_hour(time_obj):
    """Snap a time down to the start of its quarter hour.

    Despite the name, the minute is always rounded *down* to 0, 15, 30 or 45;
    seconds and microseconds are cleared.

    Args:
        time_obj: Object exposing ``minute`` and a ``replace`` method that
            accepts ``minute``, ``second`` and ``microsecond``.

    Returns:
        Whatever ``time_obj.replace`` returns for the truncated fields.
    """
    floored_minute = time_obj.minute - time_obj.minute % 15
    return time_obj.replace(minute=floored_minute, second=0, microsecond=0)

def parse_metar_line(metar_text, date=datetime):
    """Parse one raw METAR report into a flat feature dict.

    Args:
        metar_text: Raw METAR report string.
        date: Observation timestamp.  Its month and year are passed to the
            METAR parser, and its date and time fill the ``date`` and ``time``
            fields.  NOTE(review): the default is the ``datetime`` class
            itself, not an instance, so callers must pass a real ``datetime``
            value.

    Returns:
        A dict holding ``icao_code``, ``date``, ``time`` (truncated to the
        quarter hour), then for each of wind_speed, wind_direction,
        visibility, temperature, dewpoint, pressure, peak_wind and
        precipitation a value column and a ``<name>_missing`` flag, followed
        by the keys produced by ``process_weather_conditions``.  A reading
        that is absent from the report is stored as ``-1`` with its flag set
        to 1.  Returns ``None`` (after printing the error) if anything raises.
    """
    try:
        metar = Metar.Metar(metar_text, date.month, date.year, strict=False)
        rounded_time = round_to_nearest_quarter_hour(date.time())

        visibility_text = metar.visibility('M')
        peak_wind_text = metar.peak_wind('KT')

        # None marks "not reported".  The missing flags are derived from this
        # marker rather than from the -1 placeholder, so a genuine reading of
        # -1 (e.g. a temperature of -1 C) is not mistaken for a gap.
        readings = {
            "wind_speed": metar.wind_speed.value('KT') if metar.wind_speed else None,
            "wind_direction": metar.wind_dir.value() if metar.wind_dir else None,
            "visibility": extract_visibility(visibility_text) if visibility_text != 'missing' else None,
            "temperature": metar.temp.value('C') if metar.temp else None,
            "dewpoint": metar.dewpt.value('C') if metar.dewpt else None,
            "pressure": metar.press.value('MB') if metar.press else None,
            "peak_wind": extract_knots(peak_wind_text) if peak_wind_text != 'missing' else None,
            "precipitation": metar.precip_1hr.value() if metar.precip_1hr else None,
        }

        record = {
            "icao_code": metar.station_id,
            "date": date.date(),
            "time": rounded_time,
        }
        # Emit each value immediately followed by its flag to keep the
        # original column order.
        for name, value in readings.items():
            is_missing = value is None
            record[name] = -1 if is_missing else value
            record[f"{name}_missing"] = 1 if is_missing else 0

        return record | process_weather_conditions(metar.weather)
    except Exception as e:
        # Best-effort parsing: report the bad line and let the caller skip it.
        print(f"Failed to parse METAR: {metar_text}, Error: {e}")
        return None


# Walk each METAR_train_part_<n> directory, parse every report in every file
# and collect the resulting records in `data`.
for part in range(1, 2):  # Change to 6 eventually
    part_dir = os.path.join(base_dir, f"METAR_train_part_{part}")
    for filename in os.listdir(part_dir):
        file_path = os.path.join(part_dir, filename)

        # Read everything up front so the file is closed before parsing.
        with open(file_path, 'r') as file:
            lines = file.readlines()

        # Last processed (ICAO, timestamp); used to drop consecutive repeats.
        previous_icao = None
        previous_time = None

        # Records come in groups of three lines: the timestamp, the METAR
        # text, and a third line that is not read here.
        for i in range(0, len(lines), 3):
            # A trailing partial group has no METAR line to read.
            if i + 1 >= len(lines):
                break

            datetime_line = lines[i].strip()  # Line with datetime (3n+1)
            if not datetime_line:
                # Blank padding (typically at end of file) is not a record.
                continue

            date = datetime.strptime(datetime_line, "%Y/%m/%d %H:%M")
            metar_text = lines[i + 1].strip()  # Line with ICAO (3n+2)
            current_icao = metar_text[:4]

            # Skip redundant data
            if previous_icao == current_icao and previous_time == date:
                continue

            previous_icao = current_icao
            previous_time = date

            # Remember to add parsing for wind shear and rainfall rate here
            parsed_data = parse_metar_line(metar_text, date)

            if parsed_data:
                data.append(parsed_data)


# Materialise the collected records, preview them and write them to CSV.
df = pd.DataFrame(data)
print(df.head())
df.to_csv("output.csv", index=False)


  icao_code        date      time  wind_speed  wind_speed_missing  \
0      AGGH  2022-09-01  00:00:00        17.0                   0   
1      AYMH  2022-09-01  00:00:00         4.0                   0   
2      AYPY  2022-09-01  00:00:00         6.0                   0   
3      BGBW  2022-08-31  23:45:00         2.0                   0   
4      BGGH  2022-08-31  23:45:00        10.0                   0   

   wind_direction  wind_direction_missing  visibility  visibility_missing  \
0            90.0                       0       10000                   0   
1            -1.0                       1       10000                   0   
2            -1.0                       1       10000                   0   
3            70.0                       0       10000                   0   
4            70.0                       0       10000                   0   

   temperature  ...  peak_wind  peak_wind_missing  precipitation  \
0         31.0  ...         -1                  1     