In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Load one day of the METAR training CSV and report, per column, how many
# values are missing (only columns with at least one missing value are shown).
file_path = './data/METAR/train/2022-09-01.csv'
fuser_df = pd.read_csv(file_path)
nan_s = fuser_df.isna().sum()
has_missing = nan_s.gt(0)
print(nan_s.loc[has_missing])


u_wind    97
v_wind    97
dtype: int64


In [None]:
import re
import pandas as pd
from datetime import datetime, timezone

# Map METAR sky-condition codes to a numeric sky-cover level.
# The value chosen for each code is the lower bound of the okta range given
# in the inline comment.
cloud_cover_dict = {
    "SKC": 0,  # Sky Clear
    "CLR": 0,  # Clear
    "NSC": 0,  # No Significant Clouds
    "NCD": 0,  # No Cloud Detected
    "FEW": 1,  # Few (1/8 to 2/8 sky cover)
    "SCT": 3,  # Scattered (3/8 to 4/8 sky cover)
    "BKN": 5,  # Broken (5/8 to 7/8 sky cover)
    "OVC": 8,  # Overcast (8/8 sky cover)
    "VV": 9    # Vertical Visibility (sky obscured; ranked one step above overcast)
}

# One cloud group: a 3-letter cover code (or the 2-letter "VV"), an optional
# 3-digit height in hundreds of feet, and an optional cumulonimbus suffix.
# "[A-Z]{3}" is tried first so every token that matched before still matches
# the same way; "VV" is the fallback that the 3-letter form can never match.
_CLOUD_LAYER_RE = re.compile(r"([A-Z]{3}|VV)(\d{3})?(CB)?")


def parse_cloud_layers(cloud_layers):
    """Convert METAR cloud-group tokens into structured records for modeling.

    Args:
        cloud_layers: Iterable of cloud-group strings such as ``"FEW020"``,
            ``"BKN040CB"``, ``"NCD"`` or ``"VV005"``.

    Returns:
        A list with one dict per recognised token, in input order, with keys:
            ``sky_cover``    - value from ``cloud_cover_dict`` (``None`` when
                               the code is not in the dictionary),
            ``altitude_ft``  - height in feet (reported hundreds of feet x 100),
                               or ``None`` when no height is present,
            ``cumulonimbus`` - 1 when the token carries the ``CB`` suffix, else 0.
        Tokens that do not start with a cover code (for example ``"///"``)
        are skipped.
    """
    parsed_layers = []
    for layer in cloud_layers:
        match = _CLOUD_LAYER_RE.match(layer)
        if not match:
            continue

        cloud_code, altitude, cumulonimbus = match.groups()
        parsed_layers.append({
            "sky_cover": cloud_cover_dict.get(cloud_code),          # Numerical sky cover level
            "altitude_ft": int(altitude) * 100 if altitude else None,  # Hundreds of feet -> feet
            "cumulonimbus": 1 if cumulonimbus else 0                # Cumulonimbus presence (1 or 0)
        })
    return parsed_layers

# Compiled once at import time rather than on every call.
# Only the leading, most common METAR body groups are modelled; anything the
# pattern does not recognise simply leaves the later groups unmatched (None).
_METAR_PATTERN = re.compile(
    r'^(?P<station>[A-Z0-9]{4})\s+'                                    # Station code
    r'(?P<datetime>\d{2}\d{4}Z)\s+'                                    # Day of month + HHMM + Z
    r'(?P<cor>COR\s+)?'                                                # Optional correction indicator
    r'(?P<auto>AUTO\s+)?'                                              # Optional AUTO indicator
    r'(?P<wind>(?:(?:VRB|\d{3})\d{2,3}(?:G\d{2,3})?(?:KT|MPS|KMH)?'    # Direction, speed, optional gust/unit
    r'|/////(?:KT|MPS|KMH)?)'                                          # ...or a fully missing wind group
    r'(?:\s+\d{3}V\d{3})?)?\s*'                                        # Optional variable-direction group
    r'(?P<visibility>////(?:SM)?|CAVOK|\d{4}(?:SM|NDV)?|[0-9]+SM)?\s*' # Visibility
    # Present weather: one or more whitespace-terminated letter groups.  The
    # negative lookahead and the mandatory trailing whitespace stop this group
    # from swallowing a sky-condition code (e.g. the "FEW" of "FEW018").
    r'(?P<weather>(?:(?!(?:FEW|SCT|BKN|OVC|NSC|VV|NCD|CLR|SKC|RMK|NOSIG)\b)'
    r'[\+\-]?[A-Z]{2,6}(?:\s+|$))+)?'
    r'(?P<clouds>((FEW|SCT|BKN|OVC|NSC|VV|NCD|CLR|SKC|///)\d{0,3}(CB)?\s*)*)'  # Cloud layers
    r'(?:(?P<temperature>M?\d{2}|//)/(?P<dewpoint>M?\d{2}|//)(?:\s+|$))?'      # Optional temperature and dewpoint
    r'(?:(?P<pressure_indicator>[QA])(?P<pressure_value>\d{4}|////)(=)?\s*)?'  # Optional pressure with optional '='
    r'(?P<has_rmk>RMK\s+)?'                                            # Optional RMK section
)

# Splits an already-matched wind group into its speed digits and unit.
_WIND_RE = re.compile(r'^(?:VRB|\d{3})(?P<speed>\d{2,3})(?:G\d{2,3})?(?P<unit>KT|MPS|KMH)?')

# Intermediate regex groups that are replaced by converted fields in the result.
_FIELDS_TO_DROP = ('wind', 'clouds', 'visibility', 'pressure_indicator', 'pressure_value', 'has_rmk')


def _parse_observation_time(date_time):
    """Turn a 'YYYY/MM/DD HH:MM' string into a UTC pandas Timestamp (None on failure)."""
    try:
        parsed = datetime.strptime(date_time, "%Y/%m/%d %H:%M").replace(tzinfo=timezone.utc)
    except (ValueError, TypeError) as e:
        print(f"Error parsing date_time {date_time}: {e}")
        return None
    return pd.to_datetime(parsed, utc=True)


def _parse_celsius(raw):
    """Convert a METAR temperature token ('13', 'M04', '//') to float degrees, or None."""
    if not raw or raw == "//":
        return None
    return float(raw.replace("M", "-"))


def _visibility_to_meters(raw):
    """Convert a visibility token to whole meters, or None when missing/unreadable."""
    if not raw or raw.startswith("////"):
        return None
    if raw == "CAVOK":
        return 10000  # Convention for CAVOK
    if raw.endswith("SM"):
        return int(float(raw[:-2]) * 1609.34)  # Statute miles -> meters
    if raw.endswith("NDV"):
        raw = raw[:-3]  # 'NDV' is a suffix on the 4-digit value; the digits are still meters
    return int(raw) if raw.isdigit() else None


def _wind_speed_mps(raw):
    """Convert a wind group to mean speed in m/s; None when missing or when no unit is given."""
    if not raw or "/////" in raw:
        return None
    parts = _WIND_RE.match(raw)
    if not parts:
        return None
    speed = int(parts.group('speed'))
    unit = parts.group('unit')
    if unit == "KT":
        return round(speed * 0.514444, 2)
    if unit == "KMH":
        return round(speed / 3.6, 2)
    if unit == "MPS":
        return speed
    return None


def _pressure_hpa(indicator, value):
    """Convert the pressure group to hPa; 'A' values are hundredths of inHg, 'Q' values are hPa."""
    if not value or value == "////":
        return None
    if indicator == "A":
        return round(int(value) * 33.8639 / 100, 2)
    if indicator == "Q":
        return int(value)
    return None


def parse_metar_line(date_time, line):
    """Parse a single METAR report line into a flat dict of model-ready fields.

    Args:
        date_time: Observation timestamp as a ``"%Y/%m/%d %H:%M"`` string; it
            is interpreted as UTC.
        line: The METAR report text, starting with the 4-character station code.

    Returns:
        ``None`` (after printing a diagnostic) when ``line`` does not start
        with a station code and day/time group.  Otherwise a dict with keys:
            ``station``, ``datetime`` (raw ``DDHHMMZ`` group), ``cor``,
            ``auto`` (bool: the AUTO indicator is present),
            ``weather`` (present-weather groups joined by spaces, or None),
            ``temperature`` / ``dewpoint`` (float or None),
            ``date_time`` (UTC ``pandas.Timestamp`` or None if unparsable),
            ``cloud_layers`` (list produced by ``parse_cloud_layers``),
            ``visibility_meters`` (int or None),
            ``wind_speed_mps`` (number or None),
            ``pressure`` (hPa, or None).

    Notes:
        Behaviour corrected relative to the previous implementation:
        * ``auto`` reflects the AUTO indicator (it was compared against the
          cloud group and was therefore always False);
        * wind speed is read from the speed digits, not the first two digits
          of the direction;
        * sky-condition codes are no longer captured as ``weather``, which
          previously left clouds/temperature/pressure unparsed;
        * ``/////`` wind and ``////SM`` visibility are consumed whole, and
          ``NNNNNDV`` visibility is converted instead of discarded;
        * ``weather`` no longer carries trailing whitespace.
    """
    match = _METAR_PATTERN.match(line)
    if not match:
        print(f"Line did not match the METAR pattern: {date_time} {line}")
        return None

    data = match.groupdict()

    # Observation time comes from the separate timestamp line, not the DDHHMMZ group.
    data['date_time'] = _parse_observation_time(date_time)

    # The AUTO group is either the matched text or None.
    data['auto'] = bool(data.get('auto'))

    # Drop the separator whitespace the pattern captures after the last weather group.
    weather = data.get('weather')
    data['weather'] = weather.strip() if weather else None

    # Process cloud layers into structured data.
    clouds = data.get('clouds')
    data['cloud_layers'] = parse_cloud_layers(clouds.split()) if clouds else []

    data['temperature'] = _parse_celsius(data.get('temperature'))
    data['dewpoint'] = _parse_celsius(data.get('dewpoint'))
    data['visibility_meters'] = _visibility_to_meters(data.get('visibility'))
    data['wind_speed_mps'] = _wind_speed_mps(data.get('wind'))
    data['pressure'] = _pressure_hpa(data.get('pressure_indicator'), data.get('pressure_value'))

    # Return a copy without the raw intermediate groups.
    return {key: value for key, value in data.items() if key not in _FIELDS_TO_DROP}



In [5]:
import re
import pandas as pd
import fetchData
# Read one hourly METAR dump and parse every (timestamp line, report line) pair
# with fetchData.parse_metar_line, collecting the results into a DataFrame.
file_path = '/home/jaosn/finalProject/data/METAR/train/part_1/metar.20220901.15Z.txt'
with open(file_path, 'r', encoding='ISO-8859-1') as file:
    # Keep only non-blank lines, with surrounding whitespace removed.
    lines = [raw_line.strip() for raw_line in file]
    lines = [line for line in lines if line]
    data_entries = []
    date_time = None  # Most recent timestamp line that has not yet been paired with a report

    for line in lines:
        # A 'YYYY/MM/DD HH:MM' line starts a new record.
        if re.match(r'\d{4}/\d{2}/\d{2} \d{2}:\d{2}', line):
            date_time = line
            continue

        # Report lines with no pending timestamp are ignored.
        if not date_time:
            continue

        parsed_data = fetchData.parse_metar_line(date_time, line)
        if parsed_data:
            data_entries.append(parsed_data)
        # Each timestamp is consumed by exactly one following line.
        date_time = None

    df = pd.DataFrame(data_entries)
    print(df)
    


      station   cor   auto weather  temperature  dewpoint  \
0        AGGH  None  False     FEW          NaN       NaN   
1        AGGH  None  False     FEW          NaN       NaN   
2        AGGH  None  False     FEW          NaN       NaN   
3        AGGH  None  False     FEW          NaN       NaN   
4        AYPY  None  False     FEW          NaN       NaN   
...       ...   ...    ...     ...          ...       ...   
33801    ZYTX  None  False    None         11.0      10.0   
33802    ZYTX  None  False    None         11.0      10.0   
33803    ZYTX  None  False    None         11.0      10.0   
33804    ZYTX  None  False    None         11.0      10.0   
33805    ZYTX  None  False    None         11.0      10.0   

                      date_time cloud_layers  visibility_meters  \
0     2022-09-01 15:00:00+00:00           []             9999.0   
1     2022-09-01 15:00:00+00:00           []             9999.0   
2     2022-09-01 15:00:00+00:00           []             9999.0   

In [2]:
# Print the dtype pandas inferred for each column of `df`.
# NOTE(review): `df` is whichever frame the most recently executed cell bound
# to that name; the execution counts here are out of order, so confirm which.
print(df.dtypes) 

station                           object
cor                               object
auto                                bool
weather                           object
temperature                      float64
dewpoint                         float64
date_time            datetime64[ns, UTC]
cloud_layers                      object
visibility_meters                float64
wind_speed_mps                   float64
pressure                         float64
dtype: object


In [26]:
import re
import pandas as pd
import fetchData
from metar import Metar
# Parse one hourly METAR dump with the python-metar library and tabulate a few
# decoded fields.  Reports the library cannot decode are logged and skipped.
file_path = '/home/jaosn/finalProject/data/METAR/train/part_1/metar.20220901.00Z.txt'
with open(file_path, 'r', encoding='ISO-8859-1') as file:
    lines = file.readlines()
    lines = [line.strip() for line in lines if line.strip()]
    data_entries = []
    print(len(lines)/2)
    date_time = None  # Timestamp line of the record currently being processed

    # The file is expected to alternate: a date/time line, then a METAR report line.
    # zip() stops at the shorter slice, so a trailing unpaired line is skipped
    # instead of raising IndexError on lines[i + 1].
    for date_time, metar_data in zip(lines[::2], lines[1::2]):
        try:
            # Decode the METAR report with the Metar library.
            report = Metar.Metar(metar_data)

            # Extract the fields of interest and append them to the record list.
            data_entries.append({
                "datetime": date_time,
                "station": report.station_id,
                "temperature": report.temp.value() if report.temp else None,
                "dew_point": report.dewpt.value() if report.dewpt else None,
                "wind_speed": report.wind_speed.value("KT") if report.wind_speed else None,
                "wind_direction": report.wind_dir.value() if report.wind_dir else None,
                "visibility": report.vis.value("SM") if report.vis else None,
                "altimeter": report.press.value("IN") if report.press else None,
                "remarks": report.remarks,
            })
        except Exception as e:
            # Best effort: one undecodable report must not abort the whole file.
            print(f"Error parsing METAR report: {metar_data}. Error: {e}")

    df = pd.DataFrame(data_entries)
    print(df) 

33592.0
Error parsing METAR report: CBAR 010000Z AUTO 08007KT ////SM NCD 13/11 A2965. Error: Unparsed groups in body '////SM' while processing 'CBAR 010000Z AUTO 08007KT ////SM NCD 13/11 A2965'
Error parsing METAR report: CPIN 010000Z AUTO 12008KT ////SM NCD ///// A////. Error: Unparsed groups in body '////SM' while processing 'CPIN 010000Z AUTO 12008KT ////SM NCD ///// A////'
Error parsing METAR report: CWFD 010000Z AUTO ///// ////SM NCD 00/M04 A3000. Error: Unparsed groups in body '////SM' while processing 'CWFD 010000Z AUTO ///// ////SM NCD 00/M04 A3000'
Error parsing METAR report: CWFD 010000Z AUTO ///// ////SM NCD 00/M04 A3000. Error: Unparsed groups in body '////SM' while processing 'CWFD 010000Z AUTO ///// ////SM NCD 00/M04 A3000'
Error parsing METAR report: CWIL 010000Z AUTO ///// ////SM OVC090 ///// A////. Error: Unparsed groups in body '////SM' while processing 'CWIL 010000Z AUTO ///// ////SM OVC090 ///// A////'
Error parsing METAR report: CWIL 010000Z AUTO ///// ////SM OVC09