In [1]:
import pandas as pd
import numpy as np
import os
import sys
import re

from matplotlib import pyplot as plt
from dotenv import load_dotenv

# Add the parent directory to the module search path.
sys.path.append("../")

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Use the 'Solarize_Light2' matplotlib style sheet for plots.
plt.style.use('Solarize_Light2')

def get_default_dpi(fallback: int = 100) -> int:
    """Return the figure DPI configured through the ``DPI`` environment variable.

    Args:
        fallback: Value returned when ``DPI`` is unset, is not an integer,
            or is not strictly positive.

    Returns:
        The DPI as a positive ``int``.
    """
    try:
        # os.getenv returns None when the variable is unset (-> TypeError);
        # a non-numeric string such as "high" raises ValueError instead.
        dpi = int(os.getenv('DPI'))
    except (TypeError, ValueError):
        return fallback
    # A zero or negative DPI cannot be used to render a figure.
    return dpi if dpi > 0 else fallback


# Default DPI: read from the environment (which load_dotenv() may have populated),
# falling back to 100 when it is missing or invalid.
pc_dpi = get_default_dpi()


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the pickled DataFrame (presumably the rows whose coordinate conversion failed,
# going by the file name — confirm against whatever wrote this file).
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle(filepath_or_buffer="../datasets/conversion_failures.pkl")


In [3]:
# Preview the first rows of the loaded frame.
df.head()


Unnamed: 0_level_0,type,coordinates,latitude,longitude,fall_country,weathering_g,shock_stage,mag_sus,fs_content,wo_content,fa_content,tsm
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Grove Mountains 020351,L6,"(nan, nan)","72°59'02""S","75°14'47""E",Antarctica,W1,S1,,20.6,1.4,24.4,
Grove Mountains 022777,L6,"(nan, nan)","72°46'30""S","75°19'47""E",Antarctica,W1,S4,,20.6,1.4,24.2,
Grove Mountains 053498,L6,"(nan, nan)","72°46'44""S","75°19'00""E",Antarctica,W1,S3,,20.3,1.7,23.6,
Grove Mountains 051612,H5,"(nan, nan)","72°49'46""S","75°16'43""E",Antarctica,W2,S2,,17.2,1.4,19.7,
Grove Mountains 054458,H6,"(nan, nan)","72°54'57""S","75°06'03""E",Antarctica,W1,S1,,16.9,1.6,19.5,


# Okay, so these are some (not all) of the cases where conversion failed even though both the latitude and longitude are present
- Let's take the two functions in charge of handling this
- Let's make a function to show the errors so we don't have to type the whole filter every time we need to check

In [4]:
import re
import numpy as np

def dms_to_decimal(dms: str) -> float:
    """
    Function :
        - Converts one coordinate (latitude or longitude) to signed decimal degrees.
        Values that already parse as a number are returned as floats.
        Supported textual layouts (whitespace is ignored, '' is read as "):
            DD.dd°N            degrees only
            DD°MM.mm'N         degrees and decimal minutes
            DD°MM'SS.ss"N      degrees, minutes and optional seconds
        Patterns are anchored at the start of the string only, so characters
        following the direction letter are ignored.
    Args :
        - dms : the coordinate as a string (None, NaN and empty strings are accepted)
    Returns :
        - The decimal value (negative for S and W), or np.nan when the input
        is missing or matches none of the supported layouts
    """
    if dms is None:
        return np.nan

    # On the off chance it's already in a decimal format:
    try:
        value = float(dms)
    except (TypeError, ValueError):
        # TypeError: not a string/number at all; ValueError: text that is not a plain number.
        pass
    else:
        # Always hand back the np.nan singleton for NaN so callers see a single missing marker.
        return np.nan if np.isnan(value) else value

    text = str(dms).strip()
    if text.lower() in ("nan", ""):
        return np.nan

    dms_cleaned = re.sub(r"\s+", "", text.replace("''", '"'))

    # Match various DMS patterns, from the simplest layout to the most complete one
    patterns = (
        r"(\d+\.?\d*)°([NSWE])",  # Matches simple degrees with direction
        r"(\d+)°(\d+\.?\d*)'([NSWE])",  # Matches degrees and decimal minutes with direction
        r"(\d+)°(\d+)'(\d*\.?\d*)?\"?([NSWE])",  # Matches full DMS with optional seconds
    )

    for pattern in patterns:
        match = re.match(pattern, dms_cleaned)
        if not match:
            continue

        # The last group is always the direction; the groups before it are
        # degrees, then (when present) minutes, then seconds.
        *numbers, direction = match.groups()
        try:
            # An empty group (seconds omitted before the " mark) counts as 0.
            degrees, minutes, seconds = (
                [float(number) if number else 0.0 for number in numbers] + [0.0, 0.0]
            )[:3]
        except ValueError:
            # e.g. a seconds group made of a lone "." — not a usable number.
            continue

        # Calculate decimal value
        decimal = degrees + minutes / 60 + seconds / 3600
        if direction in ("S", "W"):
            decimal *= -1
        return decimal

    return np.nan



def handle_coordinates(latitude: str, longitude: str) -> tuple:
    """
    Function :
        - Converts lat/lon in degrees, minutes, seconds to floating decimals (+/-)
        and returns it as a tuple of lat/lon (decimals)
    Args :
        - latitude : a string AB°CD'EF.GH"N|S
        - longitude : a string IJ°KL'MN.OP"E|W
    Returns :
        - Tuple of lat/lon (decimals), or (np.nan, np.nan) as soon as either
        value is missing or could not be converted
    """

    lat_decimal = dms_to_decimal(latitude)
    lon_decimal = dms_to_decimal(longitude)

    # NaN must be detected by value: `x is not np.nan` only recognises the np.nan
    # singleton and lets any other NaN object (e.g. float("nan")) through.
    if np.isnan(lat_decimal) or np.isnan(lon_decimal):
        return (np.nan, np.nan)  # incomplete coordinates will yield a full error if just one param is na
    return (lat_decimal, lon_decimal)


In [5]:
def show_conversion_errors(df: pd.DataFrame) -> pd.DataFrame:
    """
    Function :
        - Filters the rows whose coordinates are (nan, nan) although both the
        latitude and the longitude are filled in, i.e. the failed conversions
    Args :
        - df : a DataFrame with "coordinates", "latitude" and "longitude" columns
    Returns :
        - The subset of df made of the failed conversions (all columns kept)
    """

    def _conversion_failed(coords) -> bool:
        # Compare by value: `coords == (np.nan, np.nan)` only matches when both NaNs
        # are the np.nan object itself, which is not the case for a freshly
        # unpickled frame.
        return (
            isinstance(coords, tuple)
            and len(coords) == 2
            and all(isinstance(value, float) and np.isnan(value) for value in coords)
        )

    # astype(bool) keeps the mask boolean even when df is empty
    conversion_failed = df["coordinates"].map(_conversion_failed).astype(bool)
    has_both_inputs = df["latitude"].notna() & df["longitude"].notna()
    return df[conversion_failed & has_both_inputs]


In [6]:
# Recompute the coordinates column row by row with handle_coordinates;
# this overwrites the existing column on df.
df["coordinates"] = df.apply(lambda row: handle_coordinates(row["latitude"], row["longitude"]), axis=1)


In [7]:
# Rows whose coordinates are (nan, nan) although both latitude and longitude are present.
show_conversion_errors(df=df)

Unnamed: 0_level_0,type,coordinates,latitude,longitude,fall_country,weathering_g,shock_stage,mag_sus,fs_content,wo_content,fa_content,tsm
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Forrest 029,H35,"(nan, nan)",30°43.89'5,127°56.35'E,Australia,C,S3,,17.7,,19.1,
Jiddat al Harasis 423,H4,"(nan, nan)",19°46.997N,56°24.836'E,Oman,W3,S3,,15.1,1.1,17.8,1457.3


These two cases are examples of entries we can't really act on, because we'd have to adjust them on a case-by-case basis (029 has no direction; 423 has a ... doubtful structure). Neither of these errors can be corrected with certainty.