## .CSV Coordinate Extractor ##


This notebook contains a script that traverses each folder and subfolder to locate CSV files and extracts the coordinate columns (Coord_X and Coord_Y) from those files.
**NOTE: Ensure that the input folder path and the output directory are replaced accordingly.**

In [None]:
import os
import pandas as pd

# Lowercase header names that identify the X coordinate column.
COORD_X_VARIANTS = [
    'coord_x',
    'longitude',
    'lon',
    'long',
    'x',
    'x_coordinate',
]

# Lowercase header names that identify the Y coordinate column.
COORD_Y_VARIANTS = [
    'coord_y',
    'latitude',
    'lat',
    'y',
    'y_coordinate',
]

# Function to validate the coordinate values
def is_valid_coord_x(value):
    """Return True if *value* converts to a float within [-180, 180].

    Args:
        value: Any object; typically a cell read from a CSV column.

    Returns:
        bool: False when the value cannot be converted to a float (text,
        None, containers, integers too large for a float), when it is NaN,
        or when it falls outside the inclusive range -180..180.
    """
    try:
        value = float(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / lists / other non-numeric objects.
        # ValueError: text that is not a number.
        # OverflowError: integers too large to represent as a float.
        return False
    # NaN compares False against both bounds, so it is rejected here.
    return -180 <= value <= 180

def is_valid_coord_y(value):
    """Return True if *value* converts to a float within [-90, 90].

    Args:
        value: Any object; typically a cell read from a CSV column.

    Returns:
        bool: False when the value cannot be converted to a float (text,
        None, containers, integers too large for a float), when it is NaN,
        or when it falls outside the inclusive range -90..90.
    """
    try:
        value = float(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / lists / other non-numeric objects.
        # ValueError: text that is not a number.
        # OverflowError: integers too large to represent as a float.
        return False
    # NaN compares False against both bounds, so it is rejected here.
    return -90 <= value <= 90

# Variants shorter than this are too ambiguous to accept as a bare substring
# ('x' occurs in "index", 'lat' in "population"); they must match a whole
# token of the header instead.
_MIN_SUBSTRING_VARIANT_LEN = 5


def _padded_tokens(label):
    """Lowercase *label*, turn every non-alphanumeric run into one space, pad with spaces."""
    cleaned = "".join(ch if ch.isalnum() else " " for ch in str(label).lower())
    return f" {' '.join(cleaned.split())} "


def _match_rank(column, variants):
    """Rank how well *column* matches *variants* (lower is better, None is no match).

    0: the whole header equals a variant (ignoring case and separators);
    1: a variant appears as a complete token sequence inside the header;
    2: a long, unambiguous variant appears as a plain substring.
    """
    header = _padded_tokens(column)
    padded_variants = [_padded_tokens(variant) for variant in variants]
    if header in padded_variants:
        return 0
    if any(variant in header for variant in padded_variants):
        return 1
    name = str(column).lower()
    if any(len(variant) >= _MIN_SUBSTRING_VARIANT_LEN and variant in name
           for variant in variants):
        return 2
    return None


def _pick_coordinate_column(columns, variants):
    """Return the best-ranked column for *variants*, or None; ties keep the earliest column."""
    best_rank, best_column = None, None
    for column in columns:
        rank = _match_rank(column, variants)
        if rank is not None and (best_rank is None or rank < best_rank):
            best_rank, best_column = rank, column
    return best_column


def extract_coordinates_from_csv(file_path):
    """Extract the X and Y coordinate columns from a CSV file.

    Headers are lowercased and compared against COORD_X_VARIANTS and
    COORD_Y_VARIANTS. An exact header match is preferred over a whole-token
    match, which is preferred over a substring match of a long variant, so a
    column such as "index" or "city" is never taken for 'x' or 'y'. Rows
    whose values fail is_valid_coord_x / is_valid_coord_y are dropped.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A new DataFrame holding the selected X column, the selected Y column
        (both under their lowercased header names) and a 'Source_File' column
        with the file's base name; or None when the file has no recognisable
        pair of coordinate columns or could not be processed (the error is
        printed).
    """
    try:
        df = pd.read_csv(file_path, low_memory=False)

        # Normalize column names to lowercase to ensure case-insensitivity
        df.columns = df.columns.str.lower()
        # Lowercasing can make two headers collide (e.g. "X" and "x"); keep
        # the first so that selecting by label yields a Series.
        df = df.loc[:, ~df.columns.duplicated()]

        coord_x_column = _pick_coordinate_column(df.columns, COORD_X_VARIANTS)
        # The X column is excluded so one column is never used for both axes.
        coord_y_column = _pick_coordinate_column(
            [col for col in df.columns if col != coord_x_column],
            COORD_Y_VARIANTS,
        )
        if coord_x_column is None or coord_y_column is None:
            return None

        # astype(bool) keeps the masks boolean for header-only files, where
        # apply() on an empty column keeps the column's dtype.
        valid_coord_x = df[coord_x_column].apply(is_valid_coord_x).astype(bool)
        valid_coord_y = df[coord_y_column].apply(is_valid_coord_y).astype(bool)

        # Keep only rows where both coordinates are valid; copy() so the new
        # column below is added to an independent frame, not a slice of df.
        coordinates_df = df.loc[
            valid_coord_x & valid_coord_y, [coord_x_column, coord_y_column]
        ].copy()

        # Add the original file name as a new column
        coordinates_df['Source_File'] = os.path.basename(file_path)
        return coordinates_df
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort a whole
        # directory scan; the failure is reported and the file is skipped.
        print(f"Error reading {file_path}: {e}")
        return None

def process_directory(input_dir, output_dir):
    """Collect coordinates from every CSV under *input_dir* into one CSV file.

    Walks *input_dir* recursively, calls extract_coordinates_from_csv on each
    file whose name ends in '.csv' (case-insensitive), concatenates the
    results and writes them to 'all_coordinates.csv' inside *output_dir*.

    Args:
        input_dir: Root folder to scan.
        output_dir: Folder that receives 'all_coordinates.csv'; created if it
            does not exist. It may be located inside *input_dir*.

    Returns:
        None. Progress and the outcome are printed.
    """
    os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist

    output_file = os.path.join(output_dir, 'all_coordinates.csv')
    output_key = os.path.normcase(os.path.abspath(output_file))

    all_coordinates = []  # One DataFrame per CSV file that yielded coordinates

    for dirpath, dirnames, filenames in os.walk(input_dir):
        # Sorting in place makes os.walk descend in a stable order, so the
        # combined file does not depend on filesystem listing order.
        dirnames.sort()
        for filename in sorted(filenames):
            if not filename.lower().endswith('.csv'):
                continue
            file_path = os.path.join(dirpath, filename)

            # When output_dir lies inside input_dir, the result of a previous
            # run would otherwise be read back in and duplicate every row.
            if os.path.normcase(os.path.abspath(file_path)) == output_key:
                continue

            print(f"Processing {file_path}")
            coordinates = extract_coordinates_from_csv(file_path)
            if coordinates is not None:
                all_coordinates.append(coordinates)

    if not all_coordinates:
        print("No coordinates found in the CSV files.")
        return

    # ignore_index=True already gives the combined frame a fresh 0..n-1 index.
    combined_coordinates = pd.concat(all_coordinates, ignore_index=True)
    combined_coordinates.to_csv(output_file, index=False)
    print(f"All coordinates have been saved to {output_file}")

# Root folder to scan; replace the placeholder with your own folder path.
input_dir = "path/to/your/input/folder"
# Results are written to a 'Coordinates' subfolder of the input folder.
output_dir = os.path.join(input_dir, "Coordinates")

# Run the extraction over the whole folder tree.
process_directory(input_dir=input_dir, output_dir=output_dir)
