## .CSV Coordinate Extractor
This notebook contains a script that traverses every folder and subfolder within a Data directory to locate each .csv file and extract its coordinate columns (Coord_X & Coord_Y).

In [1]:
## The script below traverses folders/files and extracts the specified coordinate columns (Coord_X & Coord_Y) ##

import os
import re

import pandas as pd

# Column-name word tokens that identify an X / longitude column and a Y / latitude column.
# (Adjust these sets if your data uses other naming conventions.)
_X_TOKENS = frozenset({'x', 'coordx', 'xcoord', 'lon', 'long', 'lng', 'longitude'})
_Y_TOKENS = frozenset({'y', 'coordy', 'ycoord', 'lat', 'latitude'})

# Splits a column name into words: acronym runs, capitalised/lowercase words, digit runs.
# Separators such as '_', '-', spaces and brackets are simply not matched.
_TOKEN_PATTERN = re.compile(r'[A-Z]+(?![a-z])|[A-Z]?[a-z]+|[0-9]+')


def _column_tokens(column_name):
    """Return the lowercase word tokens of a column name (e.g. 'Coord_X' -> {'coord', 'x'})."""
    return {token.lower() for token in _TOKEN_PATTERN.findall(str(column_name))}


def _find_xy_columns(columns):
    """Return (x_column, y_column) for the first X-like and first Y-like names, or None."""
    x_column = y_column = None
    for column in columns:
        tokens = _column_tokens(column)
        if x_column is None and tokens & _X_TOKENS:
            x_column = column
        elif y_column is None and tokens & _Y_TOKENS:
            y_column = column
        if x_column is not None and y_column is not None:
            return x_column, y_column
    return None


def extract_coordinates(input_folder, output_folder):
    """Copy the X/Y coordinate columns of every CSV under ``input_folder`` into ``output_folder``.

    The input tree is walked recursively. For each ``.csv`` file (extension matched
    case-insensitively) the first column whose name contains an X-like word
    (``x``, ``lon``, ``longitude``, ...) and the first column whose name contains a
    Y-like word (``y``, ``lat``, ``latitude``, ...) are written, in their original
    file order, to a CSV of the same name under ``output_folder``, mirroring the
    sub-folder structure of the input.

    Column names are matched on whole words (split on separators and camelCase), so
    names such as ``city``, ``index`` or ``population`` are not mistaken for
    coordinates merely because they contain the letters ``x``, ``y`` or ``lat``.

    Args:
        input_folder: Root folder to search for CSV files.
        output_folder: Folder that receives the extracted CSVs; created if missing.
            It may live inside ``input_folder`` - it is excluded from the search.

    Returns:
        None. Progress, skipped files and per-file errors are reported with ``print``;
        an error in one file does not stop the remaining files from being processed.
    """
    if not os.path.isdir(input_folder):
        # Report the problem instead of silently creating an empty output folder.
        print(f"Input folder not found: {input_folder}")
        return

    # Resolved paths, so the output folder is recognised however it was spelled.
    input_root = os.path.normcase(os.path.realpath(input_folder))
    output_root = os.path.normcase(os.path.realpath(output_folder))
    if input_root == output_root:
        # Writing into the input folder would overwrite the source files.
        print(f"Output folder must differ from input folder: {output_folder}")
        return

    # Ensure the output folder exists (create if not)
    os.makedirs(output_folder, exist_ok=True)

    # Walk through the folder and its subfolders
    for root, dirs, files in os.walk(input_folder):
        # Prune the output folder itself (and only it) so earlier results are not
        # re-processed; editing ``dirs`` in place stops os.walk from descending into it.
        dirs[:] = [
            d for d in dirs
            if os.path.normcase(os.path.realpath(os.path.join(root, d))) != output_root
        ]

        for file in files:
            # Look for CSV files, whatever the case of the extension
            if not file.lower().endswith('.csv'):
                continue

            file_path = os.path.join(root, file)

            try:
                # low_memory=False avoids the mixed-dtype warning; for very large
                # files consider passing an explicit dtype= mapping instead.
                df = pd.read_csv(file_path, low_memory=False)

                xy_columns = _find_xy_columns(df.columns)
                if xy_columns is None:
                    print(f"Skipping {file_path}: No suitable coordinate columns found.")
                    continue

                # Keep the two selected columns in the order they appear in the file
                wanted = set(xy_columns)
                coords_df = df[[col for col in df.columns if col in wanted]]

                # Preserve the folder structure inside the output folder
                relative_path = os.path.relpath(root, input_folder)
                output_path = os.path.normpath(os.path.join(output_folder, relative_path))
                os.makedirs(output_path, exist_ok=True)

                # Save the new CSV in the output folder
                output_file_path = os.path.join(output_path, file)
                coords_df.to_csv(output_file_path, index=False)
                print(f"Saved coordinates to {output_file_path}")

            except Exception as e:
                # Deliberate best-effort: report the failure and carry on with other files.
                print(f"Error processing {file_path}: {e}")

# Example usage: point both paths at your own directories before running.
output_folder = 'path/to/your/output/Coordinates/folder'  # Destination 'Coordinates' folder (change to match your directory layout)
input_folder = "path/to/your/input/folder"  # Source 'Data' folder (change to match your directory layout)

# Run the extraction with the two folders configured above
extract_coordinates(input_folder=input_folder, output_folder=output_folder)