In [61]:
import pandas as pd
import numpy as np
import sqlalchemy
import geopandas as gpd
from sqlalchemy import create_engine, text
from geoalchemy2 import Geometry
from dotenv import load_dotenv
from urllib.parse import quote 
import os, glob
from collections import defaultdict
from shapely.geometry import Point, LineString
from itertools import groupby
from operator import itemgetter


In [62]:
# Read the PostgreSQL connection settings from the environment.
# load_dotenv() runs first so that values from a .env file (if one is found)
# are visible to os.getenv; any variable that is not set comes back as None.
load_dotenv()
DBUSER, DBPASS, DBHOST, DBPORT, DBNAME = (
    os.getenv(setting) for setting in ('DBUSER', 'DBPASS', 'DBHOST', 'DBPORT', 'DBNAME')
)

# db_settings = {
#    'dbname': DBNAME,
#    'user': DBUSER,
#    'password': DBPASS,
#    'host': DBHOST,
#    'port': DBPORT,
    #'options': "-c statement_timeout=0",
    #'keepalives': 1,
    #'keepalives_idle': 30,  # Number of seconds of inactivity after which a keepalive message is sent
    #'keepalives_interval': 10,  # Number of seconds between keepalive messages when no response is received
    #'keepalives_count': 5  # Number of attempts before concluding the connection is dead
#}
# Function to establish connection to PostgreSQL
# def connect_db(settings):
#    conn = psycopg2.connect(**settings)
    # conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
#    return conn

# connect_db(db_settings)

In [90]:
# Stop with a readable message when a setting is absent from the environment;
# otherwise the string concatenation below fails with an opaque TypeError on None.
_missing_settings = [
    name for name, value in (
        ('DBUSER', DBUSER), ('DBPASS', DBPASS), ('DBHOST', DBHOST),
        ('DBPORT', DBPORT), ('DBNAME', DBNAME),
    ) if value is None
]
if _missing_settings:
    raise RuntimeError('Missing database settings: ' + ', '.join(_missing_settings))

# Connection-string template with a %s slot for the password. Kept unchanged for
# any code that still formats it; the engine below no longer depends on it.
PG_CONNECTION_STRING = 'postgresql://' + DBUSER + ':%s@' + DBHOST + ':' + DBPORT + '/' + DBNAME

# Build the engine from URL.create() instead of the %-formatted template: every
# component is passed separately, so special characters in the user name or
# password ('@', ':', '/', '%') need no manual percent-encoding and cannot break
# the string formatting.
engine = create_engine(
    sqlalchemy.engine.URL.create(
        drivername='postgresql',
        username=DBUSER,
        password=DBPASS,
        host=DBHOST,
        port=int(DBPORT),
        database=DBNAME,
    ),
    echo=True,  # log every emitted SQL statement to the cell output
)

In [16]:
# Read the source CSV
data_dir = '../data_src/'
data_file = 'GLODAPv2.2023_Indian_Ocean.csv'
df = pd.read_csv(data_dir + data_file)

# Strip the leading 'G2' from every column name that carries it
df.columns = [name[2:] if name.startswith('G2') else name for name in df.columns]

# -9999 is the missing-value marker; turn it into NaN
df = df.replace(-9999, np.nan)

# Assemble one UTC timestamp from the date/time parts; rows whose parts cannot
# be combined into a valid timestamp become NaT (errors='coerce')
date_parts = df[['year', 'month', 'day', 'hour', 'minute']]
df['datetime'] = pd.to_datetime(date_parts, errors='coerce', utc=True)

# 'year' and 'month' stay as columns; the finer-grained parts are dropped
df = df.drop(['day', 'hour', 'minute'], axis=1)

def rename_flags(col):
    """Move a trailing 'f' / 'qc' / 'err' marker of a column name to the front.

    The suffixes are tried in the order 'f', 'qc', 'err'; the first one that
    matches is removed and the name is returned as 'flag_<rest>', 'qc_<rest>'
    or 'err_<rest>' respectively. Names with none of these suffixes are
    returned unchanged.
    """
    for suffix, prefix in (('f', 'flag'), ('qc', 'qc'), ('err', 'err')):
        if col.endswith(suffix):
            return f"{prefix}_{col[:-len(suffix)]}"
    return col

# Apply the flag/qc/err renaming to every column
df.columns = [rename_flags(col) for col in df.columns]

# Create Point geometries.
# points_from_xy builds the whole geometry column in one vectorized call instead
# of constructing one shapely Point per row through DataFrame.apply(axis=1).
df['geom'] = gpd.points_from_xy(df['longitude'], df['latitude'])


In [17]:
# Print the text repr of the cleaned frame (pandas abbreviates long/wide frames)
print(df)

            expocode  cruise  station  region  cast    year  month  latitude  \
0       09AR19910925    65.0      8.0    16.0   1.0  1991.0   10.0  -45.1830   
1       09AR19910925    65.0      8.0    16.0   1.0  1991.0   10.0  -45.1830   
2       09AR19910925    65.0      8.0    16.0   1.0  1991.0   10.0  -45.1830   
3       09AR19910925    65.0      8.0    16.0   1.0  1991.0   10.0  -45.1830   
4       09AR19910925    65.0      8.0    16.0   1.0  1991.0   10.0  -45.1830   
...              ...     ...      ...     ...   ...     ...    ...       ...   
167743  49NZ20191205  4062.0     69.0    16.0   1.0  2019.0   12.0  -20.4961   
167744  49NZ20191205  4062.0     69.0    16.0   1.0  2019.0   12.0  -20.4961   
167745  49NZ20191205  4062.0     69.0    16.0   1.0  2019.0   12.0  -20.4961   
167746  49NZ20191205  4062.0     69.0    16.0   1.0  2019.0   12.0  -20.4961   
167747  49NZ20191205  4062.0     69.0    16.0   1.0  2019.0   12.0  -20.4961   

        longitude  bottomdepth  ...  fl

In [18]:
# List every column name as it stands after the G2-prefix removal and the
# flag/qc/err renaming
print(df.columns.tolist())

['expocode', 'cruise', 'station', 'region', 'cast', 'year', 'month', 'latitude', 'longitude', 'bottomdepth', 'maxsampdepth', 'bottle', 'pressure', 'depth', 'temperature', 'theta', 'salinity', 'flag_salinity', 'qc_salinity', 'sigma0', 'sigma1', 'sigma2', 'sigma3', 'sigma4', 'gamma', 'oxygen', 'flag_oxygen', 'qc_oxygen', 'aou', 'flag_aou', 'nitrate', 'flag_nitrate', 'qc_nitrate', 'nitrite', 'flag_nitrite', 'silicate', 'flag_silicate', 'qc_silicate', 'phosphate', 'flag_phosphate', 'qc_phosphate', 'tco2', 'flag_tco2', 'qc_tco2', 'talk', 'flag_talk', 'qc_talk', 'fco2', 'flag_fco2', 'fco2temp', 'phts25p0', 'flag_phts25p0', 'phtsinsitutp', 'flag_phtsinsitutp', 'qc_phts', 'cfc11', 'pcfc11', 'flag_cfc11', 'qc_cfc11', 'cfc12', 'pcfc12', 'flag_cfc12', 'qc_cfc12', 'cfc113', 'pcfc113', 'flag_cfc113', 'qc_cfc113', 'ccl4', 'pccl4', 'flag_ccl4', 'qc_ccl4', 'sf6', 'psf6', 'flag_sf6', 'qc_sf6', 'c13', 'flag_c13', 'qc_c13', 'c14', 'flag_c14', 'err_c14', 'h3', 'flag_h3', 'err_h3', 'he3', 'flag_he3', 'err_

In [None]:
# Convert to GeoDataFrame for easy handling and direct export to PostGIS.
# The 'geom' column (points built from longitude/latitude) becomes the geometry
# column and is tagged with the EPSG:4326 coordinate reference system.
gdf = gpd.GeoDataFrame(df, geometry='geom', crs="EPSG:4326")


In [25]:
# Read the cruise metadata table and keep only the cruises whose expocode
# occurs in the sample frame df
cruise_df = pd.read_csv('../data/glodapv2_2023_cruise_metadata.csv')
has_samples = cruise_df['expocode'].isin(df['expocode'])
cruise1 = cruise_df[has_samples]
print(cruise1)

          expocode  start_date    end_date  \
401   09AR19910925  1991-09-25  1991-10-27   
402   09AR19941213  1994-12-13  1995-02-01   
403   09AR19960822  1996-08-22  1996-09-21   
404   09AR19980228  1998-02-28  1998-04-01   
405   09AR20011029  2001-10-29  2001-12-13   
...            ...         ...         ...   
992   35MF19820626  1982-06-26  1982-07-03   
993   35MF19821003  1982-10-03  1982-10-07   
994   49NZ20191229  2019-12-29  2020-02-10   
1057  49HH20091106  2009-11-06  2010-01-09   
1058  49NZ20191205  2019-12-05  2019-12-27   

                                                    map          legs  region  \
401   https://www.ncei.noaa.gov/access/ocean-carbon-...  09AR19910925  Indian   
402   https://www.ncei.noaa.gov/access/ocean-carbon-...  09AR19941213  Indian   
403   https://www.ncei.noaa.gov/access/ocean-carbon-...  09AR19960822  Indian   
404   https://www.ncei.noaa.gov/access/ocean-carbon-...  09AR19980228  Indian   
405   https://www.ncei.noaa.gov/access/oce

In [44]:
# Dictionary to accumulate coordinates for cruises across files
cruise_coords = defaultdict(list)

# Metadata lookup: expocode -> {column name: value} for every row of cruise1
cruise_dict = cruise1.set_index('expocode').to_dict(orient='index')
print(cruise_dict)

{'09AR19910925': {'start_date': '1991-09-25', 'end_date': '1991-10-27', 'map': 'https://www.ncei.noaa.gov/access/ocean-carbon-acidification-data-system/oceans/PACIFICA/maps/09AR19910925.png', 'legs': '09AR19910925', 'region': 'Indian', 'alias': '09AR9101_1, WOCE SR03', 'ship': 'Aurora Australis', 'chief_scientist': 'S. Rintoul', 'carbon_PI': nan, 'hydrography_PI': 'S. Rintoul', 'oxygen_PI': 'S. Rintoul', 'nutrients_PI': 'S. Rintoul', 'cfc_PI': 'J. Bullister', 'organics_PI': nan, 'isotopes_PI': nan, 'other_pi': nan, 'measurements': 'CTDTMP, CTDSAL, SALNTY, CTDOXY, OXYGEN, SILCAT, NO2+NO3, PHSPHT, CFC-11, CFC-12, THETA', 'cruise_references': nan, 'data_files': 'https://www.ncei.noaa.gov/data/oceans/ncei/ocads/data/0116370/', 'metadata_report': 'https://www.ncei.noaa.gov/data/oceans/ncei/ocads/metadata/0116370.html', 'QC_details': 'https://glodapv2.geomar.de/adjustments/show/OMEX2'}, '09AR19941213': {'start_date': '1994-12-13', 'end_date': '1995-02-01', 'map': 'https://www.ncei.noaa.gov/a

In [45]:
# Accumulate points for LineString per cruise
track_columns = ['longitude', 'latitude', 'expocode', 'station', 'region', 'cast', 'datetime']
for expocode, group in gdf.groupby('expocode'):
    # Order the cruise's samples by timestamp, then append them as plain row lists
    ordered_rows = group.sort_values('datetime')[track_columns].values.tolist()
    cruise_coords[expocode].extend(ordered_rows)

In [None]:
# Create cruise linestring
track_columns = ['longitude', 'latitude', 'expocode', 'station', 'region', 'cast', 'datetime']
meta_fields = ('region', 'ship', 'chief_scientist', 'start_date', 'end_date')

lines = []
for expocode, coords in cruise_coords.items():
    # Rebuild a frame from the accumulated rows and order it by
    # expocode, then cast, then datetime
    coords_df = pd.DataFrame(coords, columns=track_columns)
    coords_df = coords_df.sort_values(by=['expocode', 'cast', 'datetime'])

    # Collapse each run of identical consecutive positions into one vertex;
    # a position that is revisited later in the sequence is kept as a new vertex
    positions = coords_df[['longitude', 'latitude']].itertuples(index=False, name=None)
    unique_coords = [(float(lon), float(lat)) for (lon, lat), _run in groupby(positions)]

    # Two or more vertices form a LineString; a lone vertex is stored as a Point
    line_geom = LineString(unique_coords) if len(unique_coords) >= 2 else Point(unique_coords[0])

    # Cruises without a metadata entry get None for every metadata field
    meta = cruise_dict.get(expocode, {})
    lines.append({
        'expocode': expocode,
        **{field: meta.get(field) for field in meta_fields},
        'geom': line_geom,
    })

print(lines)

[{'expocode': '096U20150321', 'region': 'Indian', 'ship': 'Investigator', 'chief_scientist': 'Tom Trull', 'start_date': '2015-03-21', 'end_date': '2015-03-30', 'geom': <LINESTRING (142.17 -46.645, 143.99 -47.126, 141.57 -46.842)>}, {'expocode': '096U20160108', 'region': 'Indian', 'ship': 'Investigator', 'chief_scientist': 'Mike Coffin', 'start_date': '2016-01-08', 'end_date': '2016-02-27', 'geom': <LINESTRING (103.65 -36.364, 99.413 -39.483, 79.366 -48.28, 78.599 -49.894, ...>}, {'expocode': '096U20160314', 'region': 'Indian', 'ship': 'Investigator', 'chief_scientist': 'Pete G. Strutton', 'start_date': '2016-03-14', 'end_date': '2016-04-13', 'geom': <LINESTRING (143.71 -46.001, 141.93 -46.773, 147.35 -50.383, 147.08 -50.385,...>}, {'expocode': '096U20180111', 'region': 'Indian', 'ship': 'Investigator', 'chief_scientist': 'Mark Rosenberg, Steve Rintoul', 'start_date': '2018-01-11', 'end_date': '2018-02-22', 'geom': <LINESTRING (146.32 -44.002, 146.29 -44.05, 146.22 -44.12, 146.19 -44.38

In [59]:
# Print the complete geometry (as WKT) of the second entry in lines
print(lines[1]['geom'])


LINESTRING (103.65 -36.364, 99.413 -39.483, 79.366 -48.28, 78.599 -49.894, 77.729 -50.24, 77.024 -50.542, 76.188 -50.691, 76.014 -50.724, 75.959 -50.736, 75.782 -50.789, 75.617 -50.817, 75.38 -50.899, 74.594 -51.098, 73.81 -51.287, 73.001 -51.506, 71.362 -52.927, 72.659 -53.032, 72.627 -52.998, 72.552 -53.035, 72.593 -53.073, 72.555 -52.984, 73.665 -54.167, 73.99 -53.06, 73.64 -53.213, 73.316 -53.281, 73.239 -52.96, 73.722 -53.005, 73.607 -53.003, 72.66 -53.032, 72.661 -53.033, 72.662 -53.035, 72.661 -53.029, 72.669 -53.034, 72.662 -53.039, 72.653 -53.034, 72.661 -53.034, 72.662 -53.039, 74.021 -52.922, 74.402 -52.81, 74.792 -52.697, 75.262 -52.542, 75.607 -52.41, 76.006 -52.302, 74.323 -52.838, 73.709 -53.007, 73.716 -53.012, 73.724 -53.007, 73.717 -53.003, 73.716 -53.008)


In [58]:
# Rows of cruise 096U20160108 with repeated (longitude, latitude) pairs dropped,
# i.e. the first sample row found at each distinct position
print(df[df['expocode'] == '096U20160108'].drop_duplicates(['longitude','latitude']))

            expocode  cruise  station  region  cast    year  month  latitude  \
131787  096U20160108  1018.0      1.0    16.0   1.0  2016.0    1.0   -36.364   
131811  096U20160108  1018.0      2.0    16.0   1.0  2016.0    1.0   -39.483   
131835  096U20160108  1018.0      4.0    16.0   1.0  2016.0    1.0   -48.280   
131859  096U20160108  1018.0      6.0    16.0   1.0  2016.0    1.0   -49.894   
131883  096U20160108  1018.0      7.0    16.0   1.0  2016.0    1.0   -50.240   
131907  096U20160108  1018.0      8.0    16.0   1.0  2016.0    1.0   -50.542   
131931  096U20160108  1018.0      9.0    16.0   1.0  2016.0    1.0   -50.691   
131955  096U20160108  1018.0     10.0    16.0   1.0  2016.0    1.0   -50.724   
131979  096U20160108  1018.0     11.0    16.0   1.0  2016.0    1.0   -50.736   
132002  096U20160108  1018.0     12.0    16.0   1.0  2016.0    1.0   -50.789   
132026  096U20160108  1018.0     13.0    16.0   1.0  2016.0    1.0   -50.817   
132046  096U20160108  1018.0     14.0   

In [91]:
# Initialize PostGIS and create tables
# Only the CREATE EXTENSION statement is executed by this cell; the table and
# index DDL further down sits inside a string literal and is never run.
with engine.begin() as conn:  # use engine.begin() to ensure the commands run in a transaction
    conn.execute(text("CREATE EXTENSION IF NOT EXISTS postgis;"))

# The triple-quoted block below is a bare string expression used to disable the
# hand-written DDL. Nothing in it is executed; as the last expression of the
# cell its text is merely echoed as the cell's output value.
'''
    conn.execute(text("""
    CREATE TABLE IF NOT EXISTS cruisev2_2023 (
        expocode TEXT PRIMARY KEY,
        region TEXT,
        ship TEXT,
        chief_scientist TEXT,
        start_date TEXT,
        end_date TEXT,
        geom GEOMETRY(LINESTRING, 4326)
    );
    """))

    conn.execute(text("""
    CREATE TABLE IF NOT EXISTS glodapv2_2023 (
        id SERIAL PRIMARY KEY,
        expocode TEXT REFERENCES cruisev2_2023(expocode),
        cruise INT,
        station INT,
        region INT,
        cast_number INT,
        year INT,
        month INT,
        datetime TIMESTAMP WITH TIME ZONE,
        latitude FLOAT,
        longitude FLOAT,
        bottomdepth FLOAT,
        maxsampdepth FLOAT,
        bottle FLOAT,
        pressure FLOAT,
        depth FLOAT,
        geom GEOMETRY(POINT, 4326)
    );
    CREATE INDEX IF NOT EXISTS idx_glodap_expocode ON glodapv2_2023(expocode);
    CREATE INDEX IF NOT EXISTS idx_glodap_datetime ON glodapv2_2023(datetime);
    CREATE INDEX IF NOT EXISTS idx_glodap_longitude ON glodapv2_2023(longitude);
    CREATE INDEX IF NOT EXISTS idx_glodap_latitude ON glodapv2_2023(latitude);
    CREATE INDEX IF NOT EXISTS idx_glodap_depth ON glodapv2_2023(depth);
    CREATE INDEX IF NOT EXISTS idx_cruise_expocode ON cruisev2_2023(expocode);
    """))
'''


2025-03-21 15:30:59,738 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-03-21 15:30:59,738 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-03-21 15:30:59,739 INFO sqlalchemy.engine.Engine select current_schema()
2025-03-21 15:30:59,739 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-03-21 15:30:59,740 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-03-21 15:30:59,740 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-03-21 15:30:59,740 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-03-21 15:30:59,741 INFO sqlalchemy.engine.Engine CREATE EXTENSION IF NOT EXISTS postgis;
2025-03-21 15:30:59,741 INFO sqlalchemy.engine.Engine [generated in 0.00032s] {}
2025-03-21 15:30:59,742 INFO sqlalchemy.engine.Engine COMMIT


'\n    conn.execute(text("""\n    CREATE TABLE IF NOT EXISTS cruisev2_2023 (\n        expocode TEXT PRIMARY KEY,\n        region TEXT,\n        ship TEXT,\n        chief_scientist TEXT,\n        start_date TEXT,\n        end_date TEXT,\n        geom GEOMETRY(LINESTRING, 4326)\n    );\n    """))\n\n    conn.execute(text("""\n    CREATE TABLE IF NOT EXISTS glodapv2_2023 (\n        id SERIAL PRIMARY KEY,\n        expocode TEXT REFERENCES cruisev2_2023(expocode),\n        cruise INT,\n        station INT,\n        region INT,\n        cast_number INT,\n        year INT,\n        month INT,\n        datetime TIMESTAMP WITH TIME ZONE,\n        latitude FLOAT,\n        longitude FLOAT,\n        bottomdepth FLOAT,\n        maxsampdepth FLOAT,\n        bottle FLOAT,\n        pressure FLOAT,\n        depth FLOAT,\n        geom GEOMETRY(POINT, 4326)\n    );\n    CREATE INDEX IF NOT EXISTS idx_glodap_expocode ON glodapv2_2023(expocode);\n    CREATE INDEX IF NOT EXISTS idx_glodap_datetime ON glodap

In [94]:
# Load cruise metadata and normalise the headers: lower case, with spaces and
# slashes replaced by underscores
cruise_df = pd.read_csv('../data/glodapv2_2023_cruise_metadata.csv')
normalised_columns = cruise_df.columns.str.lower()
normalised_columns = normalised_columns.str.replace(' ', '_')
normalised_columns = normalised_columns.str.replace('/', '_')
cruise_df.columns = normalised_columns

# Metadata lookup: expocode -> {column name: value}
cruise_dict = cruise_df.set_index('expocode').to_dict(orient='index')

# Add geometry placeholder to cruise metadata (filled in once the tracks exist)
cruise_df['geom'] = None
cruise_gdf = gpd.GeoDataFrame(cruise_df, geometry='geom', crs="EPSG:4326")

# Explicit SQL types for the key text columns and the geometry column
# NOTE(review): geopandas may derive the geometry column type on its own instead
# of using the 'geom' entry below — confirm the type of the created column.
text_fields = ('expocode', 'region', 'ship', 'chief_scientist', 'start_date', 'end_date')
cruise_dtypes = {field: sqlalchemy.types.TEXT for field in text_fields}
cruise_dtypes['geom'] = Geometry('LINESTRING', srid=4326)

# Writing zero rows with if_exists='replace' (re)creates the table definition only
empty_cruises = cruise_gdf.head(0)
empty_cruises.to_postgis('cruisev2_2023', engine, if_exists='replace', index=False,
                         dtype=cruise_dtypes)

# Insert the metadata rows themselves, without the (still empty) geometry column
cruise_rows = cruise_gdf.drop(columns=['geom'])
cruise_rows.to_sql('cruisev2_2023', engine, if_exists='append', index=False)

# Process CSV files: start from an empty coordinate store and list the inputs
cruise_coords = {}
data_dir = '../data_src/'
csv_files = sorted(glob.glob(data_dir + '*_Ocean.csv'))
print(csv_files)

2025-03-21 15:37:00,149 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-03-21 15:37:00,151 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_namespace.nspname = %(nspname_1)s
2025-03-21 15:37:00,151 INFO sqlalchemy.engine.Engine [cached since 332.6s ago] {'table_name': 'cruisev2_2023', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'public'}
2025-03-21 15:37:00,154 INFO sqlalchemy.engine.Engine 
CREATE TABLE public.cruisev2_2023 (
	expocode TEXT, 
	start_date TEXT, 
	end_date TEXT, 
	map TEXT, 
	legs TEXT, 
	region TEXT, 
	alias TEXT, 
	ship TEXT, 
	chief_scientist TEXT, 
	carbon_pi TEXT, 
	hydrography_pi TEXT, 
	oxygen_pi TE

In [95]:
# Stream every ocean CSV into the glodapv2_2023 table in chunks of 500000 rows
# and collect, per cruise, the (longitude, latitude, datetime) of every sample
# in cruise_coords for the later track construction.
for csv_file in csv_files:
    print(f'Processing {csv_file}')
    for chunk in pd.read_csv(csv_file, chunksize=500000):
        # Strip the 'G2' column prefix and turn the -9999 marker into NaN
        chunk.columns = [col[2:] if col.startswith('G2') else col for col in chunk.columns]
        chunk.replace(-9999, np.nan, inplace=True)

        # Combine the date/time parts into one UTC timestamp; rows whose parts
        # do not form a valid timestamp become NaT (errors='coerce')
        chunk['datetime'] = pd.to_datetime(chunk[['year', 'month', 'day', 'hour', 'minute']], utc=True, errors='coerce')
        chunk.drop(['day', 'hour', 'minute'], axis=1, inplace=True)

        # Move a trailing f / qc / err marker to a flag_ / qc_ / err_ prefix
        chunk.columns = [
            f"flag_{col[:-1]}" if col.endswith('f') else
            f"qc_{col[:-2]}" if col.endswith('qc') else
            f"err_{col[:-3]}" if col.endswith('err') else col
            for col in chunk.columns
        ]

        # NOTE(review): presumably renamed because CAST is an SQL keyword — confirm
        chunk.rename(columns={'cast': 'cast_number'}, inplace=True)

        # Nullable Int64 keeps missing values while storing whole numbers
        int_cols = ['cruise', 'region', 'cast_number', 'year', 'month']
        for col in int_cols:
            chunk[col] = pd.to_numeric(chunk[col], errors='coerce').round().astype('Int64')

        chunk['station'] = chunk['station'].astype(str)

        # Build all point geometries in one vectorized call; constructing a
        # shapely Point per row via DataFrame.apply(axis=1) is far slower on
        # chunks of this size and yields the same column.
        chunk['geom'] = gpd.points_from_xy(chunk['longitude'], chunk['latitude'])
        gdf_chunk = gpd.GeoDataFrame(chunk, geometry='geom', crs="EPSG:4326")

        # Dynamically create glodap data table with explicit types
        # (only for the first chunk of the first file)
        if csv_file == csv_files[0] and chunk.index.start == 0:
            text_columns = ['expocode', 'doi', 'station']
            # Everything that is not text defaults to FLOAT ...
            column_types = {col: sqlalchemy.types.FLOAT for col in gdf_chunk.columns if col not in text_columns}

            for col in text_columns:
                column_types[col] = sqlalchemy.types.TEXT

            # ... and the integer, timestamp and geometry columns are overridden
            column_types.update({
                'cruise': sqlalchemy.types.INTEGER,
                'region': sqlalchemy.types.INTEGER,
                'cast_number': sqlalchemy.types.INTEGER,
                'year': sqlalchemy.types.INTEGER,
                'month': sqlalchemy.types.INTEGER,
                'datetime': sqlalchemy.types.DateTime(timezone=True),
                'geom': Geometry('POINT', srid=4326)
            })

            # Zero rows + if_exists='replace': (re)create the empty table only
            gdf_chunk.head(0).to_postgis('glodapv2_2023', engine, if_exists='replace', index=False,
                                         dtype=column_types)

        # Insert data dynamically
        gdf_chunk.to_postgis('glodapv2_2023', engine, if_exists='append', index=False,
                             dtype={'geom': Geometry('POINT', srid=4326)})

        # Collect the sample positions of every cruise seen in this chunk.
        # A stable sort keeps rows with equal or missing (NaT) timestamps in
        # file order rather than in whatever order an unstable sort leaves them.
        for expocode, group in gdf_chunk.groupby('expocode'):
            sorted_group = group.sort_values('datetime', kind='mergesort')
            cruise_coords.setdefault(expocode, []).extend(
                sorted_group[['longitude', 'latitude', 'datetime']].values.tolist()
            )

# Create LineStrings for cruises
lines = []
for expocode, coords in cruise_coords.items():
    # Order the accumulated samples by time. The sort is stable on purpose:
    # rows sharing a timestamp, or lacking one (NaT, placed last), keep the
    # order in which they were accumulated, so the vertex order of the track
    # does not depend on how an unstable sort happens to arrange ties.
    coords_df = pd.DataFrame(
        coords, columns=['longitude', 'latitude', 'datetime']
    ).sort_values('datetime', kind='mergesort')

    # Collapse each run of identical consecutive positions into one vertex;
    # a position that is revisited later stays in the track as a new vertex.
    positions = coords_df[['longitude', 'latitude']].itertuples(index=False, name=None)
    unique_coords = [(float(lon), float(lat)) for (lon, lat), _run in groupby(positions)]

    # Two or more vertices form a LineString; a lone vertex is stored as a Point
    line_geom = LineString(unique_coords) if len(unique_coords) >= 2 else Point(unique_coords[0])

    lines.append({'expocode': expocode, 'geom': line_geom})

# Update cruise geometries dynamically
lines_gdf = gpd.GeoDataFrame(lines, geometry='geom', crs="EPSG:4326")

# Parameterised statement: the WKT text and the expocode are bound per cruise
update_geom = text("UPDATE cruisev2_2023 SET geom = ST_GeomFromText(:geom, 4326) WHERE expocode = :expocode")

# engine.begin() wraps all updates in a single transaction
with engine.begin() as conn:
    for code, geometry in zip(lines_gdf['expocode'], lines_gdf['geom']):
        conn.execute(update_geom, {'geom': geometry.wkt, 'expocode': str(code)})


Processing ../data_src/GLODAPv2.2023_Arctic_Ocean.csv
2025-03-21 15:37:39,399 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-03-21 15:37:39,403 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_namespace.nspname = %(nspname_1)s
2025-03-21 15:37:39,403 INFO sqlalchemy.engine.Engine [cached since 371.9s ago] {'table_name': 'glodapv2_2023', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'public'}
2025-03-21 15:37:39,407 INFO sqlalchemy.engine.Engine 
CREATE TABLE public.glodapv2_2023 (
	expocode TEXT, 
	cruise INTEGER, 
	station TEXT, 
	region INTEGER, 
	cast_number INTEGER, 
	year INTEGER, 
	month INTEGER, 
	latitude FLOAT, 
	

  for chunk in pd.read_csv(csv_file, chunksize=500000):


2025-03-21 15:38:27,750 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-03-21 15:38:27,750 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s
2025-03-21 15:38:27,751 INFO sqlalchemy.engine.Engine [cached since 386s ago] {'table_name': 'glodapv2_2023', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2025-03-21 15:38:27,752 INFO sqlalchemy.engine.Engine SELECT Find_SRID('public', 'glodapv2_2023', 'geom');
2025-03-21 15:38:27,752 INFO sqlalchemy.engine.Engine [cached since 386s ago] {}
2025-03-21 15:38:27,754 INFO sqlalchem

  for chunk in pd.read_csv(csv_file, chunksize=500000):


2025-03-21 15:38:47,422 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-03-21 15:38:47,422 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s
2025-03-21 15:38:47,423 INFO sqlalchemy.engine.Engine [cached since 405.7s ago] {'table_name': 'glodapv2_2023', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2025-03-21 15:38:47,424 INFO sqlalchemy.engine.Engine SELECT Find_SRID('public', 'glodapv2_2023', 'geom');
2025-03-21 15:38:47,424 INFO sqlalchemy.engine.Engine [cached since 405.7s ago] {}
2025-03-21 15:38:47,426 INFO sqlal