# Import SOCAT data
This notebook reads the contents of the main SOCAT synthesis file and loads them into a table in a PostGIS database, ready for analysis.

## Prerequisites
This notebook assumes that you have created a PostgreSQL database with the PostGIS extension installed:
- `CREATE DATABASE socat_kpi;`
- `CREATE EXTENSION postgis;`

## Setup
Imports, constants etc.

In [27]:
import psycopg2
from tqdm.notebook import tqdm
from datetime import datetime
import re
import os
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen


# Location of the SOCAT synthesis archive and the name of the TSV member inside it.
SOCAT_ZIP_URL = 'https://socat.info/socat_files/v2025/SOCATv2025.tsv.zip'
SOCAT_FILE = 'SOCATv2025.tsv'

# Database connection settings. Each value can be overridden through an
# environment variable so that real credentials never need to be saved in the
# notebook; the fallbacks match a default local PostgreSQL installation.
DB_HOST = os.environ.get('SOCAT_DB_HOST', 'localhost')
DB_USER = os.environ.get('SOCAT_DB_USER', 'postgres')
DB_PASSWORD = os.environ.get('SOCAT_DB_PASSWORD', 'postgres')
DB_NAME = os.environ.get('SOCAT_DB_NAME', 'socat_kpi')

## Download SOCAT data
If the `SOCAT_FILE` defined above is not present, download it.

In [28]:
# Download and unpack the SOCAT file only when it is not already on disk.
if os.path.exists(SOCAT_FILE):
    print('SOCAT already downloaded.')
else:
    print('Downloading SOCAT...')
    # ZipFile needs a seekable file object, so the whole archive is buffered in
    # memory. The context manager closes the HTTP response even if the read fails.
    with urlopen(SOCAT_ZIP_URL) as resp:
        archive_buffer = BytesIO(resp.read())
    # The archive is closed on exit, including when extraction raises.
    with ZipFile(archive_buffer) as myzip:
        print('Extracting data...')
        myzip.extract(SOCAT_FILE)
    print('Done')


SOCAT already downloaded.


## Connect to database
Connect to the database and open a cursor. Any existing `socat` table is dropped in the next step.

In [20]:
# Gather the connection settings from the configuration constants, then open
# one connection and one cursor that the remaining cells share.
connection_settings = {
    'database': DB_NAME,
    'user': DB_USER,
    'host': DB_HOST,
    'password': DB_PASSWORD,
}
conn = psycopg2.connect(**connection_settings)

cur = conn.cursor()

## Kill the old table

In [21]:
# Remove the table left behind by a previous run (a no-op when it does not
# exist) and commit so the drop takes effect immediately.
drop_table_sql = "DROP TABLE IF EXISTS socat"
cur.execute(drop_table_sql)
conn.commit()

## Create SOCAT data table
We will create the following fields:
- Platform Code
- EXPO Code
- Timestamp
- Year†
- Month†
- Day†
- Lat/Lon
- fCO₂ value
- fCO₂ flag

† Useful for generating stats

In [22]:
# Schema for the imported records: one row per SOCAT measurement, with the
# location stored as a PostGIS point using SRID 4326.
create_table_sql = """CREATE TABLE socat(
id bigserial primary key,
platform_code text,
expocode text,
time timestamp,
year int,
month int,
day int,
position geometry(Point, 4326),
fco2 float,
fco2_flag integer
);"""

cur.execute(create_table_sql)
conn.commit()

## Locate the data in the SOCAT file
We skip the header. There are 3 lines starting with `Expocode` before the data starts. (The last is the column header line, but we don't need it.)

In [24]:
# Number of lines beginning with 'Expocode' that precede the first data row.
HEADER_MARKER_LINES = 3
# Commit after this many inserts so a single transaction never grows unbounded.
COMMIT_INTERVAL = 10000

# An expocode is a platform code followed by 8 digits and, optionally, a
# '-<digit>' suffix. The capture group is the platform code.
EXPOCODE_PATTERN = re.compile(r'(.*)\d{8}(?:-\d)?$')

# Values are passed as query parameters rather than formatted into the SQL
# text, so the database driver handles quoting and type conversion.
INSERT_SQL = """INSERT INTO socat
    (platform_code, expocode, time, year, month, day, position, fco2, fco2_flag)
    VALUES (%s, %s, %s, %s, %s, %s, ST_SetSRID(ST_MakePoint(%s, %s), 4326), %s, %s)"""


def _platform_code(expocode):
    """Return the platform code prefix of an expocode.

    Raises:
        ValueError: if the expocode does not end in 8 digits (optionally
            followed by '-<digit>').
    """
    match = EXPOCODE_PATTERN.search(expocode)
    if match is None:
        raise ValueError(f'Unrecognised expocode: {expocode!r}')
    return match[1]


def _parse_socat_line(line):
    """Convert one tab-separated SOCAT data line into INSERT parameters.

    Returns:
        A tuple (platform_code, expocode, timestamp, year, month, day, lon,
        lat, fco2, fco2_flag) in the order expected by INSERT_SQL.
    """
    fields = line.split('\t')
    expocode = fields[0]
    year, month, day = int(fields[4]), int(fields[5]), int(fields[6])

    # Parse the seconds as a number (so values such as '5.' are accepted) and
    # clamp to 59, because datetime() rejects a seconds value of 60.
    seconds = min(int(float(fields[9])), 59)
    timestamp = datetime(year, month, day, int(fields[7]), int(fields[8]), seconds)

    # Longitudes above 180 are shifted into the -180..180 range.
    lon = float(fields[10])
    if lon > 180:
        lon -= 360
    lat = float(fields[11])

    fco2 = float(fields[29])
    fco2_flag = int(fields[31])

    return (_platform_code(expocode), expocode, timestamp, year, month, day,
            lon, lat, fco2, fco2_flag)


# Count the lines up front so the progress bar has a total.
with open(SOCAT_FILE, 'rb') as counted_file:
    line_count = sum(1 for _ in counted_file)

# The context manager closes the data file even if a row fails to parse or
# insert; calling f.close() again afterwards is harmless.
with open(SOCAT_FILE) as f, tqdm(total=line_count) as progress:
    # Skip the header: the data begins after the last 'Expocode' line.
    expocode_count = 0
    while expocode_count < HEADER_MARKER_LINES:
        line = f.readline()
        if line == '':
            # readline() returns '' forever at end of file; fail instead of spinning.
            raise ValueError(
                f'Reached end of {SOCAT_FILE} before finding '
                f'{HEADER_MARKER_LINES} header lines starting with "Expocode"')
        if line.startswith('Expocode'):
            expocode_count += 1
        progress.update()

    record_count = 0
    for line in f:
        progress.update()
        if not line.strip():
            # Ignore blank lines rather than failing to parse them.
            continue

        cur.execute(INSERT_SQL, _parse_socat_line(line))

        record_count += 1
        if record_count % COMMIT_INTERVAL == 0:
            conn.commit()

# Commit whatever remains after the last full batch.
conn.commit()

  0%|          | 0/41387663 [00:00<?, ?it/s]

## Create index
Create a spatial (GiST) index on the `position` column to speed up spatial searches.

In [25]:
# Build the GiST index on the point column once all rows have been loaded,
# then commit so the index is persisted.
create_index_sql = 'CREATE INDEX socat_points_idx ON socat USING GIST(position)'
cur.execute(create_index_sql)
conn.commit()

## Close everything down

In [26]:
# Release everything that is still open: the data file first, then the
# database cursor, then the connection itself.
for resource in (f, cur, conn):
    resource.close()