In [1]:
import pandas as pd  # Tabular data
from glob import glob  # File pattern matching
import os  # Operating System
import geopandas as gpd  # Geospatial data
import re  # Regular expressions
import math
from tqdm.auto import tqdm  # Progress bars
from tqdm.contrib.concurrent import thread_map, process_map  # Parallel operations
import rapidfuzz # Fuzzy string matching

pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 130)

import platform
if platform.system() == "Windows":
  prefix = "Z:/"
else:
  prefix = "ressci201900060-RNC2-Coastal/"

df = pd.concat(pd.read_csv(f) for f in ["meta.csv", "LDS_meta.csv"])
# Filter to just shapefiles that have the CRS column defined
df = df[df.CPS & (df.n_lines > 0)]
df["Pixel_ER"] = (
    df.res.str.replace("(", "", regex=False).str.split(",").str[0].astype(float)
)
df

Unnamed: 0,filename,matched_image,match_score,n_lines,driver,dtype,nodata,width,height,count,crs,transform,blockxsize,blockysize,tiled,compress,interleave,GCPs,res,CPS,photometric,matched_tile_root,matched_filename,Pixel_ER
0,MaxarImagery/HighFreq/HawkesBay/Mahanga/Shorelines/Mahanga_31AUG2005.shp,MaxarImagery/HighFreq/HawkesBay/Mahanga/Imagery/Stack/Mahanga_31AUG2005.tif,100.0,2,GTiff,uint8,,3975,12039,3,,"| 0.60, 0.00, 2022707.13|\n| 0.00,-0.60, 5670278.45|\n| 0.00, 0.00, 1.00|",128.0,128,True,lzw,pixel,26,"(0.6, 0.5999999999999536)",True,,,,0.600000
1,Retrolens/Tasman/MoutereRiver/Shorelines/MoutereRiver_19FEB2010.shp,MaxarImagery/HighFreq/Tasman/MoutereRiver/Imagery/Stack/MoutereRiver_19FEB2010.tif,100.0,7,GTiff,uint8,,7839,13469,3,"PROJCS[""NZGD2000 / New Zealand Transverse Mercator 2000"",GEOGCS[""NZGD2000"",DATUM[""New_Zealand_Geodetic_Datum_2000"",SPHEROID[""G...","| 0.60, 0.00, 1602120.79|\n| 0.00,-0.60, 5446080.72|\n| 0.00, 0.00, 1.00|",128.0,128,True,lzw,pixel,0,"(0.6000000000000179, 0.6000000000000276)",True,,,,0.600000
2,Retrolens/Tasman/MoutereRiver/Shorelines/MoutereRiver_31JAN1980.shp,Retrolens/Tasman/MoutereRiver/Stack/MoutereRiver_31JAN1980_mosaic.jp2,100.0,4,JP2OpenJPEG,uint16,256.0,4917,6845,3,"PROJCS[""NZGD2000 / New Zealand Transverse Mercator 2000"",GEOGCS[""NZGD2000"",DATUM[""New_Zealand_Geodetic_Datum_2000"",SPHEROID[""G...","| 1.40, 0.00, 1600979.23|\n| 0.00,-1.40, 5447041.85|\n| 0.00, 0.00, 1.00|",1024.0,1024,True,,pixel,0,"(1.3997952365892727, 1.3997952365892659)",True,,,,1.399795
3,Retrolens/Tasman/MoutereRiver/Shorelines/MoutereRiver_10AUG2003.shp,MaxarImagery/HighFreq/Tasman/MoutereRiver/Imagery/Stack/MoutereRiver_10AUG2003.tif,100.0,8,GTiff,uint8,,7839,13469,3,"PROJCS[""NZGD2000 / New Zealand Transverse Mercator 2000"",GEOGCS[""NZGD2000"",DATUM[""New_Zealand_Geodetic_Datum_2000"",SPHEROID[""G...","| 0.60, 0.00, 1602120.79|\n| 0.00,-0.60, 5446080.72|\n| 0.00, 0.00, 1.00|",128.0,128,True,lzw,pixel,0,"(0.6000000000000179, 0.6000000000000276)",True,,,,0.600000
4,Retrolens/Tasman/MoutereRiver/Shorelines/MoutereRiver_19APR2017.shp,MaxarImagery/HighFreq/Tasman/MoutereRiver/Imagery/Stack/MoutereRiver_19APR2017.tif,100.0,5,GTiff,uint8,,9408,16163,3,"PROJCS[""NZGD2000 / New Zealand Transverse Mercator 2000"",GEOGCS[""NZGD2000"",DATUM[""New_Zealand_Geodetic_Datum_2000"",SPHEROID[""G...","| 0.50, 0.00, 1602120.69|\n| 0.00,-0.50, 5446080.83|\n| 0.00, 0.00, 1.00|",128.0,128,True,lzw,pixel,0,"(0.5, 0.5)",True,,,,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51,Retrolens/Bay of Plenty/WaihiBeach/Shorelines/WaihiBeach_3DEC2014.shp,Retrolens/Bay of Plenty/WaihiBeach/Shorelines/WaihiBeach_3DEC2014.tif,100.0,7,GTiff,uint8,,38400,63360,3,EPSG:2193,"| 0.12, 0.00, 1859680.00|\n| 0.00,-0.12, 5857440.00|\n| 0.00, 0.00, 1.00|",,1,False,lzw,pixel,0,"(0.125, 0.125)",True,,DigitalJPGs/BayOfPlenty/BOP14U,Retrolens/Bay of Plenty/WaihiBeach/Shorelines/WaihiBeach_3DEC2014.shp,0.125000
52,Retrolens/Bay of Plenty/PortOhope/Shorelines/OhopeBeach_3DEC2014.shp,Retrolens/Bay of Plenty/PortOhope/Shorelines/OhopeBeach_3DEC2014.tif,100.0,5,GTiff,uint8,,88320,40320,3,EPSG:2193,"| 0.12, 0.00, 1953280.00|\n| 0.00,-0.12, 5791920.00|\n| 0.00, 0.00, 1.00|",,1,False,lzw,pixel,0,"(0.125, 0.125)",True,,DigitalJPGs/BayOfPlenty/BOP14U,Retrolens/Bay of Plenty/PortOhope/Shorelines/OhopeBeach_3DEC2014.shp,0.125000
53,Retrolens/Bay of Plenty/Ohiwa/Shorelines/Ohiwa_3DEC2014.shp,Retrolens/Bay of Plenty/Ohiwa/Shorelines/Ohiwa_3DEC2014.tif,100.0,1,GTiff,uint8,,38400,11520,3,EPSG:2193,"| 0.12, 0.00, 1964800.00|\n| 0.00,-0.12, 5787600.00|\n| 0.00, 0.00, 1.00|",,1,False,lzw,pixel,0,"(0.125, 0.125)",True,,DigitalJPGs/BayOfPlenty/BOP14U,Retrolens/Bay of Plenty/Ohiwa/Shorelines/Ohiwa_3DEC2014.shp,0.125000
54,Retrolens/Bay of Plenty/PortOhope/Shorelines/OhopeBeach_3DEC2014.shp,Retrolens/Bay of Plenty/PortOhope/Shorelines/OhopeBeach_3DEC2014.tif,100.0,5,GTiff,uint8,,88320,40320,3,EPSG:2193,"| 0.12, 0.00, 1953280.00|\n| 0.00,-0.12, 5791920.00|\n| 0.00, 0.00, 1.00|",,1,False,lzw,pixel,0,"(0.125, 0.125)",True,,DigitalJPGs/BayOfPlenty/BOP14U,Retrolens/Bay of Plenty/PortOhope/Shorelines/OhopeBeach_3DEC2014.shp,0.125000


In [2]:
def get_lines(filename):
    try:
        gdf = gpd.read_file(prefix + filename)
    except Exception as e:
        print(e)
        return pd.NA
    return len(gdf.explode(index_parts=False))

df["n_lines_now"] = thread_map(get_lines, df.filename)

ressci201900060-RNC2-Coastal/Retrolens/Canterbury/GoochBay/Shorelines/GoochBay_10DEC1942.shp: No such file or directory


  0%|          | 0/1185 [00:04<?, ?it/s]

ressci201900060-RNC2-Coastal/Retrolens/Bay of Plenty/WaihiBeach/Shorelines/WaihiBeach_16NOV1942.shp: No such file or directory


In [26]:
bad = df[df.n_lines_now == 0]
bad[["filename", "n_lines", "n_lines_now"]]

Unnamed: 0,filename,n_lines,n_lines_now
5,Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.shp,9,0
11,Retrolens/Tasman/Totaranui/Shorelines/Totaranui_21FEB2002.shp,13,0
13,Retrolens/Tasman/Totaranui/Shorelines/Totaranui_26JAN1974.shp,15,0
15,Retrolens/Tasman/SandyBay/Shorelines/SandyBay_19May1958.shp,17,0
16,Retrolens/Tasman/SandyBay/Shorelines/SandyBay_04May1944.shp,25,0
21,Retrolens/Tasman/MoutereRiver/Shorelines/MoutereRiver_22FEB1940.shp,13,0
26,Retrolens/Canterbury/GoreBay/Shorelines/GoreBay_7OCT1955.shp,7,0
27,Retrolens/Canterbury/GoreBay/Shorelines/GoreBay_25FEB1979.shp,5,0
29,Retrolens/Canterbury/GoreBay/Shorelines/GoreBay_3NOV1965.shp,7,0
42,Retrolens/Tasman/SandyBay/Shorelines/SandyBay_31JAN1980.shp,30,0


In [39]:
len(bad)

34

In [38]:
def get_mtime(filename):
    return pd.to_datetime(os.path.getmtime(filename), unit="s", origin="unix", utc=True).tz_convert("Pacific/Auckland")
bad["mtime"] = bad.filename.apply(get_mtime)
bad[["filename", "n_lines", "n_lines_now", "mtime"]].sort_values("mtime")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bad["mtime"] = bad.filename.apply(get_mtime)


Unnamed: 0,filename,n_lines,n_lines_now,mtime
1077,MaxarImagery/HighFreq/Southland/ToetoesBay_east/Shorelines/ToetoesBay_east_24NOV2019.shp,8,0,2024-01-24 11:04:51.509413632+13:00
29,Retrolens/Canterbury/GoreBay/Shorelines/GoreBay_3NOV1965.shp,7,0,2024-01-24 11:14:14.401831936+13:00
16,Retrolens/Tasman/SandyBay/Shorelines/SandyBay_04May1944.shp,25,0,2024-01-24 11:14:15.575725056+13:00
43,Retrolens/Tasman/SandyBay/Shorelines/SandyBay_21FEB2002.shp,31,0,2024-01-24 11:15:24.592977920+13:00
42,Retrolens/Tasman/SandyBay/Shorelines/SandyBay_31JAN1980.shp,30,0,2024-01-24 11:15:25.206235392+13:00
70,Retrolens/Tasman/Motueka/Shorelines/Motueka_19MAY1958.shp,7,0,2024-01-24 11:15:26.295210752+13:00
63,Retrolens/Tasman/PortPuponga/Shorelines/PortPuponga_16SEP1954.shp,2,0,2024-01-24 11:15:26.442781184+13:00
62,Retrolens/Tasman/PortPuponga/Shorelines/PortPuponga_20OCT1975.shp,2,0,2024-01-24 11:15:26.513212672+13:00
54,Retrolens/Tasman/WainuiBay/Shorelines/WainuiBay_31JAN1980.shp,17,0,2024-01-24 11:15:26.706024960+13:00
44,Retrolens/Tasman/SandyBay/Shorelines/SandyBay_26JAN1974.shp,27,0,2024-01-24 11:15:26.748400384+13:00


In [5]:
glob("ressci201900060-RNC2-Coastal/Retrolens/Canterbury/GoochBay/Shorelines/GoochBay_10DEC1942.*")

['ressci201900060-RNC2-Coastal/Retrolens/Canterbury/GoochBay/Shorelines/GoochBay_10DEC1942.shp.xml',
 'ressci201900060-RNC2-Coastal/Retrolens/Canterbury/GoochBay/Shorelines/GoochBay_10DEC1942.dbf',
 'ressci201900060-RNC2-Coastal/Retrolens/Canterbury/GoochBay/Shorelines/GoochBay_10DEC1942.prj',
 'ressci201900060-RNC2-Coastal/Retrolens/Canterbury/GoochBay/Shorelines/GoochBay_10DEC1942.cpg']

In [8]:
backup = gpd.read_file("shorelines.geojson")
backup.shape

(12485, 36)

In [14]:
test = gpd.read_file("Retrolens/Canterbury/GoochBay/Shorelines/GoochBay_8OCT1961.shp")
test

Unnamed: 0,Id,Region,Site,Date,DSASDate,Digitiser,Scale,Notes,Source,CPS,Proxy,Photoscale,Georef_ER,Pixel_Er,Total_UNCY,geometry
0,0,Canterbury,GoochBay,1961-10-08,08/10/1961,TK,1500,EOV,RL,2,1,44500,2.9,1.1,3.186362,"LINESTRING (1651948.140 5303642.976, 1651944.568 5303640.198, 1651942.980 5303635.832, 1651939.805 5303634.245, 1651932.662 53..."
1,0,Canterbury,GoochBay,1961-10-08,08/10/1961,TK,1500,EOV,RL,3,1,44500,2.9,1.1,3.249754,"LINESTRING (1651267.895 5303425.885, 1651265.116 5303420.726, 1651257.973 5303417.154, 1651250.035 5303416.757, 1651240.510 53..."
2,0,Canterbury,GoochBay,1961-10-08,08/10/1961,TK,1500,EOV,RL,4,1,44500,2.9,1.1,3.728927,"LINESTRING (1654870.775 5304194.355, 1654877.521 5304193.164, 1654881.887 5304193.164, 1654886.650 5304191.973, 1654890.221 53..."
3,0,Canterbury,GoochBay,1961-10-08,08/10/1961,TK,1500,EOV,RL,4,1,44500,2.9,1.1,3.728927,"LINESTRING (1652206.601 5303816.332, 1652220.492 5303809.585, 1652229.620 5303806.807, 1652238.351 5303802.838, 1652245.495 53..."


In [43]:
cols = test.columns
cols

Index(['Id', 'Region', 'Site', 'Date', 'DSASDate', 'Digitiser', 'Scale',
       'Notes', 'Source', 'CPS', 'Proxy', 'Photoscale', 'Georef_ER',
       'Pixel_Er', 'Total_UNCY', 'geometry'],
      dtype='object')

In [25]:
glob('Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.*')

['Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.shp.xml',
 'Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.cpg',
 'Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.prj',
 'Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.shp',
 'Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.dbf',
 'Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.shx']

In [24]:
gpd.read_file('Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.shp')

Unnamed: 0,Id,Region,Site,Date,DSASDate,Digitiser,Scale,Notes,Source,CPS,Proxy,Photoscale,Georef_ER,Pixel_Er,Total_UNCY,geometry


In [44]:
backup[backup.filename == "Retrolens/Canterbury/GoochBay/Shorelines/GoochBay_10DEC1942.shp"][cols].to_crs(2193).to_file("Retrolens/Canterbury/GoochBay/Shorelines/GoochBay_10DEC1942.shp")

In [45]:
backup[backup.filename == "Retrolens/Bay of Plenty/WaihiBeach/Shorelines/WaihiBeach_16NOV1942.shp"][cols].to_crs(2193).to_file("Retrolens/Bay of Plenty/WaihiBeach/Shorelines/WaihiBeach_16NOV1942.shp")

In [50]:
# Store the bad files in a zip, just in case
import os
import zipfile

zf = zipfile.ZipFile("bad.zip", "w")
for file in bad.filename:
    files = glob(os.path.splitext(file)[0] + "*")
    for f in files:
        print(f)
        zf.write(f)
zf.close()

Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.shp.xml
Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.cpg
Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.prj
Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.shp
Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.dbf
Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.shx
Retrolens/Tasman/Totaranui/Shorelines/Totaranui_21FEB2002.shx
Retrolens/Tasman/Totaranui/Shorelines/Totaranui_21FEB2002.prj
Retrolens/Tasman/Totaranui/Shorelines/Totaranui_21FEB2002.dbf
Retrolens/Tasman/Totaranui/Shorelines/Totaranui_21FEB2002.shp.xml
Retrolens/Tasman/Totaranui/Shorelines/Totaranui_21FEB2002.cpg
Retrolens/Tasman/Totaranui/Shorelines/Totaranui_21FEB2002.shp
Retrolens/Tasman/Totaranui/Shorelines/Totaranui_26JAN1974.prj
Retrolens/Tasman/Totaranui/Shorelines/Totaranui_26JAN1974.shx
Retrolens/Tasman/Totaranui/Shorelines/Totaranui_26JA

In [51]:
for f in tqdm(bad.filename):
    print(f"Restoring {f} from backup")
    backup_copy = backup[backup.filename == f][cols].to_crs(2193)
    backup_copy.to_file(f)

  0%|          | 0/34 [00:00<?, ?it/s]

Restoring Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.shp from backup
Restoring Retrolens/Tasman/Totaranui/Shorelines/Totaranui_21FEB2002.shp from backup
Restoring Retrolens/Tasman/Totaranui/Shorelines/Totaranui_26JAN1974.shp from backup
Restoring Retrolens/Tasman/SandyBay/Shorelines/SandyBay_19May1958.shp from backup
Restoring Retrolens/Tasman/SandyBay/Shorelines/SandyBay_04May1944.shp from backup
Restoring Retrolens/Tasman/MoutereRiver/Shorelines/MoutereRiver_22FEB1940.shp from backup
Restoring Retrolens/Canterbury/GoreBay/Shorelines/GoreBay_7OCT1955.shp from backup
Restoring Retrolens/Canterbury/GoreBay/Shorelines/GoreBay_25FEB1979.shp from backup
Restoring Retrolens/Canterbury/GoreBay/Shorelines/GoreBay_3NOV1965.shp from backup
Restoring Retrolens/Tasman/SandyBay/Shorelines/SandyBay_31JAN1980.shp from backup
Restoring Retrolens/Tasman/SandyBay/Shorelines/SandyBay_21FEB2002.shp from backup
Restoring Retrolens/Tasman/SandyBay/Shorelines/SandyBay_26JAN1974.sh

In [52]:
gpd.read_file("Retrolens/Tasman/RuataniwhaInlet/Shorelines/RuataniwhaInlet_11OCT1950.shp")

Unnamed: 0,Id,Region,Site,Date,DSASDate,Digitiser,Scale,Notes,Source,CPS,Proxy,Photoscale,Georef_ER,Pixel_Er,Total_UNCY,geometry
0,0,Tasman,Ruataniwha Inlet,1950-10-11,11/10/1950,MS,1000,,RL,3,1,17400,2.09,0.510713,2.360048,"LINESTRING (1574679.326 5491095.703, 1574672.182 5491103.376, 1574664.245 5491110.785, 1574653.132 5491119.516, 1574641.491 54..."
1,0,Tasman,Ruataniwha Inlet,1950-10-11,11/10/1950,MS,1000,,RL,1,4,17400,2.09,0.510713,2.194044,"LINESTRING (1573261.157 5492644.313, 1573269.491 5492644.709, 1573275.841 5492643.122, 1573283.382 5492638.756, 1573286.954 54..."
2,0,Tasman,Ruataniwha Inlet,1950-10-11,11/10/1950,MS,1000,,RL,3,1,17400,2.09,0.510713,2.360048,"LINESTRING (1573644.671 5493168.982, 1573646.258 5493164.220, 1573649.433 5493160.251, 1573654.461 5493158.134, 1573659.752 54..."
3,0,Tasman,Ruataniwha Inlet,1950-10-11,11/10/1950,MS,1000,,RL,4,1,17400,2.09,0.510713,2.985603,"LINESTRING (1573045.904 5499738.044, 1573046.764 5499725.344, 1573044.912 5499716.877, 1573043.325 5499705.500, 1573040.150 54..."
4,0,Tasman,Ruataniwha Inlet,1950-10-11,11/10/1950,MS,1000,,RL,3,1,17400,2.09,0.510713,2.360048,"LINESTRING (1573037.107 5500145.613, 1573038.761 5500151.897, 1573039.422 5500158.512, 1573039.422 5500165.788, 1573037.107 55..."
5,0,Tasman,Ruataniwha Inlet,1950-10-11,11/10/1950,MS,1000,,RL,3,1,17400,2.09,0.510713,2.360048,"LINESTRING (1572912.091 5502678.078, 1572915.266 5502671.199, 1572919.764 5502664.320, 1572925.056 5502655.324, 1572929.554 55..."
6,0,Tasman,Ruataniwha Inlet,1950-10-11,11/10/1950,MS,1000,,RL,2,1,17400,2.09,0.510713,2.271966,"LINESTRING (1573182.615 5504338.390, 1573186.055 5504345.005, 1573191.346 5504354.530, 1573195.050 5504362.732, 1573197.432 55..."
7,0,Tasman,Ruataniwha Inlet,1950-10-11,11/10/1950,MS,1000,,RL,1,7,17400,2.09,0.510713,2.194044,"LINESTRING (1573410.051 5496896.255, 1573408.252 5496897.630, 1573407.088 5496899.324, 1573405.289 5496901.229, 1573404.336 54..."
8,0,Tasman,Ruataniwha Inlet,1950-10-11,11/10/1950,MS,1000,,RL,1,7,17400,2.09,0.510713,2.194044,"LINESTRING (1573014.803 5503456.995, 1573015.133 5503464.734, 1573016.192 5503471.613, 1573018.308 5503480.345, 1573019.896 55..."
