# 01 — Data audit (NTGS stream sediments)

This notebook audits the raw NTGS stream sediment geochemistry dataset and runs quick sanity checks on it.

In [2]:
from pathlib import Path

import geopandas as gpd
import pandas as pd


In [3]:
# Lift pandas' display truncation (rows, columns, cell width) so audit output is
# printed in full. Note that this applies to every later cell in the session.
for display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)


In [7]:
# Build the shapefile path from its components so the notebook resolves it on any
# OS; a backslash-separated literal is only a valid relative path on Windows.
shp_path = Path("..") / "data" / "GEOCHEMISTRY_STREAMS_shp" / "GEOCHEMISTRY_STREAM_SEDIMENTS.shp"
gdf = gpd.read_file(shp_path)

# First look at the raw table: leading rows, attribute names, and the CRS read from the file.
print(gdf.head())
print(gdf.columns)
print(gdf.crs)

                   ID SAMPLEID SAMPLEREF SAM_TYPE   SAM_METH  LATITUDE  \
0     1AD1443_7134313  7134313      1048   STREAM  STREAMSED -22.67468   
1  113500.10._7533936  7533936    GH5219   STREAM  STREAMSED -14.24830   
2  113500.10._7533954  7533954    GH5221   STREAM  STREAMSED -14.26857   
3  113500.10._7533004  7533004    GH5197   STREAM  STREAMSED -14.22128   
4     1AD1443_7134639  7134639      8523   STREAM  STREAMSED -22.64956   

   LONGITUDE LITHOLOGY DT_SAMPLE  SAMPLEWT                        COMPANY  \
0  136.32661      None       NaT       NaN  Normandy Exploration (Darwin)   
1  130.85313      None       NaT       NaN     Stockdale Prospecting Ltd.   
2  130.86000      None       NaT       NaN     Stockdale Prospecting Ltd.   
3  130.79383      None       NaT       NaN     Stockdale Prospecting Ltd.   
4  136.02512      None       NaT       NaN  Normandy Exploration (Darwin)   

  ACCURACY   TITLE            MAP100K          MAP250K    REPORT_NO OPEN_FILE  \
0     200m 

In [8]:
# Assign EPSG:4283 only when the file was read without any CRS; an existing CRS is kept as-is.
gdf = gdf.set_crs('EPSG:4283') if gdf.crs is None else gdf

# Report the table dimensions and the CRS now attached to the frame.
n_rows, n_cols = gdf.shape
print('rows:', n_rows, 'cols:', n_cols)
print('CRS:', gdf.crs)


rows: 157520 cols: 91
CRS: EPSG:4283


In [9]:
# Print the extent of the coordinates stored in the LONGITUDE / LATITUDE attribute columns.
for label, column in (('lon min/max:', 'LONGITUDE'), ('lat min/max:', 'LATITUDE')):
    values = gdf[column]
    print(label, float(values.min()), float(values.max()))


lon min/max: 129.00249 138.04748
lat min/max: -25.14141 -11.83064


In [10]:
# Mark rows whose (LONGITUDE, LATITUDE) pair already occurred in an earlier row.
# With pandas' default keep='first', the first occurrence of each pair is not counted.
repeated_xy = gdf.duplicated(subset=['LONGITUDE', 'LATITUDE'])
dup_xy = repeated_xy.sum()
print('duplicate coordinate rows:', int(dup_xy))


duplicate coordinate rows: 28871


In [11]:
# Check whether UNIQ_ID, when the column exists, identifies every row exactly once.
if 'UNIQ_ID' not in gdf.columns:
    print('UNIQ_ID column not present')
else:
    uniq_id = gdf['UNIQ_ID']
    print('UNIQ_ID unique:', uniq_id.is_unique)
    print('UNIQ_ID duplicated:', int(uniq_id.duplicated().sum()))


UNIQ_ID unique: True
UNIQ_ID duplicated: 0


In [12]:
# Count samples per SAM_TYPE. dropna=False keeps missing values as their own row,
# so the audit cannot silently hide unlabelled samples.
gdf['SAM_TYPE'].value_counts(dropna=False)

SAM_TYPE
STREAM    157520
Name: count, dtype: int64

In [13]:
# Count samples per SAM_METH. dropna=False keeps missing values as their own row,
# so the audit cannot silently hide unlabelled samples.
# NOTE(review): the recorded counts list what look like case variants of one code
# separately (e.g. STREAMSED / StreamSed / Streamsed / streamsed, BLEG / Bleg);
# confirm and normalise case before grouping on this column downstream.
gdf['SAM_METH'].value_counts(dropna=False)

SAM_METH
STREAMSED    97818
STREAM       19599
BLEG         11925
CF           11510
BCL           8878
SUSP          1640
BULK          1401
PC            1330
ORIENT        1121
ALLUV          595
MAG            471
GRAVEL         452
Bleg           149
StreamSed      131
MAGLAG          92
FER             79
Streamsed       73
SL              71
SLIME           68
Stream          40
DUP             30
REASS           23
SAND            12
DHBLEG           6
CONC             3
UNK              2
streamsed        1
Name: count, dtype: int64

## Next

Proceed to `02_anomaly_model.ipynb` to build the 1 km grid and train an Isolation Forest model.