In [1]:
import rioxarray
import os

# Configuration: paths (relative to the working directory) of every raster this notebook loads.
# The coordinates displayed further down step by 5000 in both x and y, matching the `5km` in the names.

# <<< INVASIVE SPECIES MAP >>>
# Raster used to build the 'Occurrence' label column (cell In [6]).
INVASIVE_BIRDS_PATH = 'Datasets/Machine Learning/5km Rasters/Birds/All_Invasive_Birds_5km.tif'

# <<< LAND COVER MAP >>>
# Dimensions: 700000x1300000 (x by y extent in coordinate units; the displayed index runs
# x = 2500..692500 and y = 2500..1297500).
# NOTE(review): units are presumably metres on the British National Grid - confirm.
# Each band of this raster becomes one land cover column of the main dataframe (cell In [2]).
LAND_COVER_MAP_PATH = 'Datasets/Machine Learning/5km Rasters/Features/gb2021lcm5km_percentage_target.tif'

# <<< FERTILISER >>>
# The dataset consists of maps of the predicted average annual application rates (2010-2015) of three different inorganic
# chemical fertilisers – nitrogen (N), phosphorus (P) and potassium (K) – in England across a six-year period, along with
# their respective estimates of uncertainty, at a 1 km x 1 km resolution.
# NOTE(review): the description above refers to a 1 km product, while the files loaded here are named `*_5km`
# (presumably resampled copies) - confirm.
# Cell In [4] discards band 2 of each file and keeps the rest (presumably band 1 = prediction, band 2 = uncertainty,
# going by the file names - confirm).
FERTILISER_K_PATH = 'Datasets/Machine Learning/5km Rasters/Features/fertiliser_k_prediction_uncertainty_5km.tif'
FERTILISER_N_PATH = 'Datasets/Machine Learning/5km Rasters/Features/fertiliser_n_prediction_uncertainty_5km.tif'
FERTILISER_P_PATH = 'Datasets/Machine Learning/5km Rasters/Features/fertiliser_p_prediction_uncertainty_5km.tif'

# <<< PESTICIDE >>>
# Folder scanned for `.tif` files in cell In [5]; each file becomes one column named after the file stem.
# The trailing slash matters: the folder path is joined to each file name by plain string concatenation.
PESTICIDE_FOLDER_PATH = 'Datasets/Machine Learning/5km Rasters/Features/Pesticides/'

# <<< INTEGRATED HYDROLOGICAL DIGITAL TERRAIN MODEL >>>
# Dimensions: 700000x1300000
# These datasets all only have one band
ELEVATION_PATH = 'Datasets/Machine Learning/5km Rasters/Features/HGHT_5km.tif'
CUMULATIVE_CATCHMENT_AREA_PATH = 'Datasets/Machine Learning/5km Rasters/Features/CCAR_5km.tif'
SURFACE_TYPE_PATH = 'Datasets/Machine Learning/5km Rasters/Features/SURF_5km.tif'
OUTFLOWING_DRAINAGE_DIRECTION_PATH = 'Datasets/Machine Learning/5km Rasters/Features/OUTF_5km.tif'
INFLOWING_DRAINAGE_PATTERN_PATH = 'Datasets/Machine Learning/5km Rasters/Features/INFL_5km.tif'

# Maps the column name used in the main dataframe (cell In [3]) to the raster that supplies it.
# NOTE(review): the last key says "direction" while its constant says "PATTERN" - confirm which wording is intended
# before renaming anything, because the key ends up as a column name in the exported CSV.
IHDTM = {
    'Elevation': ELEVATION_PATH, 
    'Cumulative catchment area': CUMULATIVE_CATCHMENT_AREA_PATH, 
    'Surface type': SURFACE_TYPE_PATH, 
    'Outflowing drainage direction': OUTFLOWING_DRAINAGE_DIRECTION_PATH, 
    'Inflowing drainage direction': INFLOWING_DRAINAGE_PATTERN_PATH
}


# <<< Opening in rasterio >>>
# dataset = rasterio.open(FERTILISER_P_PATH)
# data = dataset.read()
# np.max(data)

# <<< Opening in rioxarray >>>
# dataset = rioxarray.open_rasterio(ELEVATION_PATH)
# dataset.name = 'data'
# df = dataset.to_dataframe()


In [2]:
# Build the base of the main dataframe from the land cover map: one row per (y, x) grid cell and
# one column per raster band. The land cover map is used as the base because it has the widest
# extent of all the input files and is arguably one of the most important feature sets.

# Column names for the land cover bands, listed in band order (band 1 first).
LCM_CLASSES = [
    'Deciduous woodland',
    'Coniferous woodland',
    'Arable',
    'Improve grassland',
    'Neutral grassland',
    'Calcareous grassland',
    'Acid grassland',
    'Fen',
    'Heather',
    'Heather grassland',
    'Bog',
    'Inland rock',
    'Saltwater',
    'Freshwater',
    'Supralittoral rock',
    'Supralittoral sediment',
    'Littoral rock',
    'Littoral sediment',
    'Saltmarsh',
    'Urban',
    'Suburban',
]

lcm = rioxarray.open_rasterio(LAND_COVER_MAP_PATH)
lcm.name = 'data'

# Long format: one row per (band, y, x) with a single 'data' column.
lcm_long_df = lcm.to_dataframe().drop(columns='spatial_ref')
print(lcm_long_df.value_counts())

# Pivot the band level into columns, then replace the 1-based band numbers with class names.
band_to_class = dict(enumerate(LCM_CLASSES, start=1))
main_df = lcm_long_df.unstack(level='band')['data'].rename(columns=band_to_class)
main_df

data
0       728295
1         4899
2         2811
3         1994
4         1576
         ...  
83         120
88         120
82         117
93         108
94         100
Length: 101, dtype: int64


Unnamed: 0_level_0,band,Deciduous woodland,Coniferous woodland,Arable,Improve grassland,Neutral grassland,Calcareous grassland,Acid grassland,Fen,Heather,Heather grassland,...,Inland rock,Saltwater,Freshwater,Supralittoral rock,Supralittoral sediment,Littoral rock,Littoral sediment,Saltmarsh,Urban,Suburban
y,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1297500.0,2500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1297500.0,7500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1297500.0,12500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1297500.0,17500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1297500.0,22500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2500.0,677500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2500.0,682500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2500.0,687500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2500.0,692500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# For each IHDTM raster, append its values to the main dataframe as one column named after the dict key
for key, ihdtm_path in IHDTM.items():
    ihdtm_data = rioxarray.open_rasterio(ihdtm_path)
    # squeeze() collapses the length-1 band dimension (these rasters are expected to hold a single band).
    # drop_vars() then removes the leftover 'spatial_ref' and 'band' coordinates so to_dataframe()
    # yields just the value column. drop_vars is the supported replacement for the deprecated
    # DataArray.drop(<name>) form.
    ihdtm_data = ihdtm_data.squeeze().drop_vars(["spatial_ref", "band"])
    ihdtm_data.name = key
    ihdtm_df = ihdtm_data.to_dataframe()

    # Adding 25 to both index levels (y is level 0, x is level 1) so the coordinates line up with
    # the index of the other datasets; without the shift the join below would find no matching rows
    ihdtm_df.index = ihdtm_df.index.set_levels(ihdtm_df.index.levels[0]+25, level=0)
    ihdtm_df.index = ihdtm_df.index.set_levels(ihdtm_df.index.levels[1]+25, level=1)

    # Left join on the (y, x) index: keeps every row of main_df.
    # NOTE(review): the displayed output shows -9999 / -1 / 255 in these columns, which look like
    # nodata sentinels being carried over as ordinary values - confirm they are handled downstream.
    main_df = main_df.join(ihdtm_df)
main_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Deciduous woodland,Coniferous woodland,Arable,Improve grassland,Neutral grassland,Calcareous grassland,Acid grassland,Fen,Heather,Heather grassland,...,Littoral rock,Littoral sediment,Saltmarsh,Urban,Suburban,Elevation,Cumulative catchment area,Surface type,Outflowing drainage direction,Inflowing drainage direction
y,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1297500.0,2500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
1297500.0,7500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
1297500.0,12500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
1297500.0,17500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
1297500.0,22500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2500.0,677500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
2500.0,682500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
2500.0,687500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
2500.0,692500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255


In [4]:
# Add one column per fertiliser raster to the main dataframe; the dict key becomes the column name
fertiliser = {
    'Fertiliser K': FERTILISER_K_PATH,
    'Fertiliser N': FERTILISER_N_PATH,
    'Fertiliser P': FERTILISER_P_PATH,
}

for column_name, raster_path in fertiliser.items():
    fert_raster = rioxarray.open_rasterio(raster_path)
    fert_raster.name = column_name

    # Long format indexed by (band, y, x): discard the rows of band 2, then remove the band level
    # so the frame is indexed by (y, x) like main_df.
    fert_df = (
        fert_raster.to_dataframe()
        .drop(columns='spatial_ref')
        .drop(index=2)
        .droplevel('band')
    )

    # Shift the first index level (y) down by 1000 before joining onto main_df.
    # NOTE(review): presumably this aligns the fertiliser grid with the land cover grid - confirm.
    shifted_y = fert_df.index.levels[0] - 1000
    fert_df.index = fert_df.index.set_levels(shifted_y, level=0)

    main_df = main_df.join(fert_df)
main_df


Unnamed: 0_level_0,Unnamed: 1_level_0,Deciduous woodland,Coniferous woodland,Arable,Improve grassland,Neutral grassland,Calcareous grassland,Acid grassland,Fen,Heather,Heather grassland,...,Urban,Suburban,Elevation,Cumulative catchment area,Surface type,Outflowing drainage direction,Inflowing drainage direction,Fertiliser K,Fertiliser N,Fertiliser P
y,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1297500.0,2500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
1297500.0,7500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
1297500.0,12500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
1297500.0,17500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
1297500.0,22500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2500.0,677500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
2500.0,682500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
2500.0,687500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
2500.0,692500.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,


In [5]:
# For each .tif file in the pesticide folder, append its raster data to the main dataframe
# as a column named after the file (without the extension).
# sorted(): os.listdir returns entries in arbitrary order, so sorting keeps the column order of
# main_df (and of the exported CSV) the same on every machine.
for filename in sorted(os.listdir(PESTICIDE_FOLDER_PATH)):
    if not filename.endswith('.tif'):
        continue

    pest_dataset = rioxarray.open_rasterio(os.path.join(PESTICIDE_FOLDER_PATH, filename))
    pest_dataset.name = os.path.splitext(filename)[0]
    pest_df = pest_dataset.to_dataframe().drop(columns='spatial_ref')
    # Discard the rows of band 2, then remove the band level so the index is (y, x) like main_df
    pest_df = pest_df.drop(index=2).droplevel('band')
    # Left join: grid cells outside the pesticide raster get NaN in the new column
    main_df = main_df.join(pest_df)
main_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Deciduous woodland,Coniferous woodland,Arable,Improve grassland,Neutral grassland,Calcareous grassland,Acid grassland,Fen,Heather,Heather grassland,...,Chlorothalonil_5km,Glyphosate_5km,Mancozeb_5km,Mecoprop-P_5km,Metamitron_5km,Pendimethalin_5km,PropamocarbHydrochloride_5km,Prosulfocarb_5km,Sulphur_5km,Tri-allate_5km
y,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1297500.0,2500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1297500.0,7500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1297500.0,12500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1297500.0,17500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1297500.0,22500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2500.0,677500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2500.0,682500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2500.0,687500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2500.0,692500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,


In [6]:
# Label data with entire bird dataset
# -> 'Occurrence' is 1 where the bird raster holds a non-null, non-zero value and 0 otherwise
#    (NOTE(review): presumably 0 is the raster's "no record" value - confirm)

bird_dataset = rioxarray.open_rasterio(INVASIVE_BIRDS_PATH)
bird_dataset.name = 'data'
# squeeze() collapses the length-1 band dimension; drop_vars() (the supported replacement for the
# deprecated DataArray.drop(<name>) form) removes the leftover coordinates so only 'data' remains.
bird_df = bird_dataset.squeeze().drop_vars(["spatial_ref", "band"]).to_dataframe()

# Vectorised labelling. Null cells are explicitly treated as absence: a plain `x == 0` test would
# label NaN as 1, because NaN never compares equal to 0.
bird_values = bird_df['data']
bird_df['Occurrence'] = (bird_values.notna() & (bird_values != 0)).astype('int64')

# Only the label column is joined; the raw raster values are not kept as a feature
main_df = main_df.join(bird_df.drop(columns='data'))
main_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Deciduous woodland,Coniferous woodland,Arable,Improve grassland,Neutral grassland,Calcareous grassland,Acid grassland,Fen,Heather,Heather grassland,...,Glyphosate_5km,Mancozeb_5km,Mecoprop-P_5km,Metamitron_5km,Pendimethalin_5km,PropamocarbHydrochloride_5km,Prosulfocarb_5km,Sulphur_5km,Tri-allate_5km,Occurrence
y,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1297500.0,2500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
1297500.0,7500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
1297500.0,12500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
1297500.0,17500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
1297500.0,22500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2500.0,677500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
2500.0,682500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
2500.0,687500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
2500.0,692500.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0


In [7]:
# Check how the labels are distributed before any cleaning.
# The frame still needs to be cleaned, as there are too many rows with no data.
main_df.value_counts(subset='Occurrence')

Occurrence
0    28439
1     7961
dtype: int64

In [8]:
# Cleaning data: keep only the grid cells whose land cover values do not sum to zero, i.e. drop
# the cells for which the land cover map holds no data at all.
# The columns come from LCM_CLASSES, so this filter cannot drift out of sync with the class list.
# DataFrame.sum also accumulates in a wide integer type, so the total cannot wrap around the way
# chained additions in a narrow raster dtype could.
land_cover_total = main_df[LCM_CLASSES].sum(axis=1)
main_df = main_df.loc[land_cover_total != 0]
main_df.value_counts('Occurrence')

Occurrence
1    7224
0    2483
dtype: int64

In [9]:
# Collect and print the name of every column that contains at least one null value
nan_columns = [column for column in main_df if main_df[column].isnull().values.any()]
print(nan_columns)

# Per the printed list, only the pesticide columns have null values

['Chlorothalonil_5km', 'Glyphosate_5km', 'Mancozeb_5km', 'Mecoprop-P_5km', 'Metamitron_5km', 'Pendimethalin_5km', 'PropamocarbHydrochloride_5km', 'Prosulfocarb_5km', 'Sulphur_5km', 'Tri-allate_5km']


In [10]:
# Replace every remaining null with a large negative sentinel value
# (-3.4e+38 is roughly the most negative number a 32-bit float can represent)
NULL_SENTINEL = -3.4e+38
main_df = main_df.fillna(NULL_SENTINEL)

In [11]:
# (main_df < 0).sum()

# for column in main_df:
#     main_df[column] = main_df[column] - main_df[column].min()

In [12]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

# # Separate labels from features first before doing this
# main_df = scaler.fit_transform(main_df)
# # probably better to do this in the other python notebook?

In [13]:
# Display the cleaned, null-filled dataframe for a final visual check before it is exported
main_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Deciduous woodland,Coniferous woodland,Arable,Improve grassland,Neutral grassland,Calcareous grassland,Acid grassland,Fen,Heather,Heather grassland,...,Glyphosate_5km,Mancozeb_5km,Mecoprop-P_5km,Metamitron_5km,Pendimethalin_5km,PropamocarbHydrochloride_5km,Prosulfocarb_5km,Sulphur_5km,Tri-allate_5km,Occurrence
y,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1217500.0,462500.0,0,0,0,0,0,0,0,0,0,10,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,0
1212500.0,457500.0,0,0,0,0,0,0,0,0,4,50,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,0
1212500.0,462500.0,0,0,0,95,4,0,0,0,0,1,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,1
1207500.0,457500.0,0,0,0,8,0,0,37,0,0,0,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,0
1207500.0,462500.0,0,0,0,35,24,0,0,0,25,0,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17500.0,172500.0,8,0,27,39,0,0,0,0,17,9,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,2.611408e+00,-3.400000e+38,1.666599e+00,1.018958e+00,4.691324e-01,-3.400000e+38,1
17500.0,177500.0,3,0,9,81,0,0,0,0,1,0,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,5.194029e+00,-3.400000e+38,3.334249e+00,2.388691e+00,1.254918e+00,-3.400000e+38,0
12500.0,87500.0,0,0,0,0,0,0,0,0,0,0,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,1
12500.0,92500.0,9,0,2,9,0,2,0,0,1,0,...,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,-3.400000e+38,1


In [14]:
# Write the finished dataframe to disk as CSV (the (y, x) index is written as the first columns)
output_csv_path = 'Datasets/Machine Learning/Dataframes/5km_All_Birds_DF.csv'
main_df.to_csv(output_csv_path)