In [1]:
import rioxarray
import os

# <<< INVASIVE SPECIES MAP >>>
INVASIVE_BIRDS_PATH = 'Datasets/Machine Learning/10km Rasters/Birds/All_Invasive_Birds_10km.tif'

# <<< LAND COVER MAP >>>
# Dimensions: 700000x1300000
LAND_COVER_MAP_PATH = 'Datasets/Machine Learning/10km Rasters/Features/gb2021lcm10km_percentage_target.tif'

# <<< FERTILISER >>>
# The dataset consists of maps of the predicted average annual application rates (2010-2015) of three different inorganic 
# chemical fertilisers – nitrogen (N), phosphorus (P) and potassium (K) – in England across a six-year period, along with 
# their respective estimates of uncertainty, at a 1 km x 1 km resolution. 
FERTILISER_K_PATH = 'Datasets/Digimap/Land-Cover-plus-Fertilisers_4731497/fertiliser_k_prediction_uncertainty.tif'
FERTILISER_N_PATH = 'Datasets/Digimap/Land-Cover-plus-Fertilisers_4731497/fertiliser_n_prediction_uncertainty.tif'
FERTILISER_P_PATH = 'Datasets/Digimap/Land-Cover-plus-Fertilisers_4731497/fertiliser_p_prediction_uncertainty.tif'

# <<< PESTICIDE >>>
PESTICIDE_FOLDER_PATH = 'Datasets/Machine Learning/10km Rasters/Features/Pesticides/'

# <<< INTEGRATED HYDROLOGICAL DIGITAL TERRAIN MODEL >>>
# Dimensions: 700000x1300000
# These datasets all only have one band
ELEVATION_PATH = 'Datasets/Machine Learning/10km Rasters/Features/HGHT_10km.tif'
CUMULATIVE_CATCHMENT_AREA_PATH = 'Datasets/Machine Learning/10km Rasters/Features/CCAR_10km.tif'
SURFACE_TYPE_PATH = 'Datasets/Machine Learning/10km Rasters/Features/SURF_10km.tif'
OUTFLOWING_DRAINAGE_DIRECTION_PATH = 'Datasets/Machine Learning/10km Rasters/Features/OUTF_10km.tif'
INFLOWING_DRAINAGE_PATTERN_PATH = 'Datasets/Machine Learning/10km Rasters/Features/INFL_10km.tif'

IHDTM = {
    'Elevation': ELEVATION_PATH, 
    'Cumulative catchment area': CUMULATIVE_CATCHMENT_AREA_PATH, 
    'Surface type': SURFACE_TYPE_PATH, 
    'Outflowing drainage direction': OUTFLOWING_DRAINAGE_DIRECTION_PATH, 
    'Inflowing drainage direction': INFLOWING_DRAINAGE_PATTERN_PATH
}


# <<< Opening in rasterio >>>
# dataset = rasterio.open(FERTILISER_P_PATH)
# data = dataset.read()
# np.max(data)

# <<< Opening in rioxarray >>>
# dataset = rioxarray.open_rasterio(ELEVATION_PATH)
# dataset.name = 'data'
# df = dataset.to_dataframe()


In [2]:
# Open the land cover map raster file, transform its bands into feature columns and use this as the base for the main dataframe
# as this has the widest extent out of all the files and is arguably one of the most important dataframes

lcm = rioxarray.open_rasterio(LAND_COVER_MAP_PATH)
lcm.name = 'data'
main_df = lcm.to_dataframe().drop(columns='spatial_ref')
print(main_df.value_counts())
main_df = main_df.unstack(level='band')

LCM_CLASSES = [
    'Deciduous woodland', 
    'Coniferous woodland', 
    'Arable', 
    'Improve grassland', 
    'Neutral grassland', 
    'Calcareous grassland', 
    'Acid grassland',  
    'Fen', 
    'Heather', 
    'Heather grassland', 
    'Bog',
    'Inland rock', 
    'Saltwater',
    'Freshwater',
    'Supralittoral rock',
    'Supralittoral sediment',
    'Littoral rock',
    'Littoral sediment',
    'Saltmarsh',
    'Urban',
    'Suburban'
    ]

main_df = main_df['data'].rename(columns={i+1: j for i, j in enumerate(LCM_CLASSES)})
main_df

data
0       182086
1         1212
2          694
3          495
4          386
         ...  
80          25
93          24
85          23
79          23
78          23
Length: 101, dtype: int64


Unnamed: 0_level_0,band,Deciduous woodland,Coniferous woodland,Arable,Improve grassland,Neutral grassland,Calcareous grassland,Acid grassland,Fen,Heather,Heather grassland,...,Inland rock,Saltwater,Freshwater,Supralittoral rock,Supralittoral sediment,Littoral rock,Littoral sediment,Saltmarsh,Urban,Suburban
y,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1295000.0,5000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1295000.0,15000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1295000.0,25000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1295000.0,35000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1295000.0,45000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5000.0,655000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5000.0,665000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5000.0,675000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5000.0,685000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# For each IHDTM file, append its raster data to the main dataframe
for key in IHDTM:
    ihdtm_data = rioxarray.open_rasterio(IHDTM[key])
    ihdtm_data = ihdtm_data.squeeze().drop("spatial_ref").drop("band")
    ihdtm_data.name = key
    ihdtm_df = ihdtm_data.to_dataframe()

    # Adding 25 to x and y coordinates to match index of other datasets
    ihdtm_df.index = ihdtm_df.index.set_levels(ihdtm_df.index.levels[0]+25, level=0)
    ihdtm_df.index = ihdtm_df.index.set_levels(ihdtm_df.index.levels[1]+25, level=1)
    main_df = main_df.join(ihdtm_df)
main_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Deciduous woodland,Coniferous woodland,Arable,Improve grassland,Neutral grassland,Calcareous grassland,Acid grassland,Fen,Heather,Heather grassland,...,Littoral rock,Littoral sediment,Saltmarsh,Urban,Suburban,Elevation,Cumulative catchment area,Surface type,Outflowing drainage direction,Inflowing drainage direction
y,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1295000.0,5000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
1295000.0,15000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
1295000.0,25000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
1295000.0,35000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
1295000.0,45000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5000.0,655000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
5000.0,665000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
5000.0,675000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255
5000.0,685000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-9999,-9999,-1,-1,255


In [4]:
# For each fertiliser, append its raster data to the main dataframe
fertiliser = {'Fertiliser K' : FERTILISER_K_PATH, 'Fertiliser N' : FERTILISER_N_PATH, 'Fertiliser P' : FERTILISER_P_PATH}

for key in fertiliser:
    fert_dataset = rioxarray.open_rasterio(fertiliser[key])
    fert_dataset.name = key
    fert_df = fert_dataset.to_dataframe().drop(columns='spatial_ref')
    fert_df = fert_df.drop(index=2).droplevel('band')
    main_df = main_df.join(fert_df)
main_df


Unnamed: 0_level_0,Unnamed: 1_level_0,Deciduous woodland,Coniferous woodland,Arable,Improve grassland,Neutral grassland,Calcareous grassland,Acid grassland,Fen,Heather,Heather grassland,...,Urban,Suburban,Elevation,Cumulative catchment area,Surface type,Outflowing drainage direction,Inflowing drainage direction,Fertiliser K,Fertiliser N,Fertiliser P
y,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1295000.0,5000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
1295000.0,15000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
1295000.0,25000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
1295000.0,35000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
1295000.0,45000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5000.0,655000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
5000.0,665000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
5000.0,675000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,
5000.0,685000.0,0,0,0,0,0,0,0,0,0,0,...,0,0,-9999,-9999,-1,-1,255,,,


In [5]:
# For each file in pesticide folder, append its raster data to the main dataframe
for file in os.listdir(PESTICIDE_FOLDER_PATH):
    filename = os.fsdecode(file)
    if not filename.endswith('.tif'):
         continue
    
    pest_dataset = rioxarray.open_rasterio(PESTICIDE_FOLDER_PATH+filename)
    pest_dataset.name = filename[:-4]
    pest_df = pest_dataset.to_dataframe().drop(columns='spatial_ref')
    pest_df = pest_df.drop(index=2).droplevel('band')
    main_df = main_df.join(pest_df)
main_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Deciduous woodland,Coniferous woodland,Arable,Improve grassland,Neutral grassland,Calcareous grassland,Acid grassland,Fen,Heather,Heather grassland,...,Chlorothalonil_10km,Glyphosate_10km,Mancozeb_10km,Mecoprop-P_10km,Metamitron_10km,Pendimethalin_10km,PropamocarbHydrochloride_10km,Prosulfocarb_10km,Sulphur_10km,Tri-allate_10km
y,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1295000.0,5000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1295000.0,15000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1295000.0,25000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1295000.0,35000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1295000.0,45000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5000.0,655000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
5000.0,665000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
5000.0,675000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
5000.0,685000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,


In [6]:
# Label data with entire bird dataset
# -> Convert every non-null value into 1s and every null value into 0

bird_dataset = rioxarray.open_rasterio(INVASIVE_BIRDS_PATH)
bird_dataset.name = 'data'
bird_df = bird_dataset.squeeze().drop("spatial_ref").drop("band").to_dataframe()

bird_df['Occurrence'] = [0 if x == 0 else 1 for x in bird_df['data']]
main_df = main_df.join(bird_df.drop(columns='data'))
main_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Deciduous woodland,Coniferous woodland,Arable,Improve grassland,Neutral grassland,Calcareous grassland,Acid grassland,Fen,Heather,Heather grassland,...,Glyphosate_10km,Mancozeb_10km,Mecoprop-P_10km,Metamitron_10km,Pendimethalin_10km,PropamocarbHydrochloride_10km,Prosulfocarb_10km,Sulphur_10km,Tri-allate_10km,Occurrence
y,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1295000.0,5000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
1295000.0,15000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
1295000.0,25000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
1295000.0,35000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
1295000.0,45000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5000.0,655000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
5000.0,665000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
5000.0,675000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
5000.0,685000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0


In [7]:
# Checking to see values
main_df.value_counts('Occurrence')

# Needs to be cleaned as there's too many rows with no data

Occurrence
0    6428
1    2672
dtype: int64

In [8]:
# Cleaning data
main_df = main_df.loc[main_df['Deciduous woodland']
                        + main_df['Coniferous woodland']
                        + main_df['Arable']
                        + main_df['Improve grassland']
                        + main_df['Neutral grassland']
                        + main_df['Calcareous grassland']
                        + main_df['Acid grassland']
                        + main_df['Fen']
                        + main_df['Heather']
                        + main_df['Heather grassland']
                        + main_df['Bog']
                        + main_df['Inland rock']
                        + main_df['Saltwater']
                        + main_df['Freshwater']
                        + main_df['Supralittoral rock']
                        + main_df['Supralittoral sediment']
                        + main_df['Littoral rock']
                        + main_df['Littoral sediment']
                        + main_df['Saltmarsh']
                        + main_df['Urban']
                        + main_df['Suburban'] 
                        + main_df['Occurrence']!= 0
                    ]
main_df.value_counts('Occurrence')

Occurrence
1    2672
0     207
dtype: int64

In [9]:
main_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Deciduous woodland,Coniferous woodland,Arable,Improve grassland,Neutral grassland,Calcareous grassland,Acid grassland,Fen,Heather,Heather grassland,...,Glyphosate_10km,Mancozeb_10km,Mecoprop-P_10km,Metamitron_10km,Pendimethalin_10km,PropamocarbHydrochloride_10km,Prosulfocarb_10km,Sulphur_10km,Tri-allate_10km,Occurrence
y,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1215000.0,465000.0,0,0,0,29,0,0,0,0,0,1,...,,,,,,,,,,1
1205000.0,455000.0,0,0,0,0,0,0,0,0,3,0,...,,,,,,,,,,1
1205000.0,465000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,1
1195000.0,435000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,1
1195000.0,445000.0,0,0,0,0,0,0,3,0,0,0,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15000.0,185000.0,0,0,0,0,0,0,0,0,0,0,...,,,,-3.400000e+38,,-3.400000e+38,,,,1
15000.0,255000.0,0,0,0,0,0,0,0,0,0,0,...,,,,-3.400000e+38,,-3.400000e+38,,,,1
5000.0,85000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,1
5000.0,95000.0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,1


In [10]:
# Output dataframe as csv
main_df.to_csv('Datasets/Machine Learning/Dataframes/10km_All_Birds_DF.csv')