## Import packages

In [1]:
# Import packages
import argparse
import os
#import geoviews
import warnings
import rasterio as rio
import rasterio
import pandas as pd
import fiona
import matplotlib.pyplot as plt
import numpy.ma as ma
import numpy as np
import xarray as xr
import rioxarray as rxr
from shapely.geometry import mapping, box
import geopandas as gpd
import earthpy as et
import earthpy.spatial as es
import earthpy.plot as ep
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from rasterio.features import rasterize
from rasterio.transform import from_origin
from rasterio.mask import mask

warnings.simplefilter('ignore')

### Code loops through the folder and prints out metadata

In [3]:
# Define the path to the specific TIFF file
mosaic_file_path = r'C:\Users\User\OneDrive - Stellenbosch University\MAPWAPS\DataChapter1\uMzi_ROI\data\EMIT\mosaic_2\EMIT_mosaic.tif'

print(f"\nProcessing file: {mosaic_file_path}")

# Open the image
with rasterio.open(mosaic_file_path) as img:
    # Print the original image metadata
    metadata = img.meta
    print("Metadata:", metadata)
    
    # Get NoData value
    nodata = img.nodatavals[0] if img.nodatavals else None
    if nodata is not None:
        print(f"NoData value: {nodata}")
    
    # Loop through each band and print its statistics
    for band in range(1, img.count + 1):
        band_data = img.read(band)
        
        # Mask NoData values
        if nodata is not None:
            band_data = np.ma.masked_equal(band_data, nodata)
        
        # Calculate statistics ignoring NaN values
        valid_data = band_data[np.isfinite(band_data)]
        
        if valid_data.size > 0:
            print(f"\nBand {band} Statistics:")
            print(f"Min: {valid_data.min()}")
            print(f"Max: {valid_data.max()}")
            print(f"Mean: {valid_data.mean()}")
            print(f"Standard Deviation: {valid_data.std()}")
            print(f"First 5 valid pixel values in Band {band}: {valid_data.flatten()[:5]}")
        else:
            print(f"\nBand {band} contains only NaN values.")



Processing file: C:\Users\User\OneDrive - Stellenbosch University\MAPWAPS\DataChapter1\uMzi_ROI\data\EMIT\mosaic_2\EMIT_mosaic.tif
Metadata: {'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 741, 'height': 686, 'count': 277, 'crs': CRS.from_epsg(4326), 'transform': Affine(0.0005422325202563664, 0.0, 28.79877741333535,
       0.0, -0.0005422325202563676, -30.673875192500052)}

Band 1 Statistics:
Min: 0.0038172972854226828
Max: 0.25613316893577576
Mean: 0.028315644711256027
Standard Deviation: 0.007758589927107096
First 5 valid pixel values in Band 1: [0.03343217 0.02979366 0.02979366 0.02811558 0.03058411]

Band 2 Statistics:
Min: 0.003957449924200773
Max: 0.29933997988700867
Mean: 0.0310385599732399
Standard Deviation: 0.009054050780832767
First 5 valid pixel values in Band 2: [0.0362503  0.03142168 0.03142168 0.02955993 0.0325173 ]

Band 3 Statistics:
Min: 0.004167942795902491
Max: 0.3212912380695343
Mean: 0.033452749252319336
Standard Deviation: 0.01003800518810749
Fi

### This code cell calculates and adds indices to each image in the folder

In [7]:
# Define output path
added_indices = r'C:\Users\SkosanaT\OneDrive - Stellenbosch University\MAPWAPS\DataChapter1\uMzi_ROI\data\EMIT\13092014_emit\AddedIndices'

# Define custom band names for the added indices
band_names = ['GI', 'IRG', 'NGRDI', 'VARI', 'VDVI']

try:
    # Open the image
    with rasterio.open(mosaic_file_path) as img:
        # Read all bands
        full_img = img.read()

        # Extract bands
        red_band = full_img[35, :, :]
        green_band = full_img[20, :, :]
        blue_band = full_img[10, :, :]

        # Calculate vegetation indices
        # 1) Greenness Index (GI) (Green/Red)
        GI = np.divide(green_band, red_band, out=np.full_like(green_band, np.nan, dtype=float), where=red_band != 0)

        # 2) IRG (Red-Green)
        IRG = red_band - green_band

        # 3) NGRDI (Green-Red)/(Green + Red)
        NGRDI = np.divide((green_band - red_band),
                          (green_band + red_band),
                          where=(green_band + red_band) != 0)

        # 4) Calculate VARI (Green-Red)/(Green+Red+Blue)
        VARI = np.divide((green_band - red_band),
                         (green_band + red_band + blue_band),
                         out=np.full_like(green_band, np.nan, dtype=float),  # Specify dtype=float
                         where=(green_band + red_band + blue_band) != 0)

        # 5) VDVI (2* Green-Red-Blue)/(2*Green+Red+Blue)
        VDVI = np.divide((2 * green_band - red_band - blue_band),
                         (2 * green_band + red_band + blue_band),
                         out=np.full_like(green_band, np.nan, dtype=float),  # Specify dtype=float
                         where=(2 * green_band + red_band + blue_band) != 0)

        # Add the calculated indices as new bands to the image
        indices = np.stack([GI, IRG, NGRDI, VARI, VDVI], axis=0)
        updated_img = np.concatenate((full_img, indices), axis=0)

        # Update band names in metadata profile
        profile = img.profile.copy()  # Copy the profile to avoid modifying the original
        if 'descriptions' not in profile:
            profile['descriptions'] = [''] * img.count
        band_names_all = profile['descriptions'] + band_names
        profile.update(count=profile['count'] + len(indices), dtype='float32', descriptions=band_names_all)

        # Write the modified image array to a new raster file
        with rasterio.open(added_indices, 'w', **profile) as dst:
            dst.write(updated_img.astype('float32'))  # Convert to float32 before writing

        print(f"Indices added to the image and saved as {added_indices}")

except Exception as e:
    print(f"Error processing the image: {e}")


Indices added to the image and saved as C:\Users\User\OneDrive - Stellenbosch University\MAPWAPS\DataChapter1\uMzi_ROI\data\EMIT\AddedIndices\emit_added_indices.tif


### This cell prints out band numbers of each image to see if they were added as bands

In [8]:
# Open the image
with rasterio.open(added_indices) as img:
    # Print the number of bands
    print(f"Image '{added_indices}' has {img.count} bands.")

Image 'C:\Users\User\OneDrive - Stellenbosch University\MAPWAPS\DataChapter1\uMzi_ROI\data\EMIT\AddedIndices\emit_added_indices.tif' has 282 bands.


### This code cell extracts pixel values

In [3]:
# Define paths
pntsshp_path = r'C:\Users\SkosanaT\OneDrive - Stellenbosch University\MAPWAPS\DataChapter1\uMzi_ROI\Train_Val\Training_data\train_shapefile.shp'

# Open shapefile and extract points' coordinates [0]= x coordinate, [1]= y coordinate and attributes
with fiona.open(pntsshp_path, 'r') as shapefile:
    points = [[(point['geometry']['coordinates'][0], point['geometry']['coordinates'][1]),
               (int(point['properties']['ID']), point['properties']['Val_id']),
               (point['properties']['X'], point['properties']['Y'])] for point in shapefile]

# Initialize dictionary to store band pixel values associated with each point
point_pixel_values = {}

# Open the raster image
with rasterio.open(added_indices) as src:
    imagename = os.path.splitext(os.path.basename(added_indices))[0]  # Remove the file extension
    
    # Iterate over each band in the image
    for band in range(1, src.count + 1):
        
        # Iterate over each point and extract pixel values
        for point in points:
            row, col = src.index(point[0][0], point[0][1])  # Find the point within the raster image
            values = src.read(band, window=((row, row+1), (col, col+1)))  # Read the band values

            if values.size != 0:  # Check if bands are valid/not empty
                # Prepare key for the current point
                point_key = f"Point_id: {point[1][1]}, X: {point[2][0]}, Y: {point[2][1]}, Class_ID: {point[1][0]}"
                if point_key not in point_pixel_values:
                    point_pixel_values[point_key] = {}

                # Add pixel values to the dictionary
                point_pixel_values[point_key][f'{imagename}_Band_{band}'] = values[0][0]

# Print point_pixel_values
for point_key, pixel_values in point_pixel_values.items():
    print(f"Point: {point_key}, Pixel Values: {pixel_values}")

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



### This cell changes point_pixel_value dictionary to a dataframe

In [4]:
# Create a list of dictionaries for DataFrame creation
data_list = []

for point_key, pixel_values in point_pixel_values.items():
    data_dict = {}
    # Extracting Point ID, X, Y, and Class ID from point_key
    point_id = point_key.split(',')[0].split(':')[1].strip()  
    x_coord = point_key.split(',')[1].split(':')[1].strip()
    y_coord = point_key.split(',')[2].split(':')[1].strip()
    class_id = point_key.split(',')[3].split(':')[1].strip()
    
    # Add extracted information as separate columns
    data_dict['Point'] = point_id
    data_dict['X'] = x_coord
    data_dict['Y'] = y_coord
    data_dict['Class_ID'] = class_id
    
    # Add band pixel values as columns
    data_dict.update(pixel_values)

    data_list.append(data_dict)

# Create DataFrame from list of dictionaries
pixelvalues_df = pd.DataFrame(data_list)

# Print the DataFrame
print(pixelvalues_df)

# Save the DataFrame to a CSV file
# Define the CSV file path
csv_file_path = r'C:\Users\User\OneDrive - Stellenbosch University\MAPWAPS\DataChapter1\uMzi_ROI\data\EMIT\Band_values\pixel_values.csv'

# Save the DataFrame to a CSV file
pixelvalues_df.to_csv(csv_file_path, index=False)

print(f'DataFrame successfully saved to {csv_file_path}')

        Point            X             Y Class_ID  emit_added_indices2_Band_1  \
0     VAL_214  702104.9463  6588745.0354        6                    0.020354   
1    VAL_2226  708475.5249  6590234.8424        3                    0.020397   
2    VAL_3294  688484.9531  6575784.3999        8                    0.033354   
3     VAL_205   695464.998  6568135.0354        2                    0.014485   
4     VAL_246  696218.0801  6598498.9627        5                    0.028165   
..        ...          ...           ...      ...                         ...   
193  VAL_2075  694724.7618  6567514.4758        2                    0.016504   
194   VAL_201  694615.3231  6567555.1575        2                    0.014737   
195   VAL_125  700894.8945  6577725.0944        4                    0.040150   
196  VAL_2062  679634.4435  6575755.0431        5                    0.024881   
197  VAL_2889  695455.1923  6568115.2588        2                    0.017829   

     emit_added_indices2_Ba

### This cell block trains the random forest classifier 

In [5]:
# Define features 
# Find the index of the last non-band column
last_non_band_index = pixelvalues_df.columns.tolist().index('Class_ID')  #

# Select columns starting from the column following the last non-band column
#These columns are the band features used in the classification
features = pixelvalues_df.iloc[:, last_non_band_index + 1:]
#print(features)

# Define target (Class ID column)
target = pixelvalues_df['Class_ID']

# Convert features and target columns to arrays
features_array = features.values
target_array = target.values

# Train the classifier using RandomForest with 500 trees
classifier = RandomForestClassifier(n_estimators=500)
classifier.fit(features_array, target_array)

In [None]:
# Define features 
# Find the index of the last non-band column
last_non_band_index = pixelvalues_df.columns.tolist().index('Class_ID')  #

# Select columns starting from the column following the last non-band column
#These columns are the band features used in the classification
features = pixelvalues_df.iloc[:, last_non_band_index + 1:]
#print(features)

# Define target (Class ID column)
target = pixelvalues_df['Class_ID']

# Convert features and target columns to arrays
features_array = features.values
target_array = target.values

# Train the classifier using RandomForest with 500 trees
classifier = RandomForestClassifier(n_estimators=500)
classifier.fit(features_array, target_array)

# Print information about features_array
print("Features Array:")
print(f"Shape: {features_array.shape}")
print(f"Data Type: {features_array.dtype}")
print("First 5 rows:\n", features_array[:5])

# Print information about target_array
print("\nTarget Array:")
print(f"Shape: {target_array.shape}")
print(f"Data Type: {target_array.dtype}")
print("First 5 values:\n", target_array[:5])


# Train the classifier using RandomForest with 500 trees
classifier = RandomForestClassifier(n_estimators=500)
classifier.fit(features_array, target_array)

### This cell block performs the classification on each image in the folder using the trained classifer in the previous cell

In [6]:
# Define the directory where you want to save the classified images
output_folder_classified = r'C:\Users\User\OneDrive - Stellenbosch University\MAPWAPS\DataChapter1\uMzi_ROI\data\EMIT\Classified'
output_folder= r'C:\Users\User\OneDrive - Stellenbosch University\MAPWAPS\DataChapter1\uMzi_ROI\data\EMIT\AddedIndices'
os.makedirs(output_folder, exist_ok=True)

# Iterate over all .tif or .TIF files in the specified directory
for filename in os.listdir(output_folder):
    if filename.lower().endswith('.tif') or filename.endswith('.TIF'):
        image_path = os.path.join(output_folder, filename)
        
        # Open the image using rasterio
        with rasterio.open(image_path) as src:
            # Get the shape of the image
            rows, cols = src.height, src.width
        
            # Define batch size (adjust as needed based on memory constraints)
            batch_size = 1000  # You can adjust this value
        
            # Initialize the classification result array for the entire image
            classification_result = np.zeros((rows, cols), dtype=np.uint8)
        
            # Iterate over the image in batches
            for row_start in range(0, rows, batch_size):
                for col_start in range(0, cols, batch_size):
                    row_end = min(row_start + batch_size, rows)
                    col_end = min(col_start + batch_size, cols)
        
                    # Read the batch of bands
                    bands = [src.read(band_idx, window=((row_start, row_end), (col_start, col_end))) 
                             for band_idx in range(1, src.count + 1)]
        
                    # Stack the bands into a single array
                    stacked_bands = np.stack(bands, axis=-1)
        
                    # Reshape the array to 2D (rows, columns) for classification
                    reshaped_bands = stacked_bands.reshape(-1, src.count)
        
                    # Predict using the trained classifier (replace this with your classifier)
                    predicted_labels = classifier.predict(reshaped_bands)
        
                    # Create a temporary result array for this batch
                    batch_result = predicted_labels.reshape(row_end - row_start, col_end - col_start)
        
                    # Write the batch result to the corresponding window in the full classification result array
                    classification_result[row_start:row_end, col_start:col_end] = batch_result
        
            # Construct the output file path
            output_filename = 'emit_classified20240827.tif'
            output_path = os.path.join(output_folder_classified, output_filename)
        
            # Prepare metadata for the classified image
            meta = src.meta.copy()
            meta.update({
                'driver': 'GTiff',
                'dtype': 'uint8',  # Ensure data type is appropriate for classification results
                'count': 1,  # Single band
                'compress': 'lzw',  # Compression method (Lempel-Ziv-Welch)
                'crs': src.crs,  # Use the same CRS as the input raster
                'transform': src.transform,  # Use the same transform as the input raster
            })
        
            # Write the full classified image to a new GeoTIFF file
            with rasterio.open(output_path, 'w', **meta) as dst:
                dst.write(classification_result.astype('uint8'), 1)
        
        print("Classification completed and classified image saved successfully!")


Classification completed and classified image saved successfully!
