In [1]:
import h5py
import numpy as np

In [2]:
import os
import numpy as np
import pandas as pd
import h5py

# Paths to the data folders and files
ndvi_folder = "C:\\Users\\Musae\\Documents\\GitHub-REPOs\\Senior-project_Doc\\Docs\\Array\\NDVI-Array"
ndmi_folder = "C:\\Users\\Musae\\Documents\\GitHub-REPOs\\Senior-project_Doc\\Docs\\Array\\NDMI-Array"
csv_path = "C:\\Users\\Musae\\Documents\\GitHub-REPOs\\Senior-project_Doc\\monthly_averages_formatted.csv"

# Load CSV data
climate_data = pd.read_csv(csv_path)

# Create or open HDF5 file
with h5py.File('environmental_data.h5', 'w') as hdf:
    # Create groups for each type of data
    climate_grp = hdf.create_group('Climate')
    ndvi_grp = hdf.create_group('NDVI')
    ndmi_grp = hdf.create_group('NDMI')
    
    # Add CSV data to the Climate group
    for column in climate_data.columns:
        climate_grp.create_dataset(column, data=climate_data[column].to_numpy())

    # Function to add data to HDF5
    def add_data_to_group(group, folder_path, file_prefix):
        for filename in os.listdir(folder_path):
            if filename.endswith(".npy"):
                file_path = os.path.join(folder_path, filename)
                data = np.load(file_path)
                dataset_name = file_prefix + "_" + filename.split('.')[0]
                group.create_dataset(dataset_name, data=data)

    # Add NDVI data to the NDVI group
    add_data_to_group(ndvi_grp, ndvi_folder, 'NDVI')

    # Add NDMI data to the NDMI group
    add_data_to_group(ndmi_grp, ndmi_folder, 'NDMI')

print("All data has been successfully written to the HDF5 file.")


All data has been successfully written to the HDF5 file.


In [None]:
"C:\\Users\\Musae\\Documents\\GitHub-REPOs\\Vegetation-Cover-In-Riyadh\\Codes\\Structring the data\\environmental_data.h5"

In [5]:
import h5py
import numpy as np
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Open the existing HDF5 file
hdf5_file_path = "C:\\Users\\Musae\\Documents\\GitHub-REPOs\\Vegetation-Cover-In-Riyadh\\Codes\\Structring the data\\environmental_data.h5"
with h5py.File(hdf5_file_path, 'a') as hdf:
    # Load climate data
    temp = hdf['Climate']['Temp Average'][:]  # Adjusted dataset name
    precip = hdf['Climate']['PRECTOTCORR Average'][:]  # Adjusted dataset name

    # Convert to DataFrame for easier handling
    climate_data = pd.DataFrame({
        'Temperature': temp,
        'Precipitation': precip
    })

    # Handle missing values by filling with the median
    climate_data.fillna(climate_data.median(), inplace=True)

    # Scale the data
    scaler = StandardScaler()
    climate_scaled = scaler.fit_transform(climate_data)

    # Replace the original datasets with scaled data
    del hdf['Climate']['Temp Average'], hdf['Climate']['PRECTOTCORR Average']
    climate_grp = hdf['Climate']
    climate_grp.create_dataset('Temperature_Scaled', data=climate_scaled[:, 0])
    climate_grp.create_dataset('Precipitation_Scaled', data=climate_scaled[:, 1])

print("Data preprocessing completed and updated in HDF5 file.")


Data preprocessing completed and updated in HDF5 file.


In [8]:
import h5py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

with h5py.File(hdf5_file_path, 'r') as hdf:
    # Initialize lists to store features and labels
    features = []
    labels = []

    # Assuming climate data has the same length as NDVI datasets and is aligned
    temp_scaled = hdf['Climate']['Temperature_Scaled'][:]
    precip_scaled = hdf['Climate']['Precipitation_Scaled'][:]
    
    # Load and prepare NDVI features
    ndvi_group = hdf['NDVI']
    sorted_datasets = sorted(ndvi_group.keys())
    for i in range(1, len(sorted_datasets)):
        current_ndvi = ndvi_group[sorted_datasets[i]][:]
        previous_ndvi = ndvi_group[sorted_datasets[i-1]][:]

        # Combine current climate and previous NDVI into features
        # Flatten NDVI data if necessary or use summary statistics like mean, max
        combined_features = np.hstack([temp_scaled[i], precip_scaled[i], np.mean(previous_ndvi)])
        features.append(combined_features)
        labels.append(np.mean(current_ndvi))

# Convert lists to numpy arrays for training
features = np.array(features)
labels = np.array(labels)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
predictions = model.predict(X_test)

# Calculate and print the root mean squared error
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Root Mean Squared Error: {rmse}")


Root Mean Squared Error: 0.00576519238459658


In [9]:
from sklearn.model_selection import cross_val_score

# Assuming 'features' and 'labels' are your full dataset and 'model' is your RandomForestRegressor
scores = cross_val_score(model, features, labels, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-scores)

print("Cross-validation RMSE scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())
print("Standard deviation:", rmse_scores.std())


Cross-validation RMSE scores: [0.00902148 0.00756931 0.00320033 0.00672428 0.0055663 ]
Mean RMSE: 0.006416337858432081
Standard deviation: 0.001963478473652581
