In [None]:
#!pip install rasterio

In [None]:
#!pip install gdal

In [None]:
#!conda install -c conda-forge gdal

In [1]:
import os
import numpy as np
#from osgeo import gdal, gdal_array, ogr
import rasterio as rio
import pandas as pd
import pickle
import cubist
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import train_test_split
import multiprocessing as mp
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer

In [2]:
# Function to calculate first-order differences of spectra
def fod(spectra):
    """Return the first-order difference of ``spectra`` taken across its columns.

    Column ``i`` of the result holds ``spectra[:, i + 1] - spectra[:, i]``.
    The last column has no right-hand neighbour, so it repeats the last
    computed difference; the output therefore has the same shape, index and
    column labels as the input.

    The input frame is never modified. (The previous implementation wrote into
    ``spectra.iloc[:, :]``, which can share memory with the caller's frame and
    silently overwrite it.)

    Parameters
    ----------
    spectra : pandas.DataFrame
        Numeric frame whose columns are ordered along the axis to difference.

    Returns
    -------
    pandas.DataFrame
        A new frame. Inputs with fewer than two columns are returned as an
        unchanged copy because no difference can be formed.
    """
    n_cols = spectra.shape[1]
    if n_cols < 2:
        # Nothing to difference; hand back a copy so callers can mutate freely.
        return spectra.copy()

    # np.diff along axis=1 gives values[:, i + 1] - values[:, i] for every i.
    diffs = np.diff(spectra.to_numpy(), axis=1)
    # Pad on the right by repeating the final difference to keep the shape.
    padded = np.concatenate([diffs, diffs[:, -1:]], axis=1)
    return pd.DataFrame(padded, index=spectra.index, columns=spectra.columns)

# Worker function for parallel predictions
def worker(arr, model=None):
    """Run ``model.predict`` on one chunk of rows.

    Parameters
    ----------
    arr : array-like
        Chunk of input rows to predict on.
    model : object with a ``predict`` method, optional
        Model to use. ``make_parallel_predictions`` always passes it
        explicitly. When omitted, a module-level variable named ``model`` is
        used, which keeps the original one-argument call form working.

    Raises
    ------
    NameError
        If ``model`` is omitted and no module-level ``model`` exists.
    """
    if model is None:
        model = globals().get('model')
        if model is None:
            raise NameError("name 'model' is not defined")
    return model.predict(arr)

# Function to make parallel predictions
def make_parallel_predictions(model, X):
    """Predict on ``X`` by splitting its rows across a process pool.

    Parameters
    ----------
    model : object with a ``predict`` method
        Must be picklable, because it is sent to every worker process.
    X : array-like with ``.shape`` and row slicing
        Input rows.

    Returns
    -------
    numpy.ndarray
        Chunk predictions concatenated in the original row order. An empty
        1-D array is returned when ``X`` has no rows.
    """
    n_rows = X.shape[0]
    if n_rows == 0:
        # Nothing to predict; avoids a zero step in range() and an empty concatenate.
        return np.empty(0)

    # Never start more processes than there are rows to hand out.
    num_processes = min(mp.cpu_count(), n_rows)
    # Ceiling division keeps chunk_size >= 1 and yields at most num_processes chunks.
    chunk_size = -(-n_rows // num_processes)
    chunks = [X[start:start + chunk_size] for start in range(0, n_rows, chunk_size)]

    with mp.Pool(processes=len(chunks)) as pool:
        # Pass the model explicitly so workers do not rely on a global variable.
        results = pool.starmap(worker, [(chunk, model) for chunk in chunks])

    return np.concatenate(results, axis=0)
    

In [3]:
## VENuS Image
# fname = './VE_VM03_VSC_L2VALD_ISRAELWI_20220824/VE_VM03_VSC_L2VALD_ISRAELWI_20220824.DBL.DIR/VE_VM03_VSC_PDTIMG_L2VALD_ISRAELWI_20220824_SRE.DBL.TIF'
fname = 'VE_VM03_VSC_L2VALD_ISRAELWI_20220824_Clip.tif'

# The dataset is left open on purpose: later cells read src.height, src.width,
# src.crs and src.transform when reshaping and writing the predictions.
src = rio.open(fname)
metadata = src.meta
num_bands = src.count

# Read every band in one call -> array of shape (bands, rows, cols), as float
# so that NaN can be stored.
band_stack = src.read().astype(float)

# Negative values are treated as missing.
# NOTE(review): presumably these are the nodata / invalid pixels - confirm.
band_stack[band_stack < 0] = np.nan

# Scale by 1/1000. Dividing by a positive constant cannot create new negative
# values, so the mask above does not need to be applied a second time.
# NOTE(review): presumably converts scaled integers to reflectance - confirm factor.
band_stack /= 1000

# One row per pixel (row-major order), one column per band.
venus_image = band_stack.reshape(num_bands, -1).T

In [4]:
# Function to preprocess input data based on soil property
def get_process_func(soil_property):
    """Pick the preprocessing callable for ``soil_property``.

    For ``'caco3'`` and ``'toc'`` the returned callable applies ``log(1 / x)``
    element-wise and then ``fod``; for every other value it applies ``fod``
    directly.

    Returns
    -------
    callable
        Takes a DataFrame and returns whatever ``fod`` returns for it.
    """
    def log_inverse_then_fod(df):
        inverted = 1 / df
        return fod(inverted.apply(np.log))

    def fod_only(df):
        return fod(df)

    if soil_property in ('caco3', 'toc'):
        return log_inverse_then_fod
    return fod_only

# Function to get appropriate model based on soil property
def get_model_dump(soil_property):
    """Return the pickle path of the model registered for ``soil_property``.

    Known keys are ``'caco3'``, ``'clay'``, ``'toc'``, ``'silt'`` and
    ``'sand'``. Any other value yields ``None``.
    """
    model_paths = {
        'caco3': './models/CaCO3_model_cubist.pkl',
        'clay': './models/clay_model_cubist.pkl',
        'toc': './models/TOC_model_cubist.pkl',
        'silt': './models/silt_model_gbrt.pkl',
        'sand': './models/sand_model_gbrt.pkl',
    }
    return model_paths.get(soil_property)

# Function to predict soil property using pre-trained model
def predict_soil_property(soil_property, venus_df_cleaned):
    """Preprocess the spectra and predict ``soil_property`` with its pickled model.

    Steps: pick the preprocessing for the property, replace +/-inf with NaN,
    fill NaN with the per-column mean of this same input, load the pickled
    model and call its ``predict``.

    Parameters
    ----------
    soil_property : str
        A key known to ``get_model_dump``.
    venus_df_cleaned : pandas.DataFrame
        Spectra to predict on.

    Returns
    -------
    Whatever ``model.predict`` returns for the preprocessed input.

    Raises
    ------
    ValueError
        If no model file is registered for ``soil_property``.
    """
    # Resolve the model path first so an unknown property fails before the
    # expensive preprocessing instead of at open(None).
    dumpName = get_model_dump(soil_property)
    if dumpName is None:
        raise ValueError(f'No model registered for soil property {soil_property!r}')

    process_func = get_process_func(soil_property)
    process2 = process_func(venus_df_cleaned)
    # Non-inplace replace: never write into a frame that might alias the input.
    process2 = process2.replace([np.inf, -np.inf], np.nan)

    # The imputer is fitted on the data being predicted, so the fill values
    # are this input's own column means.
    imputer = SimpleImputer(strategy='mean')
    x_pred = pd.DataFrame(imputer.fit_transform(process2), columns=process2.columns)

    # SECURITY: pickle.load can execute arbitrary code; only load trusted model files.
    with open(dumpName, 'rb') as model_file:
        model = pickle.load(model_file)
    # Report only after the model has actually been loaded.
    print(f'Model loaded for {soil_property}')
    return model.predict(x_pred)

In [5]:
# Label the image columns with the band names, then discard the '619' band.
# NOTE(review): presumably the models were trained without the 619 band - confirm.
band_labels = ['420', '443', '490', '555', '619', '638', '672', '702', '742', '782', '865', '910']
venus_df = pd.DataFrame(venus_image, columns=band_labels).drop(columns='619')

In [6]:
# 1. Drop every pixel (row) that has a NaN in any band.
#    nan_indices keeps the labels of the dropped rows; venus_df_cleaned keeps
#    the original row labels of the surviving rows.
rows_with_nan = venus_df.isna().any(axis=1)
nan_indices = venus_df.index[rows_with_nan]
venus_df_cleaned = venus_df.dropna()

In [7]:
# # Initialize DataFrame with all five soil properties
# df = pd.DataFrame(index=venus_df_cleaned.index)

In [None]:
# with open ('y_pred.pickle', 'rb') as file:
#     y_pred = pickle.load(file)
 

In [8]:
# Soil properties to predict, in processing order
soil_properties = [
    'caco3',
    'clay',
    'toc',
    'silt',
    'sand',
]

In [11]:
# Load cached predictions from an earlier run if the pickle file is present
pickle_file = 'predictions.pickle'
try:
    cache_handle = open(pickle_file, 'rb')
except FileNotFoundError:
    # No cache yet: start with an empty mapping to be filled in later
    predictions = {}
else:
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    with cache_handle:
        predictions = pickle.load(cache_handle)
    print("Predictions loaded from pickle file.")

In [13]:
# Predict each soil property, reusing cached predictions where possible.
# A cached entry is reused only if its length matches the current number of
# valid pixels. Delete the pickle file to force a full recomputation
# (e.g. after changing the models or the input image).
n_valid_pixels = len(venus_df_cleaned)
for soil_property in soil_properties:
    cached = predictions.get(soil_property)
    if cached is not None and len(cached) == n_valid_pixels:
        # Still bind y_pred: later cells read it after the loop.
        y_pred = np.asarray(cached)
        print(f'Using cached predictions for {soil_property}')
    else:
        y_pred = predict_soil_property(soil_property, venus_df_cleaned)
        predictions[soil_property] = y_pred
    print(f'Soil property: {soil_property}, y_pred size: {y_pred.size}')

Model loaded for caco3


  x = x.applymap(lambda a: a.lstrip())


Soil property: caco3, y_pred size: 3138920
Model loaded for clay


  x = x.applymap(lambda a: a.lstrip())


Soil property: clay, y_pred size: 3138920
Model loaded for toc


  x = x.applymap(lambda a: a.lstrip())


Soil property: toc, y_pred size: 3138920
Model loaded for silt
Soil property: silt, y_pred size: 3138920
Model loaded for sand
Soil property: sand, y_pred size: 3138920


In [None]:
# Persist the predictions dict so a later run can load it instead of recomputing
with open(pickle_file, mode='wb') as cache_out:
    pickle.dump(predictions, cache_out)

print("Predictions saved to pickle file.")

In [None]:
# Collect the per-property predictions into one table (one column per property)
df = pd.DataFrame(data=predictions)

In [None]:
# Rebuild a full-size raster from y_pred using the dimensions of the VENuS image.
# y_pred only covers the rows kept by dropna(), so a direct reshape fails as
# soon as any pixel was dropped. venus_df was built with the default 0..N-1 row
# index, so the labels kept in venus_df_cleaned are the flat pixel positions.
valid_pixels = venus_df_cleaned.index.to_numpy()
y_pred_full = np.full(src.height * src.width, np.nan)
y_pred_full[valid_pixels] = np.ravel(y_pred)  # dropped pixels stay NaN
y_pred_reshaped = y_pred_full.reshape(src.height, src.width)

In [None]:
# Number of values in y_pred (the prediction left over from the last loop iteration)
y_pred.size

In [None]:
# Number of rows in the source image
src.height

In [None]:
# Number of columns in the source image
src.width

In [None]:
# Range of the predicted caco3 values
caco3_pred = df['caco3']
minimum_value = caco3_pred.min()
maximum_value = caco3_pred.max()

print(f"Minimum value: {minimum_value}")
print(f"Maximum value: {maximum_value}")

In [None]:
# Range of the predicted clay values
clay_pred = df['clay']
minimum_value = clay_pred.min()
maximum_value = clay_pred.max()

print(f"Minimum value: {minimum_value}")
print(f"Maximum value: {maximum_value}")

In [None]:
# Range of the predicted toc values
toc_pred = df['toc']
minimum_value = toc_pred.min()
maximum_value = toc_pred.max()

print(f"Minimum value: {minimum_value}")
print(f"Maximum value: {maximum_value}")

In [None]:
# Range of the predicted sand values
sand_pred = df['sand']
minimum_value = sand_pred.min()
maximum_value = sand_pred.max()

print(f"Minimum value: {minimum_value}")
print(f"Maximum value: {maximum_value}")

In [None]:
# Range of the predicted silt values
silt_pred = df['silt']
minimum_value = silt_pred.min()
maximum_value = silt_pred.max()

print(f"Minimum value: {minimum_value}")
print(f"Maximum value: {maximum_value}")

In [None]:
# y_pred = y_pred.flatten()
# predicted_image = y_pred.reshape(metadata['height'], metadata['width'])
# predicted_image = np.where(np.isnan(predicted_image), 0, predicted_image)
# predicted_image = np.expand_dims(predicted_image, axis=0)
# model_name = dumpName.split('.pkl')[0].split('_')[-1]
# output_path = f'./results/20220824_Clip_Predicted_{model_name}_{soil_property}.tif'
# output_image = rio.open(output_path, 'w', driver = 'GTiff', dtype = 'float32', crs = src.crs, width=metadata['width'], height=metadata['height'], count=1, transform = src.transform)
# output_image.write(predicted_image)
# output_image.close()

# print("Predicted results saved :", output_path)

In [None]:
# Write one single-band GeoTIFF per predicted soil property.
# The previous version used `model_name`, which no cell defines, and wrote only
# the property left over from the last loop iteration.
os.makedirs('./results', exist_ok=True)

# venus_df was built with the default 0..N-1 row index, so the labels kept in
# venus_df_cleaned are the flat pixel positions of the predicted rows.
valid_pixels = venus_df_cleaned.index.to_numpy()

for soil_property, property_pred in predictions.items():
    # Model family = last underscore-separated token of the model file name
    # (e.g. 'cubist' or 'gbrt').
    model_file = os.path.basename(get_model_dump(soil_property))
    model_name = os.path.splitext(model_file)[0].split('_')[-1]

    # Scatter the predictions back onto the full image grid; pixels that were
    # dropped before prediction stay NaN.
    predicted_image = np.full(src.height * src.width, np.nan, dtype='float32')
    predicted_image[valid_pixels] = np.ravel(property_pred)
    predicted_image = predicted_image.reshape(src.height, src.width)

    output_path = f'./results/20220824_Clip_Predicted_{model_name}_{soil_property}.tif'
    # The context manager closes the dataset even if the write fails.
    with rio.open(output_path, 'w', driver='GTiff', dtype='float32', crs=src.crs,
                  width=src.width, height=src.height, count=1, transform=src.transform,
                  nodata=np.nan) as output_image:
        output_image.write(predicted_image, 1)

    print("Predicted results saved:", output_path)