In [None]:
# Pandas is used for data manipulation
import pandas as pd


In [7]:
import pandas as pd
import numpy as np

# Read the data in CSV format
soil_data = pd.read_csv('soil.txt', header=0, sep=",")

# Remove rows with missing values
soil_data = soil_data.dropna()

# Select specific columns (excluding columns 1 to 4, 17, and 20)
soil_df = soil_data#.drop(soil_data.columns[[0, 1, 2, 3, 16, 19]], axis=1)

# Convert column 16 (now column 12 after dropping columns) to categorical type
soil_df.iloc[:, 12] = soil_df.iloc[:, 12].astype('category')

# Number of data
nd = soil_df.shape[0]

# Split Calibration: 75% and Validation 25%
np.random.seed(111)  # Set a random seed
ic = np.random.choice(nd, size=round(nd * 0.75), replace=False)  # Generate a random permutation of data

# Form the calibration set
cali = soil_df.iloc[ic, :]

# Form the validation set by selecting the indices not in the calibration set
vali_indices = soil_df.index.difference(ic)
vali = soil_df.loc[vali_indices]

# Print the number of samples in each set to verify
print(f'Number of samples in calibration set: {cali.shape[0]}')
print(f'Number of samples in validation set: {vali.shape[0]}')


Number of samples in calibration set: 2206
Number of samples in validation set: 876


In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Define the features and target variable
features = ['Depth', 'tmax', 'tmin', 'prcp', 'lc', 'clay', 'silt', 'sand', 'dem', 'slope', 'aspect', 'hillshade', 'twi', 'mrvbf']
target = 'SOC'

# Extract features and target variable for calibration set
X_cali = cali[features]
y_cali = cali[target]

# Train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=500, max_features=10, random_state=42)
rf_model.fit(X_cali, y_cali)

# Predict on the calibration data
rf_predict = rf_model.predict(X_cali)

# Calculate goodness of fit
gof_rf_predict = r2_score(y_cali, rf_predict)

# Calibration
print("Goodness of fit:", gof_rf_predict)


Goodness of fit: 0.9222235623515054


In [18]:
X_cali.columns

Index(['Depth', 'tmax', 'tmin', 'prcp', 'lc', 'clay', 'silt', 'sand', 'dem',
       'slope', 'aspect', 'hillshade', 'twi', 'mrvbf'],
      dtype='object')

In [9]:
from joblib import dump

# Guarda el modelo entrenado en un archivo
dump(rf_model, 'rf_model.joblib')


['rf_model.joblib']

In [11]:
from joblib import load

# Carga el modelo desde el archivo
rf_model = load('rf_model.joblib')

# Usa el modelo cargado para hacer predicciones
predictions = rf_model.predict(X_cali)  # Por ejemplo, X_test es tu conjunto de datos de prueba


In [12]:
predictions

array([0.2800792 , 0.42556075, 2.37493076, ..., 1.78812623, 1.88244104,
       0.85083708])

In [13]:
grid_2002=pd.read_csv('grid_2002.txt', header=0, sep=",")
#read.table('grid_2002.txt',header = T,sep = ",")
grid_2002.iloc[:, 16] = grid_2002.iloc[:, 16].astype('category')

In [20]:
grid_20021 = grid_2002.dropna()

In [23]:
grid_20021

Unnamed: 0,Column,POINT_X,POINT_Y,Depth,dem,slope,aspect,hillshade,twi,mrvbf,clay,silt,sand,BD,pH,lc,tmax,tmin,prcp
0,1,387009.77612,2.677053e+06,5,189,0.000057,6.283185,0.785358,11.223489,2.568818,10.0,35.0,55.0,0.680,4.8,9.0,10.702740,0.556164,753.0
1,2,385009.77612,2.676553e+06,5,195,0.000036,4.321398,0.785384,10.409254,1.721937,8.0,34.0,58.0,0.643,4.7,9.0,10.704109,0.517808,762.0
2,3,386009.77612,2.676553e+06,5,188,0.023566,0.785398,0.785676,6.614168,3.018307,9.0,31.0,60.0,0.722,4.7,9.0,10.695890,0.510959,764.0
3,4,386509.77612,2.676553e+06,5,185,0.000047,4.712389,0.785365,11.618745,3.698948,8.0,28.0,64.0,0.742,4.9,15.0,10.695890,0.510959,764.0
4,5,383509.77612,2.676053e+06,5,192,0.019766,0.001685,0.771545,7.118727,1.829659,9.0,35.0,56.0,0.685,4.7,9.0,10.756165,0.646575,750.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711557,711558,333509.77612,2.426053e+06,15,226,0.000036,4.324070,0.785384,13.558636,7.956529,5.0,17.0,78.0,1.166,5.4,8.0,13.769863,2.810959,1083.0
711558,711559,334009.77612,2.426053e+06,15,226,0.000036,4.321398,0.785384,9.873368,7.960195,6.0,23.0,71.0,1.133,6.2,9.0,13.757534,2.789041,1090.0
711559,711560,334509.77612,2.426053e+06,15,224,0.000123,5.890779,0.785285,9.639392,7.952762,7.0,16.0,77.0,1.104,5.3,8.0,13.757534,2.789041,1090.0
711560,711561,335009.77612,2.426053e+06,15,226,0.000047,4.712389,0.785365,12.717940,7.962896,8.0,23.0,69.0,1.072,5.2,9.0,13.767123,2.793151,1105.0


In [21]:
grid_20021.columns

Index(['Column', 'POINT_X', 'POINT_Y', 'Depth', 'dem', 'slope', 'aspect',
       'hillshade', 'twi', 'mrvbf', 'clay', 'silt', 'sand', 'BD', 'pH', 'lc',
       'tmax', 'tmin', 'prcp'],
      dtype='object')

In [22]:
new=rf_model.predict(grid_20021[features])
new

array([5.6262144 , 4.71009272, 4.5481641 , ..., 1.54036715, 2.36567833,
       2.8841848 ])

In [29]:
grid_20021[features][:1]

Unnamed: 0,Depth,tmax,tmin,prcp,lc,clay,silt,sand,dem,slope,aspect,hillshade,twi,mrvbf
0,5,10.70274,0.556164,753.0,9.0,10.0,35.0,55.0,189,5.7e-05,6.283185,0.785358,11.223489,2.568818


In [None]:
exmp = {'Depth': 5,
                                                     'tmax': 10.702739716,
                                                     'tmin': 0.5561643839,
                                                     'prcp': 753.0,
                                                     'lc': 9.0,
                                                     'clay': 10.0,
                                                     'silt': 35.0,
                                                     'sand': 55.0,
                                                     'dem': 189,
                                                     'slope': 5.69661e-05,
                                                     'aspect': 6.283185482,
                                                     'hillshade': 0.7853578925,
                                                     'twi': 11.223488808,
                                                     'mrvbf': 2.5688176155}

In [32]:

#exmp
features = ['Depth', 'tmax', 'tmin', 'prcp', 'lc', 'clay', 'silt', 'sand', 'dem', 'slope', 'aspect', 'hillshade', 'twi', 'mrvbf']
dictionary = {key: [value] for key, value in class_model.dict().items()}
df = DataFrame(dictionary)
df = df[features]
df = df.dropna()

5.626214404736811

In [30]:
selected_features = grid_20021[features].iloc[:1].to_dict(orient='records')[0]
selected_features

{'Depth': 5,
 'tmax': 10.702739716,
 'tmin': 0.5561643839,
 'prcp': 753.0,
 'lc': 9.0,
 'clay': 10.0,
 'silt': 35.0,
 'sand': 55.0,
 'dem': 189,
 'slope': 5.69661e-05,
 'aspect': 6.283185482,
 'hillshade': 0.7853578925,
 'twi': 11.223488808,
 'mrvbf': 2.5688176155}

In [25]:
grid_20021[features].dtypes

Depth           int64
tmax         category
tmin          float64
prcp          float64
lc            float64
clay          float64
silt          float64
sand          float64
dem             int64
slope         float64
aspect        float64
hillshade     float64
twi           float64
mrvbf         float64
dtype: object