PREPROCESSING

In [1]:
import pandas as pd

def preprocess_data(df):
    # Check for missing values
    if df.isnull().values.any():
        # Calculate mean for numeric columns only
        numeric_cols = df.select_dtypes(include='number').columns
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    
    # Change non-numeric values to numeric values
    df = df.apply(pd.to_numeric, errors='coerce')
    
    return df

# Example usage:
# Load your dataset into a pandas DataFrame
df = pd.read_csv('values.csv')

# Preprocess the data
preprocessed_df = preprocess_data(df)
print (preprocessed_df)


         A(410)       B(435)       C(460)       D(485)       E(510)   F(535)  \
0   2429.580000  1108.990000  2091.930000   737.330000   835.140000  1376.82   
1   1707.900000   725.380000  1974.490000   670.810000   835.930000  1685.28   
2   1750.410000  1102.020000  2481.050000   816.800000   960.650000  1741.43   
3   2546.680000  1548.400000  2791.560000  1007.160000  1128.000000  1573.73   
4   3090.540000  1266.420000  2933.880000   940.610000  1109.840000  2047.64   
..          ...          ...          ...          ...          ...      ...   
95  2247.430000   833.990000  2195.430000   523.900000   786.990000  1074.36   
96  2374.070000   989.420000  2382.530000   567.000000   918.030000  1189.65   
97  2318.550000   821.030000  2096.900000   507.260000   781.470000  1070.61   
98  3289.170000  1234.540000  3234.430000   873.160000  1112.210000  1742.18   
99  2163.039798  1015.779354  2471.640101   744.820303   963.049697      NaN   

        G(560)      H(585)       R(610)

PLSR AND SVMR WITH PCA FOR PREPROCESSING 

In [17]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


data = pd.read_csv("preprocessed_data.csv")  


X = data[['A(410)', 'B(435)', 'C(460)', 'D(485)', 'E(510)', 'F(535)', 'G(560)',
               'H(585)', 'R(610)', 'I(645)', 'S(680)', 'J(705)', 'U(760)',
               'V(810)', 'W(860)', 'K(900)', 'L(940)', 'T(730)']]  
y = data[['P   (kg/ha)', 'K (kg/ha)', 'Ca (meq/100g)', 'Mg (meq/100g)',
             'S (ppm)', 'Fe (ppm)', 'Mn (ppm)', 'Cu (ppm)', 'Zn (ppm)', 'B (ppm)']]  

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values
imputer = SimpleImputer(strategy='mean')  # You can change the strategy if needed
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Apply PCA
pca = PCA(n_components=10)  # Adjust the number of components as needed
X_train_pca = pca.fit_transform(X_train_imputed)
X_test_pca = pca.transform(X_test_imputed)

# Partial Least Squares Regression (PLSR) with PCA
pls_model = PLSRegression(n_components=2)
pls_model.fit(X_train_pca, y_train)
y_pred_pls = pls_model.predict(X_test_pca)
# mse_pls = mean_squared_error(y_test, y_pred_pls)
# print("PLSR with PCA Mean Squared Error:", mse_pls)

# # Support Vector Machine Regression (SVMR) with PCA
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train_pca)
# X_test_scaled = scaler.transform(X_test_pca)
# svr_model = SVR(kernel='rbf')
# svr_model.fit(X_train_scaled, y_train)
# y_pred_svr = svr_model.predict(X_test_scaled)
# mse_svr = mean_squared_error(y_test, y_pred_svr)
# print("SVMR with PCA Mean Squared Error:", mse_svr)

from sklearn.multioutput import MultiOutputRegressor

# Support Vector Machine Regression (SVMR) with PCA and multiple target variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_pca)
X_test_scaled = scaler.transform(X_test_pca)

# Initialize SVR
svr_model = SVR(kernel='rbf')

# Wrap SVR in MultiOutputRegressor
multioutput_svr = MultiOutputRegressor(svr_model)

# Fit the multi-output SVR model
multioutput_svr.fit(X_train_scaled, y_train)

# Predict for the test set
y_pred_svr = multioutput_svr.predict(X_test_scaled)

# Calculate Mean Squared Error
# mse_svr = mean_squared_error(y_test, y_pred_svr)
# print("SVMR with PCA Mean Squared Error:", mse_svr)




In [16]:
import pandas as pd

# Load your dataset
data = pd.read_csv("preprocessed_data.csv")  # Replace with the path to your dataset file

# Print the column names to check for mismatches
print(data.columns)



X = data[['A(410)', 'B(435)', 'C(460)', 'D(485)', 'E(510)', 'F(535)', 'G(560)',
          'H(585)', 'R(610)', 'I(645)', 'S(680)', 'J(705)', 'U(760)',
          'V(810)', 'W(860)', 'K(900)', 'L(940)', 'T(730)']]

# Update these column names to match your dataset
y = data[['P   (kg/ha)','K (kg/ha)','Ca (meq/100g)','Mg (meq/100g)','S (ppm)','Fe (ppm)','Mn (ppm)','Cu (ppm)','Zn (ppm)','B (ppm)']]

# Proceed with the rest of the code as before
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Apply PCA
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_imputed)
X_test_pca = pca.transform(X_test_imputed)

# Partial Least Squares Regression (PLSR) with PCA
pls_model = PLSRegression(n_components=2)
pls_model.fit(X_train_pca, y_train)
y_pred_pls = pls_model.predict(X_test_pca)
# mse_pls = mean_squared_error(y_test, y_pred_pls)
# print("PLSR with PCA Mean Squared Error:", mse_pls)

# Support Vector Machine Regression (SVMR) with PCA and multiple target variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_pca)
X_test_scaled = scaler.transform(X_test_pca)

svr_model = SVR(kernel='rbf')
multioutput_svr = MultiOutputRegressor(svr_model)
multioutput_svr.fit(X_train_scaled, y_train)
y_pred_svr = multioutput_svr.predict(X_test_scaled)
# mse_svr = mean_squared_error(y_test, y_pred_svr)
# print("SVMR with PCA Mean Squared Error:", mse_svr)

# Function to predict nutrient values based on user input wavelengths
def predict_nutrients(wavelengths):
    # Convert the input list to a DataFrame
    user_data = pd.DataFrame([wavelengths], columns=X.columns)
    
    # Handle missing values
    user_data_imputed = imputer.transform(user_data)
    
    # Apply PCA
    user_data_pca = pca.transform(user_data_imputed)
    
    # Scale the data
    user_data_scaled = scaler.transform(user_data_pca)
    
    # Predict using the trained PLS model
    pls_prediction = pls_model.predict(user_data_pca)
    
    # Predict using the trained SVM model
    svr_prediction = multioutput_svr.predict(user_data_scaled)
    
    return pls_prediction, svr_prediction


def get_user_wavelengths():
    wavelengths = []
    for i in range(18):
        while True:
            try:
                value = float(input(f"Enter value for wavelength {i+1}: "))
                wavelengths.append(value)
                break
            except ValueError:
                print("Invalid input. Please enter a numerical value.")
    return wavelengths

# Get user input
user_wavelengths = get_user_wavelengths()

# Predict nutrient values
pls_pred, svr_pred = predict_nutrients(user_wavelengths)

print("PLSR Prediction:", pls_pred)
print("SVMR Prediction:", svr_pred)


Index(['A(410)', 'B(435)', 'C(460)', 'D(485)', 'E(510)', 'F(535)', 'G(560)',
       'H(585)', 'R(610)', 'I(645)', 'S(680)', 'J(705)', 'U(760)', 'V(810)',
       'W(860)', 'K(900)', 'L(940)', 'T(730)', 'pH', 'EC  (dS/m)', 'OC (%)',
       'P   (kg/ha)', 'K (kg/ha)', 'Ca (meq/100g)', 'Mg (meq/100g)', 'S (ppm)',
       'Fe (ppm)', 'Mn (ppm)', 'Cu (ppm)', 'Zn (ppm)', 'B (ppm)'],
      dtype='object')
PLSR with PCA Mean Squared Error: 4519.7858676866035
SVMR with PCA Mean Squared Error: 5958.694726061934
PLSR Prediction: [[-4.34041863e+01 -3.29043354e+02  2.44734749e+01 -2.35618863e-01
   1.55654687e+01  9.48891971e+01  5.01862280e+01  1.46025755e+01
  -1.21114539e+00  1.21095800e+00]]
SVMR Prediction: [[ 22.89136681 251.09161838   7.68852505   2.76025689  14.51893539
   12.46953651  13.75793649   1.79836452   1.13876407   0.56297738]]


PLS-ANN-PLS for dimensionality reduction followed by ANN for regression

In [13]:
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor

# Load your dataset
data = pd.read_csv("preprocessed_data.csv")  # Replace with the path to your dataset file

# Print the column names to check for mismatches
print(data.columns)

# Update these column names to match the actual column names in your dataset
feature_columns = ['A(410)', 'B(435)', 'C(460)', 'D(485)', 'E(510)', 'F(535)', 'G(560)',
                   'H(585)', 'R(610)', 'I(645)', 'S(680)', 'J(705)', 'U(760)',
                   'V(810)', 'W(860)', 'K(900)', 'L(940)', 'T(730)']

target_columns = ['K (kg/ha)', 'Ca (meq/100g)', 'Mg (meq/100g)',
                  'S (ppm)', 'Fe (ppm)', 'Mn (ppm)', 'Cu (ppm)', 'Zn (ppm)', 'B (ppm)']

# Ensure these columns exist in your dataset
missing_features = [col for col in feature_columns if col not in data.columns]
missing_targets = [col for col in target_columns if col not in data.columns]

if missing_features:
    print(f"Feature columns missing from dataset: {missing_features}")

if missing_targets:
    print(f"Target columns missing from dataset: {missing_targets}")

# Separate features (X) and target variables (y)
X = data[feature_columns]
y = data[target_columns]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Apply PLS for dimensionality reduction
pls = PLSRegression(n_components=10)
X_train_pls = pls.fit_transform(X_train_imputed, y_train)[0]
X_test_pls = pls.transform(X_test_imputed)

# Train Multilayer Perceptron (MLP) model for multiple target variables
mlp = MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=500, random_state=42)

# Wrap MLP in MultiOutputRegressor to handle multiple outputs
multioutput_mlp = MultiOutputRegressor(mlp)

# Fit the MLP model
multioutput_mlp.fit(X_train_pls, y_train)

# Predict for the test set
y_pred_mlp = multioutput_mlp.predict(X_test_pls)

# Calculate Mean Squared Error
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
print("PLS + MLP Mean Squared Error:", mse_mlp)

# Function to predict nutrient values based on user input wavelengths
def predict_nutrients(wavelengths):
    # Convert the input list to a DataFrame
    user_data = pd.DataFrame([wavelengths], columns=feature_columns)
    
    # Handle missing values
    user_data_imputed = imputer.transform(user_data)
    
    # Apply PLS
    user_data_pls = pls.transform(user_data_imputed)
    
    # Predict using the trained MLP model
    mlp_prediction = multioutput_mlp.predict(user_data_pls)
    
    return mlp_prediction

# Function to get user input for wavelengths
def get_user_wavelengths():
    wavelengths = []
    for i in range(18):
        while True:
            try:
                value = float(input(f"Enter value for wavelength {i+1}: "))
                wavelengths.append(value)
                break
            except ValueError:
                print("Invalid input. Please enter a numerical value.")
    return wavelengths

# Get user input
user_wavelengths = get_user_wavelengths()

# Predict nutrient values
mlp_pred = predict_nutrients(user_wavelengths)

# Print the predictions with labels
nutrients = ['K (kg/ha)', 'Ca (meq/100g)', 'Mg (meq/100g)', 'S (ppm)', 'Fe (ppm)', 'Mn (ppm)', 'Cu (ppm)', 'Zn (ppm)', 'B (ppm)']

print("\nMLP Prediction:")
for nutrient, value in zip(nutrients, mlp_pred[0]):
    print(f"{nutrient}: {value:.2f}")


Index(['A(410)', 'B(435)', 'C(460)', 'D(485)', 'E(510)', 'F(535)', 'G(560)',
       'H(585)', 'R(610)', 'I(645)', 'S(680)', 'J(705)', 'U(760)', 'V(810)',
       'W(860)', 'K(900)', 'L(940)', 'T(730)', 'pH', 'EC  (dS/m)', 'OC (%)',
       'P   (kg/ha)', 'K (kg/ha)', 'Ca (meq/100g)', 'Mg (meq/100g)', 'S (ppm)',
       'Fe (ppm)', 'Mn (ppm)', 'Cu (ppm)', 'Zn (ppm)', 'B (ppm)'],
      dtype='object')




PLS + MLP Mean Squared Error: 5637.375657240275

MLP Prediction:
K (kg/ha): 25495.59
Ca (meq/100g): 10449.76
Mg (meq/100g): -4496.72
S (ppm): -268.63
Fe (ppm): 80163.98
Mn (ppm): 32659.99
Cu (ppm): 804.57
Zn (ppm): -791.55
B (ppm): -41.22


GBRT-GRADIENT BOOSTING TREE REGRESSION

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

# Load your dataset
data = pd.read_csv("preprocessed_data.csv")  # Replace with the path to your dataset file

# Define feature and target columns (make sure these columns exist in your dataset)
feature_columns = ['A(410)', 'B(435)', 'C(460)', 'D(485)', 'E(510)', 'F(535)', 'G(560)',
                   'H(585)', 'R(610)', 'I(645)', 'S(680)', 'J(705)', 'U(760)',
                   'V(810)', 'W(860)', 'K(900)', 'L(940)', 'T(730)']
target_columns = ['K (kg/ha)', 'Ca (meq/100g)', 'Mg (meq/100g)',
                  'S (ppm)', 'Fe (ppm)', 'Mn (ppm)', 'Cu (ppm)', 'Zn (ppm)', 'B (ppm)']

# Check if all required columns are present in the dataset
missing_features = [col for col in feature_columns if col not in data.columns]
missing_targets = [col for col in target_columns if col not in data.columns]

if missing_features:
    raise ValueError(f"Missing feature columns: {missing_features}")

if missing_targets:
    raise ValueError(f"Missing target columns: {missing_targets}")

# Separate features (X) and target variables (y)
X = data[feature_columns]
y = data[target_columns]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Train Gradient Boosting Regression Trees (GBRT) model for multiple target variables
gbrt = GradientBoostingRegressor(random_state=42)

# Wrap GBRT in MultiOutputRegressor to handle multiple outputs
multioutput_gbrt = MultiOutputRegressor(gbrt)

# Fit the GBRT model
multioutput_gbrt.fit(X_train_scaled, y_train)

# Predict for the test set
y_pred_gbrt = multioutput_gbrt.predict(X_test_scaled)

# Calculate Mean Squared Error
mse_gbrt = mean_squared_error(y_test, y_pred_gbrt)
print("GBRT Mean Squared Error:", mse_gbrt)

# Function to predict nutrient values based on user input wavelengths
def predict_nutrients(wavelengths):
    # Convert the input list to a DataFrame
    user_data = pd.DataFrame([wavelengths], columns=feature_columns)
    
    # Handle missing values
    user_data_imputed = imputer.transform(user_data)
    
    # Scale the data
    user_data_scaled = scaler.transform(user_data_imputed)
    
    # Predict using the trained GBRT model
    gbrt_prediction = multioutput_gbrt.predict(user_data_scaled)
    
    return gbrt_prediction

# Function to get user input for wavelengths
def get_user_wavelengths():
    wavelengths = []
    for i in range(len(feature_columns)):
        while True:
            try:
                value = float(input(f"Enter value for {feature_columns[i]}: "))
                wavelengths.append(value)
                break
            except ValueError:
                print("Invalid input. Please enter a numerical value.")
    return wavelengths

# Get user input
user_wavelengths = get_user_wavelengths()

# Predict nutrient values
gbrt_pred = predict_nutrients(user_wavelengths)

# Print the predictions with labels
print("\nGBRT Prediction:")
for nutrient, value in zip(target_columns, gbrt_pred[0]):
    print(f"{nutrient}: {value:.2f}")


GBRT Mean Squared Error: 7165.527231836269

GBRT Prediction:
K (kg/ha): 536.31
Ca (meq/100g): 34.26
Mg (meq/100g): 3.67
S (ppm): 45.51
Fe (ppm): 71.07
Mn (ppm): 37.16
Cu (ppm): 6.44
Zn (ppm): 1.97
B (ppm): 0.72


In [8]:
# Calculate the mean of the target values as a simple baseline
baseline_prediction = y_train.mean(axis=0)

# Predict the baseline for the test set
baseline_predictions = [baseline_prediction] * len(y_test)


baseline_mse = mean_squared_error(y_test, baseline_predictions)
print("Baseline Mean Squared Error:", baseline_mse)


print("GBRT Mean Squared Error:", mse_gbrt)


Baseline Mean Squared Error: 5409.085520762519
GBRT Mean Squared Error: 7165.527231836269
