# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

def plsRegression(X, Y, random_state=None, n_components=100,
                  scale_values=(0.1, 0.01, 0.001)):
    """Fit a PLS regression on all of the data and score it on noisy resamples.

    The model is trained once on the full ``X``/``Y``. Then, for every noise
    level in ``scale_values``, 30% of the rows are drawn at random without
    replacement, zero-mean Gaussian noise with that standard deviation is
    added to their features, and the training/test errors are printed
    together with a true-vs-predicted scatter plot.

    The "test" rows are perturbed copies of rows the model was trained on,
    so the reported test metrics describe robustness to feature noise, not
    generalization to unseen samples.

    Args:
        X: pandas DataFrame of input features, one row per sample.
        Y: pandas Series holding a single target, indexed like ``X``.
        random_state: Seed passed to ``np.random.seed``; ``None`` gives
            non-reproducible draws. The global NumPy RNG is reseeded.
        n_components: Requested number of PLS components. Values larger
            than ``min(n_samples, n_features)`` are capped to that bound
            (with a warning) because scikit-learn rejects them.
        scale_values: Iterable of noise standard deviations to evaluate.

    Returns:
        The R-squared score of the model on the noisy test set generated
        for the last entry of ``scale_values``.

    Raises:
        ValueError: If ``scale_values`` is empty.
    """
    import warnings

    noise_levels = list(scale_values)
    if not noise_levels:
        raise ValueError("scale_values must contain at least one noise level")

    # Seed the global legacy RNG; np.random.choice / np.random.normal below
    # draw from it, so a fixed seed reproduces the same test sets and noise.
    np.random.seed(random_state)

    # Use 100% of the data for training
    X_train = X
    Y_train = Y

    # PLSRegression only accepts n_components up to min(n_samples, n_features).
    max_components = min(X_train.shape[0], X_train.shape[1])
    if n_components > max_components:
        warnings.warn(
            f"n_components={n_components} exceeds the allowed maximum "
            f"{max_components}; using {max_components} instead."
        )
        n_components = max_components

    # The training data does not depend on the noise level, so fit once and
    # compute the training metrics once instead of repeating it per level.
    pls = PLSRegression(n_components=n_components)
    pls.fit(X_train, Y_train)

    # ravel() makes the result 1-D whether predict returns (n,) or (n, 1).
    preds_train = np.asarray(pls.predict(X_train)).ravel()
    true_y_train = Y_train.to_numpy().flatten()
    error_mse_train = mean_squared_error(true_y_train, preds_train)
    error_abs_train = mean_absolute_error(true_y_train, preds_train)

    n_test = int(0.3 * len(X))
    test_r2 = None
    for noise_std in noise_levels:
        # Draw the test rows without replacement so no row is picked twice,
        # then perturb their features with N(0, noise_std) noise of the same
        # shape as the selected feature block.
        test_indices = np.random.choice(X.index, size=n_test, replace=False)
        noise = np.random.normal(
            loc=0, scale=noise_std, size=(len(test_indices), X.shape[1])
        )
        X_test = X.loc[test_indices] + noise
        Y_test = Y.loc[test_indices]

        preds = np.asarray(pls.predict(X_test)).ravel()
        true_y = Y_test.to_numpy().flatten()
        error_mse = mean_squared_error(true_y, preds)
        error_abs = mean_absolute_error(true_y, preds)
        test_r2 = pls.score(X_test, Y_test)

        print("For STD : ", noise_std)
        print("\n Training set MSE: ", error_mse_train)
        print("\n Training set MAE: ", error_abs_train, "\n")

        print("\n Test set R-squared: ", test_r2)
        print("\n Test set MSE: ", error_mse)
        print("\n Test set MAE: ", error_abs, "\n")

        # Plot true vs predicted values; the dashed diagonal marks a
        # perfect prediction.
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.scatter(true_y, preds, color='blue', label='True vs Predicted')
        ax.plot([true_y.min(), true_y.max()], [true_y.min(), true_y.max()],
                color='red', linestyle='--', label='Ideal')
        ax.set_xlabel('True Values')
        ax.set_ylabel('Predicted Values')
        ax.legend()
        plt.show()

    return test_r2


# Read the measurements table from disk
water = pd.read_csv("dataset_for_tsm.csv")

# Features are every column whose label contains 'wvl_'; the target is the
# 'tsm' column.
is_wavelength_column = ['wvl_' in str(label) for label in water.columns]
X = water.loc[:, is_wavelength_column]
Y = water.loc[:, 'tsm']

# Run the PLS evaluation with a fixed seed and report the score it returns
# (the value is the R-squared from plsRegression's final test set).
accuracy = plsRegression(X, Y, random_state=42)
print("Accuracy: ", accuracy)
