In [None]:
# Mount Google Drive inside the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')
import os
# Switch to the project folder so the relative paths used later in the notebook
# (the CSV that is read and the model file that is written) resolve against it.
os.chdir("/content/drive/My Drive/Colab_Notebooks/hyperpectral_imaging")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load cleaned, reduced data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def load_data(file):
  """Read a CSV and split it into a feature matrix and a target vector.

  The last column of the file is taken as the target; every column before it
  is taken as a feature.

  Args:
    file: Anything ``pd.read_csv`` accepts (a path or a file-like object).

  Returns:
    A tuple ``(X, y)`` of NumPy arrays, where ``X`` holds all columns except
    the last (2-D) and ``y`` holds the last column (1-D).
  """
  table = pd.read_csv(file)
  # Positional slicing: everything but the final column vs. the final column.
  feature_part, target_part = table.iloc[:, :-1], table.iloc[:, -1]
  return feature_part.to_numpy(), target_part.to_numpy()


In [None]:
# Load the features (X) and target (y) from the cleaned CSV; the path is
# relative to the current working directory.
X, y = load_data('cleaned_data.csv')

In [None]:
# Sanity-check the dimensions of the loaded feature matrix and target vector.
print(f"shape of X: {X.shape}")
print(f"shape of y: {y.shape}")

shape of X: (500, 24)
shape of y: (500,)


In [None]:
# Peek at the first sample: print its target value, then show its feature
# vector (the bare last expression is rendered as the cell's output).
print(y[0])
X[0]

1100.0


array([-14.77258148,   2.34254264,   1.91988117,  -2.24911151,
         0.41172253,   0.04153874,  -0.46065693,   0.20963427,
         0.45399747,  -0.25166671,  -0.69969988,   0.2886935 ,
         0.15687378,   0.69153361,   0.19789234,  -0.09844169,
         0.31722809,  -0.21598171,  -0.41855185,   0.35375963,
         0.02160687,  -0.464635  ,  -0.20249281,  -0.45660422])

# Split data 80/20 into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Confirm the resulting split sizes.
print(f"shape of X_train: {X_train.shape}\t\t shape of y_train: {y_train.shape}")
print(f"shape of X_test: {X_test.shape}\t\t shape of y_test: {y_test.shape}")

shape of X_train: (400, 24)		 shape of y_train: (400,)
shape of X_test: (100, 24)		 shape of y_test: (100,)


## Evaluate model function

In [None]:
def evaluate_model(model, X_val, y_val):
  """Print the mean squared error and R2 score of ``model`` on the given data.

  Args:
    model: Fitted estimator exposing ``predict(X)``.
    X_val: Feature matrix to predict on.
    y_val: True target values aligned with the rows of ``X_val``.

  Returns:
    None. The two metrics are only printed, on one line separated by a tab.
  """
  # Imported locally so this helper works as soon as it is defined, instead of
  # relying on a later notebook cell having imported the metric functions.
  from sklearn.metrics import mean_squared_error, r2_score

  y_pred = model.predict(X_val)
  mse = mean_squared_error(y_val, y_pred)
  r2 = r2_score(y_val, y_pred)
  print(f"MSE: {mse}\tR2: {r2}")

# Training model

## Regularized linear regression model

Poor performance: R² is low on both the train and test sets.

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score,root_mean_squared_error

# Linear baseline with a small L2 penalty; fit() hands back the fitted estimator.
ridge = Ridge(alpha=0.01).fit(X_train, y_train)

print("For Redge Regression")
# Report the same metrics on the training split, then on the held-out split.
for heading, features, targets in (
    ("Train data", X_train, y_train),
    ("\nValidation data", X_test, y_test),
):
    print(heading)
    evaluate_model(ridge, features, targets)

For Redge Regression
Train data
MSE: 83153722.66091126	R2: 0.4215340232846435

Validation data
MSE: 133547586.02270013	R2: 0.5222477058640986


## Random forest estimator
Overfits the data; trying XGBoost next to improve performance.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest configuration: 100 trees with depth / split / leaf limits,
# a fixed seed, and all available cores.
rf_settings = {
    "n_estimators": 100,
    "max_depth": 10,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestRegressor(**rf_settings)

# Fit on the training split
rf.fit(X_train, y_train)

# Same metrics on the training split, then on the held-out split.
for banner, features, targets in (
    ("For RandomForest Regression on train data", X_train, y_train),
    ("For RandomForest Regression on validation data", X_test, y_test),
):
    print(banner)
    evaluate_model(rf, features, targets)

For RandomForest Regression on train data
MSE: 23796518.178005867	R2: 0.8344574880141145
For RandomForest Regression on validation data
MSE: 77421246.27914758	R2: 0.7230337205912789


## Extreme Gradient Boosting algorithm
Used to capture more complex structure in the data; it performs well but overfits.


In [None]:
from xgboost import XGBRegressor

# Baseline XGBoost regressor: 100 boosting rounds, learning rate 0.1, fixed seed.
xgb_settings = {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42}
xgb = XGBRegressor(**xgb_settings)
xgb.fit(X_train, y_train)

# Evaluate on the training split first, then on the held-out split.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    evaluate_model(xgb, features, targets)

MSE: 37926.16284985285	R2: 0.9997361634075546
MSE: 82253073.77972138	R2: 0.705748371286083


## Tuning XGBoost hyperparameters using randomized search
to find the best parameters and reduce variance

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyperparameter; the randomized search samples
# combinations from these lists.
param_grid = {
    "n_estimators": [100, 200, 300, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 7],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "reg_lambda": [5, 7, 10]
}

# Initialize XGBoost model
# NOTE: this rebinds `xgb`, replacing the baseline model fitted earlier.
xgb = XGBRegressor(random_state=42)

# Perform Randomized Search with 3-fold cross-validation, scored by R2.
# - n_iter=10 is the scikit-learn default, written out so the search budget
#   is visible.
# - random_state pins which parameter combinations are sampled, so the same
#   candidates are evaluated on every run.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=10,
    scoring="r2",
    cv=3,
    verbose=True,
    n_jobs=-1,
    random_state=42
)

# Fit on training data only; the test split is not seen during tuning.
random_search.fit(X_train, y_train)

# Get best parameters
best_params = random_search.best_params_
print("Best Parameters:", best_params)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'reg_lambda': 10, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 1.0}


In [None]:
# Build a fresh regressor from the hyperparameters chosen by the search and
# fit it on the training split.
best_xgb = XGBRegressor(random_state=42, **best_params)
best_xgb.fit(X_train, y_train)

# Same metrics on the training split, then on the held-out test split.
for caption, features, targets in (
    ("for train", X_train, y_train),
    ("for test", X_test, y_test),
):
    print(caption)
    evaluate_model(best_xgb, features, targets)

for train
MSE: 23463729.37589594	R2: 0.8367725617509509
for test
MSE: 38391483.47012218	R2: 0.8626585485415383


# Conclusion:
As the dataset is small (500 examples), modelling started with a linear model.

*   Neural networks were not used because the dataset is small, which could lead to severe overfitting (high variance).
*   To increase performance, a random forest was tried, but it overfits.
*   Extreme gradient boosted trees worked well after tuning the hyperparameters (variance was reduced while keeping good performance).



# Saving xgb model

In [None]:
import pickle
# Save the trained model
# Serialises the tuned regressor to xgb_model.pkl in the current working directory.
# NOTE(review): a pickle is generally tied to the library versions that wrote
# it — confirm the loading environment matches, or consider XGBoost's own
# save_model for a more portable file.
with open("xgb_model.pkl", "wb") as f:
    pickle.dump(best_xgb, f)