In [None]:
import os
import config
from data_loader import *
from model import *
import tensorflow as tf

# Show where the kernel is running so relative paths can be sanity-checked.
print(f"Current working directory: {os.getcwd()}")

# The dataset location is taken from the project's config module.
file_path = config.DATA_PATH
print(f"Config file path: {file_path}")

# Fail fast, with an explicit message, when the dataset file is absent.
if not os.path.isfile(file_path):
    raise FileNotFoundError(
        f"Error: The dataset file was not found at {file_path}"
    )

# load_data returns a pair, bound here to X_raw and y.
X_raw, y = load_data(file_path)

# Report the shapes, then preview the first rows of each object.
print(f"Data Loaded: X shape = {X_raw.shape}, y shape = {y.shape}")
print("\nTop 5 rows of X_raw:", X_raw.head(), sep="\n")
print("\nTop 5 rows of y:", y.head(), sep="\n")


In [None]:
#!pip install tensorflow xgboost optuna --user

In [None]:
# EDA: print whatever generate_summary_statistics returns for the raw
# features and the target (helper comes from the project star-imports).
print(generate_summary_statistics(X_raw, y))

In [None]:
# Run the missing-data helper on X_raw with strategy='mean'.
# NOTE(review): the return value is only displayed, never assigned. Confirm
# that handle_missing_data modifies X_raw in place; otherwise its result is
# discarded and later cells still see the unmodified X_raw.
handle_missing_data(X_raw, strategy='mean')

In [None]:
# First preprocess_data pass, on the raw (not yet outlier-filtered) features.
# NOTE: X_scaled is reassigned from X_cleaned in a later cell; until then,
# cells that read X_scaled (e.g. the heatmap) use this raw-based version.
X_scaled = preprocess_data(X_raw)
print(f"Preprocessed Data Shape: {X_scaled.shape}")

In [None]:
# Box plots of the raw features via the project plotting helper; the
# observations drawn from the figure are written up below this cell.
plot_boxplots(X_raw)


Boxplot of Spectral Reflectance Features:

There are outliers in the spectral reflectance values, especially at lower and higher wavelengths. These need to be flagged or removed to avoid skewing the model.

In [None]:
# Plot spectral reflectance for the raw features via the project helper;
# the observations drawn from the figure are written up below this cell.
plot_spectral_reflectance(X_raw)

Key observations from the visualizations. Line plot (average reflectance over wavelengths): the wavelengths are sampled at intervals of 10.

Reflectance increases sharply at lower wavelengths, stabilizes in the middle range, and slightly declines at higher wavelengths.

This trend suggests that certain wavelength ranges are more reflective and might be more predictive.

In [None]:
# Heatmap of the preprocessed features via the project helper.
# NOTE: at this point X_scaled is the version computed from X_raw, i.e.
# before outlier removal.
plot_sample_heatmap(X_scaled)

Strong correlations exist between adjacent wavelength bands, indicating redundancy in features. Dimensionality reduction techniques like PCA can help reduce this redundancy without losing essential information.

In [None]:
# Detect outliers in the raw features. The helper returns a count and the
# matching indices (z-score based, judging by the helper's name; confirm).
outlier_count, outlier_indices = detect_outlier_zscore(X_raw)
print(f"Number of outliers: {outlier_count}")
# The label promises ten indices, so the slice must take exactly ten.
print(f"First 10 outlier indices: {outlier_indices[:10]}")

In [None]:
# Remove outliers from the raw features and the matching targets.
# NOTE(review): the outlier_indices computed in the previous cell are not
# passed in, so remove_outliers presumably runs its own detection. Confirm
# both cells apply the same rule.
X_cleaned, y_cleaned = remove_outliers(X_raw, y)

In [None]:
# Display the outlier-filtered features as the cell output for a visual check.
X_cleaned

In [None]:
# Display the outlier-filtered targets as the cell output for a visual check.
y_cleaned

In [None]:
# Run the project's sensor-drift check on the outlier-filtered features;
# whatever it returns is shown as the cell output.
detect_sensor_drift(X_cleaned)

In [None]:
# PCA-based drift check (per the helper's name) on the cleaned features and
# targets; the resulting figure is interpreted in the notes below this cell.
detect_sensor_drift_PCA(X_cleaned,y_cleaned)

The points are spread fairly evenly, so there is no clear overall shift in any one direction. However, the checks did find signs of sensor drift or inconsistencies, as described below.

Color Distribution:

Dark Blue Points: Represent the majority of data, indicating normal sensor readings.

Yellow, Orange, and Red Points: These are likely outliers or drifted points, suggesting that the sensor readings in these regions deviate from the normal.

These anomalous points are concentrated in the lower region (PC2 ~ -10 to -15) and on the right side (PC1 ~ 20 to 40), suggesting that certain sensors may have drifted over time.

In [None]:
# Build spectral indices from the features via the project helper.
# NOTE(review): this runs on X_raw rather than X_cleaned, and the return
# value is not assigned. The modelling cells below work from X_cleaned, so
# unless the helper modifies shared data in place the indices never reach
# the model. Confirm the intent.
create_spectral_indices(X_raw)

In [None]:
# Re-run preprocess_data on the outlier-filtered features. This overwrites
# the X_scaled computed earlier from X_raw; the split and cross-validation
# cells below use this version.
X_scaled = preprocess_data(X_cleaned)
print(f"Preprocessed Data Shape: {X_scaled.shape}")

In [None]:
# Split the preprocessed features and cleaned targets into train/test parts.
# NOTE(review): preprocess_data ran on the full cleaned set before this
# split. If it fits scaling statistics, the test part influences them
# (data leakage). Confirm, and if so fit the scaler on the training part only.
X_train,X_test,y_train,y_test = split_data(X_scaled,y_cleaned)

In [None]:
# Inspect the dimensions of the training features.
X_train.shape

In [None]:
# Inspect the dimensions of the test features.
X_test.shape

In [None]:
# Construct the network; the second dimension of X_train is passed as its
# input size.
# NOTE(review): no random seed is set anywhere in this notebook, so results
# from this cell may differ between runs.
model = build_simple_nn(X_train.shape[1])

# Attach optimizer, loss and metric before any training happens.
model.compile(
    loss="mse",
    metrics=["mae"],
    optimizer="adam",
)

# Hand the compiled model and the training split to the project's trainer.
trained_model = train_model(model, X_train, y_train)



In [None]:
from sklearn.metrics import mean_squared_error

# XGBoost model configured with n_estimators=200 and learning_rate=0.05.
xgb_model = build_xgboost(learning_rate=0.05, n_estimators=200)

# Fit on the training split, then predict the held-out split.
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# Score the held-out predictions with mean squared error.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


In [None]:
# Build an XGBoost model for cross-validation.
# NOTE: rebinding xgb_model replaces the model fitted in the previous cell.
xgb_model = build_xgboost(learning_rate=0.05, n_estimators=200)

# Cross-validate with the project's cross_validate helper (n_splits=5); the
# result is indexed by 'mean_mae' and 'std_mae' below.
# NOTE(review): X_scaled was preprocessed on the full cleaned set before any
# fold is formed. Confirm this does not leak fold statistics.
cv_results = cross_validate(
    xgb_model,
    X_scaled,
    y_cleaned,
    n_splits=5,
)

# Report the aggregated MAE figures.
print(f"Mean MAE: {cv_results['mean_mae']}")
print(f"Standard Deviation of MAE: {cv_results['std_mae']}")


In [None]:
# Local import keeps this cell self-contained; scikit-learn is already a
# dependency of this notebook.
from sklearn.model_selection import train_test_split

# Carve a validation split out of the training data. The test split must not
# be used for hyperparameter search or as validation data during fitting;
# otherwise the final "Test MAE" is measured on data the model selection has
# already seen and is optimistically biased.
X_tune, X_val, y_tune, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Hyperparameter search, scored on the validation split.
# Assumes the 3rd/4th positional arguments of optimize_nn_hyperparams are the
# evaluation set used to score each trial. TODO confirm against the helper.
best_params = optimize_nn_hyperparams(
    X_tune,
    y_tune,
    X_val,
    y_val,
    n_trials=100  # Increase number of trials for more comprehensive search
)

# Build the final model from the best parameters found.
# NOTE(review): an earlier cell calls model.compile(...) explicitly after
# build_simple_nn, but no compile call is made here. This relies on
# build_simple_nn returning a compiled model when given best_params, and on
# every key in best_params being an accepted keyword. Confirm; otherwise
# fit() below will raise.
final_model = build_simple_nn(
    input_shape=X_train.shape[1],
    **best_params
)

# Fit on the tuning split and monitor the validation split.
final_model.fit(
    X_tune,
    y_tune,
    epochs=50,
    validation_data=(X_val, y_val),
    verbose=1
)

# The test split is touched exactly once, for the final unbiased estimate.
test_loss, test_mae = final_model.evaluate(X_test, y_test)
print(f"Test MAE: {test_mae}")

In [None]:
# Re-print the test MAE computed by the previous cell (requires that cell to
# have been run in this kernel session).
print(f"Test MAE: {test_mae}")

In [1]:
#pip install shap --user

Collecting shap
  Using cached shap-0.44.1-cp38-cp38-win_amd64.whl (450 kB)
Collecting packaging>20.9
  Using cached packaging-24.2-py3-none-any.whl (65 kB)
Installing collected packages: packaging, shap
Successfully installed packaging-24.2 shap-0.44.1
Note: you may need to restart the kernel to use updated packages.
