In [1]:
import pandas as pd
import numpy as np
import shap
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset
file_path = "Updated_CubeSat_Sensor_Data.csv"
df = pd.read_csv(file_path)

# Encode categorical variables
label_encoders = {}
for col in ["Temp Threshold (°C)", "Pressure Threshold (Pa)", "Humidity Threshold (%RH)", "Condition"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for later use

# Split data into features and target variable
X = df.drop(columns=["Condition"])
y = df["Condition"]

# Normalize Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)


In [3]:
# Train a basic Random Forest model for feature importance
rf_temp = RandomForestClassifier(n_estimators=100, random_state=42)
rf_temp.fit(X_train, y_train)

# Compute SHAP values
explainer = shap.TreeExplainer(rf_temp)
shap_values = explainer.shap_values(X_train)

# Convert SHAP output to 2D (Fix dimension issue)
shap_importances = np.abs(shap_values).mean(axis=0)

# Select the top 10 most important features
top_features = np.argsort(shap_importances)[-10:]

# Reduce the dataset to only these features
X_train_shap = X_train[:, top_features]
X_test_shap = X_test[:, top_features]


In [6]:
import pandas as pd
import numpy as np
import shap
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# 1️⃣ Load dataset
file_path = "Updated_CubeSat_Sensor_Data.csv"
df = pd.read_csv(file_path)

# 2️⃣ Encode categorical variables
label_encoders = {}
for col in ["Temp Threshold (°C)", "Pressure Threshold (Pa)", "Humidity Threshold (%RH)", "Condition"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for later use

# 3️⃣ Split data into features and target
X = df.drop(columns=["Condition"])
y = df["Condition"]

# 4️⃣ Normalize Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5️⃣ Apply SMOTE for class balance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# 6️⃣ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# 7️⃣ Feature Selection using SHAP
rf_temp = RandomForestClassifier(n_estimators=100, random_state=42)
rf_temp.fit(X_train, y_train)

explainer = shap.TreeExplainer(rf_temp)
shap_values = explainer.shap_values(X_train)

# 🔥 FIX: Ensure SHAP output is 2D
if isinstance(shap_values, list):  # If multi-class, shap_values is a list
    shap_values = np.mean(np.abs(shap_values), axis=0)  # Convert to (n_samples, n_features)
else:
    shap_values = np.abs(shap_values)  # Already 2D for binary classification

# Compute feature importance
shap_importances = shap_values.mean(axis=0)

# Select the top 10 most important features
top_features = np.argsort(shap_importances)[-10:]

# Convert to NumPy array to avoid shape issues
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Reduce dataset to selected features
X_train_shap = X_train_np[:, top_features]
X_test_shap = X_test_np[:, top_features]

print("Selected Features Shape:", X_train_shap.shape)  # Ensure correct shape

# 8️⃣ Hyperparameter Optimization with Optuna
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 200, 500)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        class_weight='balanced',
        bootstrap=True,
        oob_score=True,
        random_state=42
    )
    
    # Train and test with correct data shape
    model.fit(X_train_shap, y_train)
    y_pred = model.predict(X_test_shap)
    return accuracy_score(y_test, y_pred)

# Run Bayesian Optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# Train the best model
best_rf_model = RandomForestClassifier(**study.best_params, class_weight='balanced', bootstrap=True, oob_score=True, random_state=42)
best_rf_model.fit(X_train_shap, y_train)
y_pred_rf = best_rf_model.predict(X_test_shap)

# Print Final Accuracy
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("🔥 Optimized Random Forest Accuracy:", rf_accuracy)


[I 2025-03-30 13:52:08,960] A new study created in memory with name: no-name-27a3f89c-3fcd-4e66-bff3-a232ccf4cc6e
[W 2025-03-30 13:52:08,971] Trial 0 failed with parameters: {'n_estimators': 433, 'max_depth': 27, 'min_samples_split': 8, 'min_samples_leaf': 4} because of the following error: ValueError('Found array with dim 3. RandomForestClassifier expected <= 2.').
Traceback (most recent call last):
  File "c:\Users\surya\AppData\Local\Programs\Python\Python312\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\surya\AppData\Local\Temp\ipykernel_12664\1280749889.py", line 85, in objective
    model.fit(X_train_shap, y_train)
  File "c:\Users\surya\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\surya\AppData\Local\Prog

Selected Features Shape: (443, 9, 2)


ValueError: Found array with dim 3. RandomForestClassifier expected <= 2.