In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the Dataset
# Read the training CSV into a DataFrame; the path is resolved relative to
# the current working directory.
file_path = "soil_sensor_training_template.csv"  # Update with your actual path
df = pd.read_csv(file_path)
# A bare `df.head()` that is not the last expression of a cell (or that runs
# in a plain script) is evaluated and thrown away, so print the preview.
print(df.head())

# Step 2: Basic Cleaning
# Remove the 'Sample ID' and 'Notes' columns when they exist
# (errors='ignore' skips absent ones), then discard every row that still
# contains a missing value in any remaining column.
df_clean = (
    df.drop(columns=['Sample ID', 'Notes'], errors='ignore')
      .dropna()  # Optional: Drop rows with missing data
)

# Step 3: Encode Nutrient Labels (only if needed for classification)
# Map each distinct value of the 'Nutrients' column to an integer code and
# store the codes in a new 'Nutrient_Label' column. The fitted encoder is
# kept so the codes can be mapped back to the original labels.
label_encoder = LabelEncoder()
nutrient_codes = label_encoder.fit_transform(df_clean['Nutrients'])
df_clean['Nutrient_Label'] = nutrient_codes

# Step 4: Select Features and Targets
# The eight numbered channel columns are named "F<n> (<wavelength>nm)";
# the 'Clear' and 'NIR' columns are appended after them.
channel_wavelengths_nm = (415, 445, 480, 515, 555, 590, 630, 680)
feature_cols = [
    f'F{number} ({wavelength}nm)'
    for number, wavelength in enumerate(channel_wavelengths_nm, start=1)
]
feature_cols += ['Clear', 'NIR']

# Feature matrix plus one target per task: 'ppm' for regression and the
# integer-encoded nutrient label for classification.
X = df_clean.loc[:, feature_cols]
y_regression = df_clean.loc[:, 'ppm']
y_classification = df_clean.loc[:, 'Nutrient_Label']

# Step 5: Split the Data (for Regression example)
# Split the *unscaled* features first. Fitting the scaler before splitting
# would let the test rows influence the mean/std applied to the training
# rows (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_regression, test_size=0.2, random_state=42
)

# Step 6: Feature Scaling
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the test rows and to the full feature matrix.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full matrix in the original row order, scaled with the training statistics.
X_scaled = scaler.transform(X)

# Step 7: Train a Simple Random Forest Regressor
# 100 trees with a fixed seed so repeated runs build the same forest.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Score the model on the held-out rows, which were otherwise never used.
# `score` returns the coefficient of determination (R^2) for a regressor.
print(f"Test R^2: {rf.score(X_test, y_test):.3f}")

# Step 8: Feature Importance Visualization
# Horizontal bar chart with one bar per feature column, in the same order
# as `feature_cols`; bar length is the forest's importance for that column.
importances = rf.feature_importances_
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=feature_cols, ax=ax)
ax.set_title("Feature Importance from Random Forest")
ax.set_xlabel("Importance")
ax.set_ylabel("Spectral Channels")
fig.tight_layout()
plt.show()

# Step 9: Save Preprocessed Data (Optional)
# Write the scaled feature matrix and the two target columns to separate
# CSV files. Neither file includes the index, so rows in the two files
# correspond by position.
scaled_frame = pd.DataFrame(data=X_scaled, columns=feature_cols)
scaled_frame.to_csv("scaled_features.csv", index=False)

target_frame = df_clean.loc[:, ['ppm', 'Nutrient_Label']]
target_frame.to_csv("targets.csv", index=False)

# Notes: