In [2]:
import pandas as pd
import numpy as np
import optuna

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

from xgboost import XGBClassifier

from database.query import fetch_all, load_csv


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the raw dataset and discard its "id" column in a single expression.
df = pd.read_csv("database/agriculture_dataset.csv").drop(columns="id")

TypeError: 'DataFrame' object is not callable

In [None]:
# Feature engineering
# Whole days elapsed since the earliest timestamp recorded for the same plant_id.
# read_csv leaves "timestamp" unparsed unless asked to parse dates, and an
# unparsed column supports neither the subtraction nor the .dt accessor used
# below, so convert it to datetime explicitly first.
timestamps = pd.to_datetime(df["timestamp"])
first_seen = timestamps.groupby(df["plant_id"]).transform("min")
df["days_since_planting"] = (timestamps - first_seen).dt.days

In [None]:
# Rolling statistics per plant over trailing windows of 3, 5 and 7 rows.
# The window is counted in rows, so this presumes the rows of each plant are
# already in chronological order -- TODO confirm against the dataset.
# 'chlorophyll_content' used to be listed twice, which recomputed (and
# overwrote with identical values) the same six columns; each sensor now
# appears exactly once, leaving the resulting columns unchanged.
rolling_cols = ['soil_moisture', 'chlorophyll_content', 'ambient_temperature', 'soil_temperature',
                'humidity', 'nitrogen_level', 'phosphorus_level', 'potassium_level',
                'electrochemical_signal', 'light_intensity']
for col in rolling_cols:
    # Group once per sensor and reuse the grouping for every window size.
    per_plant = df.groupby('plant_id')[col]
    for time_range in range(3, 8, 2):
        df[f'{col}_rolling_mean_{time_range}'] = per_plant.transform(
            lambda x, w=time_range: x.rolling(w, min_periods=1).mean()
        ).fillna(0)
        # With a single observation in the window the standard deviation is
        # NaN; fillna(0) maps that (and any other NaN) to 0.
        df[f'{col}_rolling_std_{time_range}'] = per_plant.transform(
            lambda x, w=time_range: x.rolling(w, min_periods=1).std()
        ).fillna(0)


In [None]:
# Row-to-row change of every sensor within each plant. The first row of a
# plant has no predecessor, so its delta is filled with 0.
# Each delta is computed from its own column; taking the diff of
# "soil_moisture" for every `col` would make all delta_* features identical.
for col in ['soil_moisture', 'ambient_temperature',
       'soil_temperature', 'humidity', 'light_intensity', 'soil_ph',
       'nitrogen_level', 'phosphorus_level', 'potassium_level', 'chlorophyll_content', 'electrochemical_signal']:
    df[f"delta_{col}"] = df.groupby("plant_id")[col].diff().fillna(0)

In [None]:
# Columns removed from the frame before modelling: the timestamp and the raw
# sensor readings listed here.
drop_col = [
    'timestamp',
    'soil_moisture',
    'ambient_temperature',
    'soil_temperature',
    'humidity',
    'light_intensity',
    'soil_ph',
    'chlorophyll_content',
    'electrochemical_signal',
]

In [None]:
# Remove every column named in drop_col (raises KeyError if one is missing).
df = df.drop(columns=drop_col)

In [None]:
# Encode target: map each plant_health_status value to an integer code and
# remember how many distinct classes there are.
le = LabelEncoder().fit(df["plant_health_status"])
df["plant_health_status"] = le.transform(df["plant_health_status"])
num_classes = len(le.classes_)

In [None]:
import seaborn as sns
# Correlate numeric columns only: on pandas >= 2.0 DataFrame.corr() raises if
# any column is non-numeric (plant_id may be one -- its dtype is not visible
# here). The trailing semicolon hides the Axes repr in the notebook output.
sns.heatmap(df.corr(numeric_only=True));

# Split

In [None]:
# Temporal hold-out: rows up to and including day 21 train the model, rows
# after day 21 are kept for testing.
train = df[df["days_since_planting"] <= 21]
test = df[df["days_since_planting"] > 21]

# plant_id is only needed as the grouping key for cross-validation, so capture
# it for the training rows and then remove it from both feature sets.
groups = train["plant_id"].values
train = train.drop(columns="plant_id")
test = test.drop(columns="plant_id")

# Separate the features from the encoded target.
trainX = train.drop(columns="plant_health_status")
trainY = train["plant_health_status"]
testX = test.drop(columns="plant_health_status")
testY = test["plant_health_status"]

# Objective

In [None]:
def xgb_objective(trial):
    """Optuna objective: grouped 8-fold cross-validated macro-F1 of XGBoost.

    Draws max_depth, learning_rate, n_estimators, subsample and
    colsample_bytree from ``trial``, fits one classifier per GroupKFold split
    of the notebook-level ``trainX`` / ``trainY`` (grouped by ``groups``) and
    scores it on the held-out fold.

    Returns:
        ``1 - mean(macro F1 over the folds)``, i.e. a value to be minimised.
    """
    # Values explored by the study (suggested in this fixed order).
    searched = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }
    # Settings held constant for every trial.
    fixed = {
        "device": "cuda",
        "tree_method": "hist",
        "objective": "multi:softmax",
        "num_class": num_classes,
        "eval_metric": "mlogloss",
        "n_jobs": -2,
        "random_state": 2503,
    }
    params = {**fixed, **searched}

    # GroupKFold keeps all rows sharing a group value in the same fold.
    splitter = GroupKFold(n_splits=8)
    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(trainX, trainY, groups=groups):
        # Single-step pipeline wrapping a freshly constructed classifier.
        pipe = Pipeline(steps=[("classifier", XGBClassifier(**params))])
        pipe.fit(trainX.iloc[fit_rows], trainY.iloc[fit_rows])

        predicted = pipe.predict(trainX.iloc[holdout_rows])
        fold_scores.append(
            f1_score(trainY.iloc[holdout_rows], predicted, average="macro")
        )

    return 1 - np.mean(fold_scores)

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# when the notebook is re-run from a fresh kernel. TPESampler is the sampler
# create_study uses by default for a single-objective study, so only the seed
# changes; 2503 matches the random_state used inside xgb_objective.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=2503),
)
study.optimize(xgb_objective, n_trials=100)

In [None]:
params = study.best_params
# params = {'max_depth': 5, 'learning_rate': 0.17069691050602884, 'n_estimators': 184, 'subsample': 0.8636025430269557, 'colsample_bytree': 0.5950374387244108}

# study.best_params only contains the values drawn through `trial`. The
# constants that xgb_objective applied to every trial (seed, tree method,
# objective, eval metric) must be re-applied here, otherwise the final model
# is not the configuration that was cross-validated. `device` is deliberately
# left at the library default so this cell also runs on a machine without a GPU.
final_params = {
    "tree_method": "hist",
    "objective": "multi:softmax",
    "num_class": num_classes,
    "eval_metric": "mlogloss",
    "n_jobs": -2,
    "random_state": 2503,
    **params,  # tuned values take precedence over the constants above
}
model = XGBClassifier(**final_params)

In [None]:
# Fit on the full training window, then predict the held-out later days.
# fit() returns the estimator itself, so the two calls can be chained.
y_pred = model.fit(trainX, trainY).predict(testX)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Macro-averaged metrics
f1 = f1_score(testY, y_pred, average="macro")
precision = precision_score(testY, y_pred, average="macro")
recall = recall_score(testY, y_pred, average="macro")

print("=== Overall Performance (Macro-Averaged) ===")
print(f"Macro F1-score : {f1:.4f}")
print(f"Macro Precision: {precision:.4f}")
print(f"Macro Recall   : {recall:.4f}")


# Confusion Matrix
# LabelEncoder assigns the codes 0..n_classes-1. Pin that full label set so
# the matrix always has one row/column per entry of le.classes_; otherwise a
# class that appears in neither testY nor y_pred would shrink the matrix and
# shift the tick labels onto the wrong classes.
class_ids = np.arange(len(le.classes_))
cm = confusion_matrix(testY, y_pred, labels=class_ids)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()


# Detailed Classification Report
# The same explicit label set keeps target_names aligned with the labels;
# without it the report raises when a class is absent from the test data.
# zero_division=0 reports 0 (without a warning) for such empty classes.
print("\n=== Classification Report (Per Class) ===")
print(classification_report(
    testY,
    y_pred,
    labels=class_ids,
    target_names=le.classes_,
    digits=4,
    zero_division=0,
))


In [None]:
# Rank the features by the fitted model's importance scores and keep the
# top_k highest-scoring ones.
importances = model.feature_importances_
top_k = 20
feat_imp = (
    pd.DataFrame({'Feature': trainX.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
    .head(top_k)
)

# Plot
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(feat_imp['Feature'], feat_imp['Importance'], edgecolor='black')

ax.invert_yaxis()  # highest importance on top
ax.set_xlabel("Feature Importance Score", fontsize=11)
ax.set_ylabel("Environmental Features", fontsize=11)
ax.set_title("Feature Importance Analysis", fontsize=12)

ax.grid(axis='x', linestyle='--', linewidth=0.5, alpha=0.7)
fig.tight_layout()
plt.show()
