In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Load the crop-recommendation dataset (per the file name, already normalized).
# The path is relative to the notebook's working directory.
df = pd.read_csv("data/normalized-data-crop-recommendation.csv")

# Replace the crop names in "label" with integer class ids. The fitted encoder
# is kept so ids can be mapped back with label_encoder.inverse_transform.
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["label"])

# Select the target by name rather than by position: the encoding step above
# already addresses the column as "label", and positional slicing would leak
# the target into the features if it were not the last column of the CSV.
x = df.drop(columns="label")  # every column except the target (features)
y = df["label"]               # encoded target

# First cut: hold back 10% of the rows as an "unseen" set; the other 90% is
# the "seen" pool. Stratifying on y keeps class proportions equal on both sides.
x_seen, x_unseen, y_seen, y_unseen = train_test_split(
    x,
    y,
    test_size=0.10,
    stratify=y,
    random_state=42,
)

# Second cut: divide the seen pool into 80% training / 20% testing, again
# stratified on the label and with the same fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_seen,
    y_seen,
    test_size=0.20,
    stratify=y_seen,
    random_state=42,
)

# Decision tree classifier with a fixed seed so runs are repeatable.
dtc_model = DecisionTreeClassifier(random_state=42)

# Estimate accuracy with shuffled, stratified 10-fold cross-validation,
# using the training split only.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    estimator=dtc_model,
    X=x_train,
    y=y_train,
    scoring="accuracy",
    cv=cv,
)

# Report the accuracy averaged over the ten folds.
print(f"10-Fold Cross-Validation Accuracy: {np.mean(cv_scores):.2f}")

# Fit on the whole training split, then predict the test split.
# fit() returns the estimator itself, so the two calls can be chained.
y_pred = dtc_model.fit(x_train, y_train).predict(x_test)
y_proba = dtc_model.predict_proba(x_test)  # per-class probabilities, needed for ROC-AUC

# Evaluation metrics on the test split. Precision and recall are averaged
# across classes weighted by support; ROC-AUC uses one-vs-rest.
weighted = {"average": "weighted"}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr")
conf_matrix = confusion_matrix(y_test, y_pred)

# Show the confusion matrix first, then the scalar scores to two decimals.
print("\nConfusion Matrix:\n", conf_matrix)
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"ROC-AUC Score: {roc_auc:.2f}")

# # Save Model (Optional)
# import joblib
# joblib.dump(dtc_model, "decision_tree_model.pkl")


10-Fold Cross-Validation Accuracy: 0.98

Confusion Matrix:
 [[18  0  0  0  0  0  0  0  0  0]
 [ 0 18  0  0  0  0  0  0  0  0]
 [ 0  0 18  0  0  0  0  0  0  0]
 [ 0  0  0 16  0  0  0  0  2  0]
 [ 0  0  0  0 18  0  0  0  0  0]
 [ 0  0  0  0  0 18  0  0  0  0]
 [ 0  0  0  0  0  0 18  0  0  0]
 [ 0  0  0  0  0  0  0 18  0  0]
 [ 0  0  0  1  0  0  0  0 17  0]
 [ 0  0  0  0  0  0  0  0  0 18]]
Accuracy: 0.98
Precision: 0.98
Recall: 0.98
ROC-AUC Score: 0.99
