In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("plant_disease_classification.csv")  # Replace with actual dataset

# Strip spaces from column names (to avoid hidden errors)
df.columns = df.columns.str.strip()

# Identify categorical columns
categorical_columns = df.select_dtypes(include=["object"]).columns  

# Apply Label Encoding to categorical columns
for col in categorical_columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# Split features and target variable
X = df.drop(columns=["disease"])  # Feature columns
y = df["disease"]  # Target column

# Handle class imbalance (check if some classes have very few samples)
print(y.value_counts())

# Stratified split to maintain class distribution
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "class_weight": [None, "balanced"]  # Helps handle imbalanced classes
}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train)

# Best model
best_clf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Predictions
y_pred = best_clf.predict(X_test)

# Feature Importance
importances = best_clf.feature_importances_
feature_names = X.columns
important_features = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)

print("\nTop Features:")
for feature, importance in important_features:
    print(f"{feature}: {importance:.4f}")

# Results
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

KeyError: "['disease'] not found in axis"

In [3]:
print(df.columns)  # Debugging step

Index(['Leaf_Color', 'Leaf_Spot_Size', 'Moisture_Level', 'Temperature',
       'Disease'],
      dtype='object')


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("plant_disease_classification.csv")  # Replace with actual dataset

# Strip spaces from column names
df.columns = df.columns.str.strip()

# Print column names to verify "disease" exists
print("Columns in dataset:", df.columns)

# Check if 'disease' is present
if "disease" not in df.columns:
    raise ValueError("The column 'disease' is missing! Please check the dataset.")

# Identify categorical columns
categorical_columns = df.select_dtypes(include=["object"]).columns  
print("Categorical columns:", categorical_columns)

# Apply Label Encoding to categorical columns
for col in categorical_columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# Split features and target variable
X = df.drop(columns=["disease"])  # Feature columns
y = df["disease"]  # Target column

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train Decision Tree with hyperparameter tuning
clf = DecisionTreeClassifier(max_depth=10, min_samples_split=5, class_weight="balanced", random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Results
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Columns in dataset: Index(['Leaf_Color', 'Leaf_Spot_Size', 'Moisture_Level', 'Temperature',
       'Disease'],
      dtype='object')


ValueError: The column 'disease' is missing! Please check the dataset.