In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.tree import export_text

In [20]:
# Load in the dataset. (Thanks Izzy for the process)

# The labelled source file is read first; its "class" column supplies the targets.
data_full = pd.read_csv('soybean_csv.csv')

# Read the cleaned feature table (everything but the labels), attach the
# labels as a "Class" column (pandas aligns the two on the row index), and
# keep only the columns from 'plant-growth_abnorm' onward, which drops the
# leading indexing column.
data = (
    pd.read_csv('cleaned_dummy_dataset.csv')
    .assign(Class=data_full['class'])
    .loc[:, 'plant-growth_abnorm':]
)
data.tail()

Unnamed: 0,plant-growth_abnorm,leafspots-halo_no-yellow-halos,leafspots-marg_w-s-marg,leafspot-size_gt-1/8,leaf-mild_absent,stem-cankers_absent,canker-lesion_dk-brown-blk,fruiting-bodies_absent,fruiting-bodies_present,external-decay_absent,int-discolor_brown,int-discolor_none,fruit-pods_diseased,fruit-pods_norm,fruit-spots_absent,fruit-spots_colored,mold-growth_absent,roots_rotted,Class
678,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,2-4-d-injury
679,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,herbicide-injury
680,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,herbicide-injury
681,True,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,herbicide-injury
682,True,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,herbicide-injury


In [21]:
# Separate the predictor columns from the label column.
features = data.drop('Class', axis=1)
target = data['Class']

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Seed the tree as well: scikit-learn permutes the candidate features at every
# split, so an unseeded tree can change the selected hyperparameters, the test
# accuracy and the printed rules from one run to the next.
clf = DecisionTreeClassifier(random_state=42)

# Define the hyperparameter grid for the grid search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search over the grid with 5-fold cross-validation on the training split.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# retrained on the whole of X_train with the winning parameters; fitting it a
# second time is unnecessary.
best_clf = grid_search.best_estimator_

# Score the selected tree on the held-out rows.
y_pred = best_clf.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy on Test Set: {accuracy:.2f}")

# Dump the fitted tree as readable if/else rules labelled with the column names.
tree_rules = export_text(best_clf, feature_names=list(features.columns))
print("Decision Tree Rules:\n", tree_rules)


Best Hyperparameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy on Test Set: 0.89
Decision Tree Rules:
 |--- leafspot-size_gt-1/8 <= 0.50
|   |--- canker-lesion_dk-brown-blk <= 0.50
|   |   |--- int-discolor_brown <= 0.50
|   |   |   |--- external-decay_absent <= 0.50
|   |   |   |   |--- fruit-pods_norm <= 0.50
|   |   |   |   |   |--- fruiting-bodies_absent <= 0.50
|   |   |   |   |   |   |--- plant-growth_abnorm <= 0.50
|   |   |   |   |   |   |   |--- leaf-mild_absent <= 0.50
|   |   |   |   |   |   |   |   |--- class: 2-4-d-injury
|   |   |   |   |   |   |   |--- leaf-mild_absent >  0.50
|   |   |   |   |   |   |   |   |--- class: anthracnose
|   |   |   |   |   |   |--- plant-growth_abnorm >  0.50
|   |   |   |   |   |   |   |--- mold-growth_absent <= 0.50
|   |   |   |   |   |   |   |   |--- class: herbicide-injury
|   |   |   |   |   |   |   |--- mold-growth_absent >  0.50
|   |   |   |   |   |   |   |   |--- fruiting-bodie