In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("animal_classification.csv")

# Drop non-essential columns
if "name" in df.columns:
    df = df.drop(columns=["name"])

# Encode categorical variables
label_encoders = {}
for col in df.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for later use

# Check class distribution
print(df["class"].value_counts())

# Optionally, duplicate rare classes to balance the dataset
min_class_count = df["class"].value_counts().min()
if min_class_count == 1:
    print("Warning: Some classes have only one sample! Consider balancing the dataset.")

# Split features and target variable
X = df.drop(columns=["class"])
y = df["class"]

# Fix stratification issue by setting shuffle=True
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# Train Decision Tree model
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Results
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))

class
3    5
1    2
2    1
0    1
4    1
Name: count, dtype: int64
Accuracy: 0.0
Classification Report:
               precision    recall  f1-score   support

           1       0.00      1.00      0.00       0.0
           2       0.00      1.00      0.00       0.0
           3       1.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.33      0.67      0.00       2.0
weighted avg       1.00      0.00      0.00       2.0

