In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, classification_report


In [2]:
## Load the dataset
csv_path = '/kaggle/input/flower-dataset/flower_dataset.csv'
df = pd.read_csv(csv_path)

# Preview the first rows, then report the (n_rows, n_columns) shape
print(df.head(), df.shape, sep='\n')

           species    size fragrance  height_cm
0             rose  medium      mild      48.55
1  shoeblack plant  medium      mild     147.07
2  shoeblack plant  medium      none     102.93
3         hibiscus   large      none     184.00
4  shoeblack plant   large      mild      83.07
(10000, 4)


In [3]:
# Encode the categorical features.
# - handle_unknown='ignore': a category that was not seen while fitting is
#   encoded as all zeros at transform time instead of raising an error.
# - sparse_threshold=0: always return a dense array, so later steps that
#   cannot handle sparse input (e.g. a mean-centering StandardScaler) keep
#   working regardless of how many one-hot columns are produced.
# Columns not listed below are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['size', 'fragrance'])
    ],
    remainder='passthrough',
    sparse_threshold=0
)

# Encode the target variable as integer class codes.
# NOTE: this overwrites the string labels stored in df, so run this cell only
# once per freshly loaded df; a second run would refit the encoder on the
# integer codes and the species names could no longer be recovered from it.
label_encoder = LabelEncoder()
df['species'] = label_encoder.fit_transform(df['species'])


In [4]:
# separate the target column (y) from the predictor columns (X)
y = df['species']
X = df.drop(columns='species')

# hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# define the models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=200, random_state=42),
    'Support Vector Machine': SVC(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# Original species names in encoded order, plus their integer codes.
# Both are identical for every model, so they are computed once here rather
# than on each loop iteration.
target_names = label_encoder.classes_
class_labels = label_encoder.transform(target_names)

# evaluate each model
for model_name, model in models.items():
    # Build a pipeline with preprocessing, scaling and the model
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])

    # Train on the training split, then score on the held-out test split
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'\nModel: {model_name}')
    print(f'Accuracy: {accuracy * 100:.2f}%')

    # Passing labels explicitly keeps target_names aligned with the classes
    # even if a class is absent from y_test / y_pred; without it,
    # classification_report raises on the resulting length mismatch.
    print('Classification Report:')
    print(classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=target_names,
    ))


Model: Random Forest
Accuracy: 87.65%
Classification Report:
                 precision    recall  f1-score   support

       hibiscus       0.88      0.91      0.89       647
           rose       0.92      0.92      0.92       654
shoeblack plant       0.84      0.80      0.82       699

       accuracy                           0.88      2000
      macro avg       0.88      0.88      0.88      2000
   weighted avg       0.88      0.88      0.88      2000


Model: Logistic Regression
Accuracy: 90.75%
Classification Report:
                 precision    recall  f1-score   support

       hibiscus       0.87      1.00      0.93       647
           rose       0.93      0.93      0.93       654
shoeblack plant       0.92      0.80      0.86       699

       accuracy                           0.91      2000
      macro avg       0.91      0.91      0.91      2000
   weighted avg       0.91      0.91      0.91      2000


Model: Support Vector Machine
Accuracy: 89.80%
Classification Rep