In [18]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, make_scorer, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

In [19]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [29]:
df = pd.DataFrame(X, columns=iris.feature_names)
df['species'] = iris.target

# Map target integers back to original species name so that we don't see digits but species name
df['species'] = df['species'].map(dict(enumerate(iris.target_names)))

# Display basic info and first few rows
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   species            150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  s

In [30]:
# Step 2: Check for and handle missing values
if df.isnull().values.any():
    # Impute missing values with the mean (no missing values in Iris, but good practice)
    df.fillna(df.mean(), inplace=True)

# Step 3: Scale features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(df[iris.feature_names])
df_scaled = pd.DataFrame(features_scaled, columns=iris.feature_names)

# Step 4: Encode categorical variables
encoder = LabelEncoder()
df['species_encoded'] = encoder.fit_transform(df['species'])

# Show final scaled and encoded DataFrame
print(df_scaled.head())
print(df[['species', 'species_encoded']].drop_duplicates())


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0          -0.900681          1.019004          -1.340227         -1.315444
1          -1.143017         -0.131979          -1.340227         -1.315444
2          -1.385353          0.328414          -1.397064         -1.315444
3          -1.506521          0.098217          -1.283389         -1.315444
4          -1.021849          1.249201          -1.340227         -1.315444
        species  species_encoded
0        setosa                0
50   versicolor                1
100   virginica                2


In [34]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a pipeline to include scaling and the classifier
pipeline = make_pipeline(StandardScaler(), SVC(random_state=42))

# Define the parameter grid to search
param_grid = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__kernel': ['linear', 'rbf', 'poly'],
    'svc__gamma': ['scale', 'auto'],
    'svc__degree': [2, 3, 4]  # Only used by 'poly' kernel
}


In [33]:
# Define the scoring functions
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1_score': make_scorer(f1_score, average='weighted')
}

# Create the GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=scoring, refit='f1_score', return_train_score=True, verbose=1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [35]:

# Extract and print detailed evaluation results from GridSearchCV
print("Summary of model evaluations:")
results = grid_search.cv_results_
for mean_accuracy, mean_precision, mean_recall, mean_f1, params in zip(
        results['mean_test_accuracy'], results['mean_test_precision'], results['mean_test_recall'], results['mean_test_f1_score'], results['params']):
    print(f"Evaluating SVM with kernel={params['svc__kernel']}, C={params['svc__C']}, gamma={params['svc__gamma']}")
    print(f"Accuracy: {mean_accuracy:.2f}")
    print(f"Precision: {mean_precision:.2f}")
    print(f"Recall: {mean_recall:.2f}")
    print(f"F1 Score: {mean_f1:.2f}\n")

Summary of model evaluations:
Evaluating SVM with kernel=linear, C=0.1, gamma=scale
Accuracy: 0.96
Precision: 0.96
Recall: 0.96
F1 Score: 0.96

Evaluating SVM with kernel=rbf, C=0.1, gamma=scale
Accuracy: 0.87
Precision: 0.88
Recall: 0.87
F1 Score: 0.86

Evaluating SVM with kernel=poly, C=0.1, gamma=scale
Accuracy: 0.68
Precision: 0.72
Recall: 0.68
F1 Score: 0.65

Evaluating SVM with kernel=linear, C=0.1, gamma=auto
Accuracy: 0.96
Precision: 0.96
Recall: 0.96
F1 Score: 0.96

Evaluating SVM with kernel=rbf, C=0.1, gamma=auto
Accuracy: 0.87
Precision: 0.88
Recall: 0.87
F1 Score: 0.86

Evaluating SVM with kernel=poly, C=0.1, gamma=auto
Accuracy: 0.68
Precision: 0.72
Recall: 0.68
F1 Score: 0.65

Evaluating SVM with kernel=linear, C=0.1, gamma=scale
Accuracy: 0.96
Precision: 0.96
Recall: 0.96
F1 Score: 0.96

Evaluating SVM with kernel=rbf, C=0.1, gamma=scale
Accuracy: 0.87
Precision: 0.88
Recall: 0.87
F1 Score: 0.86

Evaluating SVM with kernel=poly, C=0.1, gamma=scale
Accuracy: 0.83
Precisi