In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Step 1: load the Iris data and hold out 20% of it for the final evaluation.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 2: chain standardisation -> PCA -> SVM into a single estimator so the
# preprocessing is re-fitted on the training part of every CV split.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("classifier", SVC()),
])

# Step 3: hyper-parameter candidates, keyed as "<step name>__<parameter>".
param_grid = dict(
    pca__n_components=[2, 3],
    classifier__C=[0.1, 1, 10],
    classifier__kernel=["linear", "rbf"],
)

# Step 4: exhaustive search over the grid (no `cv` given, so scikit-learn's
# default cross-validation splitter is used).
grid = GridSearchCV(estimator=pipe, param_grid=param_grid)
grid.fit(X_train, y_train)

# Step 5: report the winning combination, its mean CV score, and the score
# of the refitted best pipeline on the held-out test split.
best_cv_score = grid.best_score_
held_out_score = grid.score(X_test, y_test)
print("Best parameters found:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(best_cv_score))
print("Test set score: {:.2f}".format(held_out_score))


Best parameters found: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best cross-validation score: 0.96
Test set score: 1.00


Check 3-fold, 5-fold and 7-fold cross-validation.
Replace the SVC classifier with RandomForestClassifier, LogisticRegression, Perceptron and KNeighborsClassifier (KNN).
Update the param_grid accordingly (e.g., for RandomForestClassifier, use n_estimators, max_depth, etc.).
Also replace GridSearchCV with RandomizedSearchCV.

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier

# Seed shared by every stochastic component so that "Restart & Run All"
# reproduces the same best parameters and scores (same value as the
# train/test split seed used earlier in the notebook).
RANDOM_STATE = 42

# Candidate classifiers mapped to (estimator, parameter grid).
# Grid keys follow the "<pipeline step>__<parameter>" convention so that
# GridSearchCV can route each value to the matching step of the pipeline
# built in the loop below.
classifiers = {
    "RandomForest": (
        # Bootstrap and feature sampling are random -> seed for reproducibility.
        RandomForestClassifier(random_state=RANDOM_STATE),
        {
            'pca__n_components': [2, 3],
            'classifier__n_estimators': [50, 100, 200],      # Number of trees
            'classifier__max_depth': [None, 5, 10],          # Max depth of each tree
            'classifier__min_samples_split': [2, 5],         # Min samples to split a node
        }
    ),
    "LogisticRegression": (
        LogisticRegression(max_iter=1000),
        {
            'pca__n_components': [2, 3],
            'classifier__C': [0.1, 1, 10],                  # Inverse regularization strength
            'classifier__penalty': ['l2'],                  # Regularization type
            'classifier__solver': ['lbfgs']                 # Solver
        }
    ),
    "Perceptron": (
        # Training data is shuffled between epochs -> seed for reproducibility.
        Perceptron(random_state=RANDOM_STATE),
        {
            'pca__n_components': [2, 3],
            'classifier__penalty': [None, 'l2', 'l1', 'elasticnet'],  # Regularization
            'classifier__alpha': [0.0001, 0.001, 0.01]                # Regularization strength
        }
    ),
    "KNN": (
        KNeighborsClassifier(),
        {
            'pca__n_components': [2, 3],
            'classifier__n_neighbors': [3, 5, 7],           # Number of neighbors
            'classifier__weights': ['uniform', 'distance'], # Weight function
            'classifier__p': [1, 2]                        # Power parameter for Minkowski metric (1=manhattan, 2=euclidean)
        }
    )
}

# Run the same scale -> PCA -> classify pipeline for every candidate and
# report its best grid point, mean 5-fold CV score, and held-out test score.
for name, (clf, param_grid) in classifiers.items():
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        # Seeded so the result stays fixed even if PCA selects a randomized solver.
        ('pca', PCA(random_state=RANDOM_STATE)),
        ('classifier', clf)
    ])
    grid = GridSearchCV(pipe, param_grid, cv=5)
    grid.fit(X_train, y_train)
    print(f"\n{name} Results:")
    print("Best parameters found:", grid.best_params_)
    print("Best cross-validation score: {:.2f}".format(grid.best_score_))
    print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))


RandomForest Results:
Best parameters found: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200, 'pca__n_components': 3}
Best cross-validation score: 0.94
Test set score: 1.00

LogisticRegression Results:
Best parameters found: {'classifier__C': 1, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs', 'pca__n_components': 3}
Best cross-validation score: 0.96
Test set score: 1.00

Perceptron Results:
Best parameters found: {'classifier__alpha': 0.0001, 'classifier__penalty': 'l1', 'pca__n_components': 3}
Best cross-validation score: 0.94
Test set score: 0.90

KNN Results:
Best parameters found: {'classifier__n_neighbors': 3, 'classifier__p': 2, 'classifier__weights': 'uniform', 'pca__n_components': 3}
Best cross-validation score: 0.96
Test set score: 1.00


# Replace with your own CSV dataset using the code below:

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import files
import io

# File read when nothing is uploaded (e.g. the picker was cancelled but the
# file is already present in the working directory from an earlier upload).
DEFAULT_CSV = "pd_speech_features.csv"

# Open Colab's file picker; the result maps each uploaded file name to its
# raw bytes.
uploaded = files.upload()

# Read the data from the bytes that were just uploaded instead of a fixed
# path: this works for a CSV with any name and never picks up a stale copy
# left on disk (Colab stores a re-uploaded file under a new name such as
# "name (1).csv"). The default file is preferred when it is among the uploads.
csv_name = DEFAULT_CSV if DEFAULT_CSV in uploaded else next(iter(uploaded), None)
if csv_name is None:
    data = pd.read_csv(DEFAULT_CSV)
else:
    data = pd.read_csv(io.BytesIO(uploaded[csv_name]))

# Features are every column except the row identifier and the label.
# NOTE: the CSV must contain 'id' and 'class' columns; adapt these names when
# using a different dataset.
X = data.drop(['id', 'class'], axis=1)
y = data['class']

# Hold out 20% of the rows for the final test score (fixed seed for a
# repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Saving pd_speech_features.csv to pd_speech_features.csv


In [5]:
# Seed shared by every stochastic component so that re-running this cell
# reproduces the same best parameters and scores.
RANDOM_STATE = 42

# Candidate classifiers mapped to (estimator, parameter grid).
# Grid keys follow the "<pipeline step>__<parameter>" convention so that
# GridSearchCV can route each value to the matching step of the pipeline
# built in the loop below.
classifiers = {
    "RandomForest": (
        # Bootstrap and feature sampling are random -> seed for reproducibility.
        RandomForestClassifier(random_state=RANDOM_STATE),
        {
            'pca__n_components': [2, 3],
            'classifier__n_estimators': [50, 100, 200],      # Number of trees
            'classifier__max_depth': [None, 5, 10],          # Max depth of each tree
            'classifier__min_samples_split': [2, 5],         # Min samples to split a node
        }
    ),
    "LogisticRegression": (
        LogisticRegression(max_iter=1000),
        {
            'pca__n_components': [2, 3],
            'classifier__C': [0.1, 1, 10],                   # Inverse regularization strength
            'classifier__penalty': ['l2'],                   # Regularization type
            'classifier__solver': ['lbfgs']                  # Solver
        }
    ),
    "Perceptron": (
        # Training data is shuffled between epochs -> seed for reproducibility.
        Perceptron(random_state=RANDOM_STATE),
        {
            'pca__n_components': [2, 3],
            'classifier__penalty': [None, 'l2', 'l1', 'elasticnet'],  # Regularization
            'classifier__alpha': [0.0001, 0.001, 0.01]                # Regularization strength
        }
    ),
    "KNN": (
        KNeighborsClassifier(),
        {
            'pca__n_components': [2, 3],
            'classifier__n_neighbors': [3, 5, 7],            # Number of neighbors
            'classifier__weights': ['uniform', 'distance'],  # Weight function
            'classifier__p': [1, 2]                          # Minkowski power (1=manhattan, 2=euclidean)
        }
    )
}

# Run the same scale -> PCA -> classify pipeline for every candidate on the
# current X_train / y_train and report its best grid point, mean 5-fold CV
# score, and held-out test score.
for name, (clf, param_grid) in classifiers.items():
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        # On a wide feature table PCA's default 'auto' solver may pick the
        # randomized SVD, whose output depends on the seed -> fix it.
        ('pca', PCA(random_state=RANDOM_STATE)),
        ('classifier', clf)
    ])
    grid = GridSearchCV(pipe, param_grid, cv=5)
    grid.fit(X_train, y_train)
    print(f"\n{name} Results:")
    print("Best parameters found:", grid.best_params_)
    print("Best cross-validation score: {:.2f}".format(grid.best_score_))
    print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))


RandomForest Results:
Best parameters found: {'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100, 'pca__n_components': 3}
Best cross-validation score: 0.80
Test set score: 0.81

LogisticRegression Results:
Best parameters found: {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs', 'pca__n_components': 3}
Best cross-validation score: 0.80
Test set score: 0.80

Perceptron Results:
Best parameters found: {'classifier__alpha': 0.0001, 'classifier__penalty': 'l1', 'pca__n_components': 2}
Best cross-validation score: 0.76
Test set score: 0.73

KNN Results:
Best parameters found: {'classifier__n_neighbors': 7, 'classifier__p': 1, 'classifier__weights': 'uniform', 'pca__n_components': 3}
Best cross-validation score: 0.79
Test set score: 0.82
