<a href="https://colab.research.google.com/github/Rosalyn-DSAI/BDA_ICP1_A/blob/main/ICP_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install scikit-learn with the %pip magic so the package lands in the
# environment of the running kernel rather than whatever `pip` is on PATH.
%pip install scikit-learn



In [None]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# 1. Load the Iris feature matrix and labels, then hold out 20% of the rows
#    as a test set (fixed seed so the split is repeatable).
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 2. Pipeline: standardise the features, project them with PCA, then classify
#    with a support-vector machine.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', SVC()),
])

# 3. Search space. Keys follow the "<step name>__<parameter>" convention so
#    each value list is routed to the matching pipeline step.
param_grid = dict(
    pca__n_components=[2, 3],
    classifier__C=[0.1, 1, 10],
    classifier__kernel=['linear', 'rbf'],
)

# 4. Exhaustive search over every combination (cv left at its default).
grid = GridSearchCV(estimator=pipe, param_grid=param_grid)
grid.fit(X_train, y_train)

# 5. Report the winning combination, its mean cross-validation score, and the
#    score of the refit best pipeline on the held-out test set.
print("Best parameters found:", grid.best_params_)
print(f"Best cross-validation score: {grid.best_score_:.2f}")
print(f"Test set score: {grid.score(X_test, y_test):.2f}")


# 6. Compare 3-, 5- and 7-fold cross-validation: rerun the same grid search
# once per fold count, handing the count to GridSearchCV through `cv`.
for cv in (3, 5, 7):
    # fit() returns the fitted search object, so construction and fitting chain.
    grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv).fit(X_train, y_train)
    print(f"{cv}-Fold CV")
    print(" Best parameters found:", grid.best_params_)
    print(f" Best CV accuracy:  {grid.best_score_:.2f}")
    print(f" Test-set accuracy: {grid.score(X_test, y_test):.2f}")
    print()

# 7. Replace the single SVC with four candidate classifiers: RandomForest,
# LogisticRegression, Perceptron and k-nearest neighbours.
# The search swaps each candidate into the pipeline's 'classifier' step.

from sklearn.ensemble       import RandomForestClassifier
from sklearn.linear_model   import LogisticRegression, Perceptron
from sklearn.neighbors      import KNeighborsClassifier

# Pipeline skeleton. The RandomForestClassifier below is only a placeholder:
# every sub-grid in param_grid overrides it through its own 'classifier' key.
pipe = Pipeline([
    ('scaler',     StandardScaler()),
    ('pca',        PCA()),
    ('classifier', RandomForestClassifier())
])


# 8. One sub-grid per classifier. Each sub-grid sets the 'classifier' step and
# lists only hyper-parameters that exist on that estimator.
param_grid = [
    # Random Forest (seeded so repeated runs build the same forests)
    {
        'pca__n_components':        [2, 3],
        'classifier':               [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth':    [None, 5, 10]
    },
    # Logistic Regression
    {
        'pca__n_components':   [2, 3],
        'classifier':          [LogisticRegression(max_iter=1000, solver='lbfgs')],
        'classifier__C':       [0.01, 0.1, 1, 10],
        'classifier__penalty': ['l2']
    },
    # Perceptron
    {
        'pca__n_components':    [2, 3],
        'classifier':           [Perceptron()],
        # Perceptron's `penalty` defaults to None, and `alpha` only scales the
        # regularisation term when a penalty is active. Without this entry the
        # three alpha values would be identical, wasted candidates.
        'classifier__penalty':  ['l2'],
        'classifier__alpha':    [1e-4, 1e-3, 1e-2],
        'classifier__max_iter': [1000, 2000]
    },
    # k-Nearest Neighbors
    {
        'pca__n_components':       [2, 3],
        'classifier':              [KNeighborsClassifier()],
        'classifier__n_neighbors': [3, 5, 7, 9],
        'classifier__weights':     ['uniform', 'distance']
    }
]


# 9. Swap the exhaustive grid search for a randomized search over the same
# multi-classifier search space.
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    pipe,
    param_grid,
    n_iter=10,        # number of parameter settings to sample
    cv=5,             # 5-fold cross-validation
    random_state=42,  # fixed seed so the sampled settings are repeatable
)

# Fit on the training split, then report the best setting found, its mean CV
# score, and the refit pipeline's score on the held-out test split.
random_search.fit(X_train, y_train)

print("Best parameters found:", random_search.best_params_)
print(f"Best CV score:        {random_search.best_score_:.2f}")
print(f"Test set score:       {random_search.score(X_test, y_test):.2f}")



Best parameters found: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best cross-validation score: 0.96
Test set score: 1.00
3-Fold CV
 Best parameters found: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
 Best CV accuracy:  0.97
 Test-set accuracy: 1.00

5-Fold CV
 Best parameters found: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
 Best CV accuracy:  0.96
 Test-set accuracy: 1.00

7-Fold CV
 Best parameters found: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
 Best CV accuracy:  0.97
 Test-set accuracy: 1.00

Best parameters found: {'pca__n_components': 3, 'classifier__weights': 'distance', 'classifier__n_neighbors': 7, 'classifier': KNeighborsClassifier()}
Best CV score:        0.96
Test set score:       1.00


In [None]:
# 10. Replace Iris with your own CSV dataset using the code below.
# Mount Google Drive so files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# Read the WDBC table from Drive and preview its first five rows.
wdbc_csv_path = '/content/drive/My Drive/breast+cancer+wisconsin+dataset/wdbc.csv'
wdbc_data = pd.read_csv(wdbc_csv_path)
wdbc_data.head(n=5)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Id,Diagnosis,MeanRadius,MeanTexture,MeanPerimeter,MeanArea,MeanSmoothness,MeanCompactness,MeanConcavity,MeanConcavePoints,...,RadiusWorst,TextureWorst,PerimeterWorst,AreaWorst,SmoothnessWorst,CompactnessWorst,ConcavityWorst,ConcavePointsWorst,SymmetryWorst,FractalDimensionWorst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing   import StandardScaler
from sklearn.decomposition    import PCA
from sklearn.pipeline        import Pipeline
from sklearn.svm             import SVC


# 1. Load the WDBC table. 'Diagnosis' is the target; the features are every
#    remaining column once 'Id' and 'Diagnosis' are dropped.
wdbc_data = pd.read_csv('/content/drive/My Drive/breast+cancer+wisconsin+dataset/wdbc.csv')
y = wdbc_data['Diagnosis']
X = wdbc_data.drop(columns=['Id', 'Diagnosis'])

# 2. 80/20 train/test split, stratified on the target so both parts keep the
#    same class proportions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 3. Pipeline: standardise -> PCA -> classifier. The PCA width is tuned by the
#    search, and the SVC step is re-specified by the grid's 'classifier' key.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', SVC()),
])

# 4. Baseline search space (SVC only).
param_grid = {
    'pca__n_components': [5, 10, 15],
    'classifier': [SVC()],
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
}

# 5. Exhaustive 5-fold search scored on accuracy, using all available cores.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# 6. Report the best setting, its mean CV accuracy, and held-out accuracy.
print("GridSearchCV (cv=5) Results:")
print(" Best parameters found:", grid.best_params_)
print(f" Best CV accuracy: {grid.best_score_:.3f}")
print(f" Test-set accuracy: {grid.score(X_test, y_test):.3f}")
print()


# 7. Compare 3-, 5- and 7-fold cross-validation on the same pipeline and grid.
for cv in [3, 5, 7]:
    # fit() returns the fitted search object, so construction and fitting chain.
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        scoring='accuracy',
        cv=cv,
        n_jobs=-1,
    ).fit(X_train, y_train)
    print(f" {cv}-fold CV")
    print(" Best params:       ", gs.best_params_)
    print(f" Best CV accuracy:  {gs.best_score_:.3f}")
    print(f" Test-set accuracy: {gs.score(X_test, y_test):.3f}")
    print()


# 8. Replace the single SVC with four candidate classifiers: RandomForest,
# LogisticRegression, Perceptron and k-nearest neighbours.
# The search swaps each candidate into the pipeline's 'classifier' step.

from sklearn.ensemble       import RandomForestClassifier
from sklearn.linear_model   import LogisticRegression, Perceptron
from sklearn.neighbors      import KNeighborsClassifier

# Pipeline skeleton. The RandomForestClassifier below is only a placeholder:
# every sub-grid in param_grid overrides it through its own 'classifier' key.
pipe = Pipeline([
    ('scaler',     StandardScaler()),
    ('pca',        PCA()),
    ('classifier', RandomForestClassifier())
])


# 9. One sub-grid per classifier. Each sub-grid sets the 'classifier' step and
# lists only hyper-parameters that exist on that estimator.
param_grid = [
    {   # Random Forest (seeded so repeated runs build the same forests)
        'pca__n_components':        [5, 10, 15],
        'classifier':               [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth':    [None, 5, 10]
    },
    {   # Logistic Regression
        'pca__n_components':   [5, 10, 15],
        'classifier':          [LogisticRegression(max_iter=1000, solver='lbfgs')],
        'classifier__C':       [0.01, 0.1, 1, 10],
        'classifier__penalty': ['l2']
    },
    {   # Perceptron
        'pca__n_components':    [5, 10, 15],
        'classifier':           [Perceptron()],
        # Perceptron's `penalty` defaults to None, and `alpha` only scales the
        # regularisation term when a penalty is active. Without this entry the
        # three alpha values would be identical, wasted candidates.
        'classifier__penalty':  ['l2'],
        'classifier__alpha':    [1e-4, 1e-3, 1e-2],
        'classifier__max_iter': [1000, 2000]
    },
    {   # knn
        'pca__n_components':       [5, 10, 15],
        'classifier':              [KNeighborsClassifier()],
        'classifier__n_neighbors': [3, 5, 7, 9],
        'classifier__weights':     ['uniform', 'distance']
    }
]

# 10. RandomizedSearchCV: sample 20 settings from the multi-classifier search
# space instead of trying every combination.
# The result is bound to `random_search` rather than `random`, so the name of
# the standard-library `random` module is not hidden for the rest of the
# kernel session. It also matches the naming used in the Iris cell.
random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_grid,
    n_iter=20,            # how many random parameter settings to evaluate
    cv=5,                 # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,            # use all available cores
    random_state=42       # fixed seed so the sampled settings are repeatable
)
random_search.fit(X_train, y_train)

# Report the best sampled setting, its mean CV accuracy, and the refit
# pipeline's accuracy on the held-out test split.
print("RandomizedSearchCV Results")
print(" Best parameters found:", random_search.best_params_)
print(" Best CV accuracy:  {:.3f}".format(random_search.best_score_))
print(" Test-set accuracy: {:.3f}".format(random_search.score(X_test, y_test)))


GridSearchCV (cv=5) Results:
 Best parameters found: {'classifier': SVC(), 'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 10}
 Best CV accuracy: 0.978
 Test-set accuracy: 0.982

 3-fold CV
 Best params:        {'classifier': SVC(), 'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 10}
 Best CV accuracy:  0.976
 Test-set accuracy: 0.982

 5-fold CV
 Best params:        {'classifier': SVC(), 'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 10}
 Best CV accuracy:  0.978
 Test-set accuracy: 0.982

 7-fold CV
 Best params:        {'classifier': SVC(), 'classifier__C': 1, 'classifier__kernel': 'rbf', 'pca__n_components': 10}
 Best CV accuracy:  0.978
 Test-set accuracy: 0.965

RandomizedSearchCV Results
 Best parameters found: {'pca__n_components': 5, 'classifier__penalty': 'l2', 'classifier__C': 1, 'classifier': LogisticRegression(max_iter=1000)}
 Best CV accuracy:  0.976
 Test-set accuracy: 0.974
