In [57]:
import numpy as np
import time
from sklearn import datasets, neighbors, tree
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RepeatedStratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score


In [58]:
# Load the digits dataset bundled with scikit-learn and keep the image
# array and the class labels under their own names.
digits = datasets.load_digits()
raw_data, target = digits.images, digits.target

In [59]:
# Flatten every image into a single feature row: the first (sample) axis is
# kept, all remaining axes are merged into one.
data = raw_data.reshape(raw_data.shape[0], -1)
data.shape

(1797, 64)

In [73]:
def cross_grid_search(data, target, estimator, parameter_grid):
    """Run nested cross-validation with hyperparameter tuning and print a summary.

    The estimator is wrapped in a pipeline behind a StandardScaler. An inner
    grid search (10-fold stratified, repeated 3 times, seed 42) selects the
    hyperparameters; an outer 10-fold stratified split evaluates the tuned
    model. Both levels are scored with balanced accuracy, so the "accuracy"
    figures printed below are balanced-accuracy values.

    Parameters
    ----------
    data : feature matrix passed to cross_validate as X.
    target : class labels passed to cross_validate as y.
    estimator : classifier placed in the pipeline step named 'classifier'.
    parameter_grid : grid for GridSearchCV; because the search runs over the
        pipeline, keys address pipeline steps (e.g. 'classifier__<param>').

    Returns
    -------
    None. Results are only printed.
    """
    # Scaling lives inside the pipeline so it is re-fitted on every training
    # split instead of seeing the held-out fold.
    scaled_model = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', estimator),
    ])

    # Inner folds drive the hyperparameter search, outer folds the evaluation.
    inner_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
    outer_cv = StratifiedKFold(n_splits=10)

    tuner = GridSearchCV(
        scaled_model,
        parameter_grid,
        scoring='balanced_accuracy',
        cv=inner_cv,
        n_jobs=-1,
    )

    # Each outer fold fits a complete grid search on its training part.
    results = cross_validate(
        tuner,
        data,
        target,
        scoring='balanced_accuracy',
        cv=outer_cv,
        n_jobs=-1,
    )
    fold_scores = results['test_score']
    fold_fit_times = results['fit_time']

    # (label, value) pairs, printed in this order.
    summary = [
        ('Outer cross-validation accuracy mean:', np.mean(fold_scores)),
        ('Outer cross-validation standard deviation:', np.std(fold_scores)),
        ('Outer cross-validation min accuracy:', np.min(fold_scores)),
        ('Outer cross-validation max accuracy:', np.max(fold_scores)),
        ('Average training time:', np.mean(fold_fit_times)),
        ('Average testing time:', np.mean(results['score_time'])),
        ('Training times for each fold:', fold_fit_times),
    ]
    for label, value in summary:
        print(label, value)


In [61]:
def cross_grid_search_man(data, target, estimator, parameter_grid):
    """Hand-written nested cross-validation that prints the mean outer score.

    Uses 3 stratified outer folds and 3 stratified inner folds. For every
    outer fold a grid search (scored with 'accuracy') is fitted on the
    training part and its best estimator is scored on the held-out part.

    Unlike a pipeline-based search, the estimator is handed to GridSearchCV
    directly, so `parameter_grid` keys must be the estimator's own parameter
    names and no feature scaling is applied.

    Parameters
    ----------
    data : indexable feature matrix (indexed with integer index arrays).
    target : indexable label array, aligned with `data`.
    estimator : estimator tuned by the grid search.
    parameter_grid : grid passed to GridSearchCV as param_grid.

    Returns
    -------
    None. The mean outer score is only printed.
    """
    outer_cv = StratifiedKFold(n_splits=3)
    inner_cv = StratifiedKFold(n_splits=3)

    fold_scores = []
    for fit_rows, eval_rows in outer_cv.split(data, target):
        # Tune hyperparameters using only the outer-training rows.
        tuner = GridSearchCV(
            estimator,
            parameter_grid,
            scoring='accuracy',
            cv=inner_cv,
            n_jobs=-1,
        )
        tuner.fit(data[fit_rows], target[fit_rows])

        # Evaluate the selected model on the rows this outer fold held out,
        # using the estimator's own score() method.
        tuned_model = tuner.best_estimator_
        fold_scores.append(tuned_model.score(data[eval_rows], target[eval_rows]))

    outer_mean = np.mean(fold_scores)
    print('Outer cross validation accuracy mean:', outer_mean)


In [62]:
# k-nearest-neighbours classifier with default settings; the grid below
# supplies the values to try.
knn = neighbors.KNeighborsClassifier()

# Keys carry the 'classifier__' prefix because the search runs over a pipeline
# whose classifier step is named 'classifier'.
parameter_grid_knn = {
    'classifier__n_neighbors': list(range(1, 11)),
    'classifier__weights': ['uniform', 'distance'],
}

In [63]:
# Nested cross-validation for the k-nearest-neighbours classifier.
cross_grid_search(
    data=data,
    target=target,
    estimator=knn,
    parameter_grid=parameter_grid_knn,
)

Outer cross validation accuracy mean: 0.9599724802201584
Outer cross validation accuracy mean: 0.9599724802201584
Outer cross validation standard deviation: 0.024419749950505894
Outer cross validation min accuracy: 0.9111111111111112
Outer cross validation max accuracy: 0.988562091503268
average training time: 14.242198204994201
average testing time: 0.0074977874755859375
the training times are: [14.12559891 14.10630274 14.20438147 14.25317717 14.32961798 14.32807255
 14.24914694 14.32749915 14.2393353  14.25884986]


In [64]:
from sklearn import tree

# Decision tree classifier ("Baum" is German for tree) with default settings.
baum = tree.DecisionTreeClassifier()

# Grid keys use the 'classifier__' prefix of the pipeline step.
# NOTE(review): the depth list skips 8 (it jumps from 4 to 16) - confirm
# whether that gap is intentional.
parameter_grid_baum = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [1, 2, 4] + [2 ** exponent for exponent in range(4, 10)],
}

In [65]:
# Nested cross-validation for the decision tree.
cross_grid_search(
    data=data,
    target=target,
    estimator=baum,
    parameter_grid=parameter_grid_baum,
)

Outer cross validation accuracy mean: 0.8237736498108015
Outer cross validation accuracy mean: 0.8237736498108015
Outer cross validation standard deviation: 0.05210719522518863
Outer cross validation min accuracy: 0.7402304781561748
Outer cross validation max accuracy: 0.9055555555555556
average training time: 10.522475004196167
average testing time: 0.0055951356887817385
the training times are: [10.45115304 10.50179029 10.66974378 10.57261729 10.3226018  10.66127968
 10.56331062 10.52452421 10.43784094 10.5198884 ]


In [66]:
# Support vector classifier; the kernel is fixed through the grid below.
svm = SVC()

# Polynomial kernel: vary the regularisation strength C and the degree.
parameter_grid_svm_poly = dict(
    classifier__C=[0.1, 1, 10, 100],
    classifier__kernel=['poly'],
    classifier__degree=[1, 2, 3],
)

In [67]:
# Nested cross-validation for the SVM with polynomial kernel.
cross_grid_search(
    data=data,
    target=target,
    estimator=svm,
    parameter_grid=parameter_grid_svm_poly,
)

Outer cross validation accuracy mean: 0.9641486068111457
Outer cross validation accuracy mean: 0.9641486068111457
Outer cross validation standard deviation: 0.025877213503387234
Outer cross validation min accuracy: 0.9222222222222223
Outer cross validation max accuracy: 0.9944444444444445
average training time: 20.097489833831787
average testing time: 0.012108278274536134
the training times are: [20.15503931 20.16760755 20.04724765 20.05429268 20.1037941  20.20146036
 20.09235048 20.09788156 20.08543324 19.96979141]


In [68]:
# Fresh support vector classifier for the RBF-kernel experiment.
svm = SVC()

# RBF kernel: vary the regularisation strength C and the kernel width gamma.
parameter_grid_svm_rbf = dict(
    classifier__C=[0.1, 1, 10, 100],
    classifier__kernel=['rbf'],
    classifier__gamma=[0.001, 0.01, 0.1, 1, 10],
)

In [69]:
# Nested cross-validation for the SVM with RBF kernel.
cross_grid_search(
    data=data,
    target=target,
    estimator=svm,
    parameter_grid=parameter_grid_svm_rbf,
)

Outer cross validation accuracy mean: 0.96062263501892
Outer cross validation accuracy mean: 0.96062263501892
Outer cross validation standard deviation: 0.02069778185978872
Outer cross validation min accuracy: 0.9221878224974202
Outer cross validation max accuracy: 0.9947368421052631
average training time: 62.82005243301391
average testing time: 0.022557663917541503
the training times are: [18.68593407 18.74477696 73.898314   73.92340565 73.81959748 73.76790118
 73.77402687 73.86487532 73.90044045 73.82125235]


In [70]:
# Final model: scaler + SVC, tuned over the polynomial-kernel grid.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', svm),
])

# Plain (non-nested) 3-fold stratified grid search scored with accuracy.
grid_seach = GridSearchCV(
    pipeline,
    parameter_grid_svm_poly,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=3),
    n_jobs=-1,
)

# Fit the search on the complete dataset (no outer hold-out here).
grid_seach.fit(data, target)


In [71]:
# Load the external sample that the fitted grid search should classify.
sample_raw = np.load('sample_digit.npy')

# Flatten the loaded sample to the feature layout the model was fitted on.
# The previous version reshaped `raw_data` here, which threw the loaded file
# away and produced predictions for the training images instead.
# reshape(-1, n_features) works for a single image as well as for a stack of
# images and raises a ValueError if the element count does not fit.
# NOTE(review): assumes the file stores one or more images with the same
# pixel layout as the training images - confirm against the file.
test_data = sample_raw.reshape(-1, data.shape[1])

predictions = grid_seach.predict(test_data)