In [15]:
import os   #linar regression for binary classes
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,ConfusionMatrixDisplay

import matplotlib.patheffects as PathEffects
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.metrics import classification_report, RocCurveDisplay
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score

In [16]:
inpDir = os.path.join( '..', 'input')


###moduleDir = 'MACHINE LEARNING'

outDir = os.path.join('output')


# define and set random state 
RANDOM_STATE = 24
np.random.seed(RANDOM_STATE) # Set Random Seed for reproducible  results

NOISE = 0.1
N_SAMPLES = 1000
ALPHA = 0.001

# parameters for Matplotlib
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (15, 6),
          'axes.labelsize': 'x-large',
          'axes.titlesize':'x-large',
          'xtick.labelsize':'x-large',
          'ytick.labelsize':'x-large',
          'savefig.dpi': 150,
          'image.cmap': 'jet',
          'image.interpolation': 'none',
          'savefig.bbox' : 'tight',
          'lines.linewidth' : 2,
          'legend.numpoints' : 1
         }
CMAP = plt.cm.rainbow
plt.rcParams.update(params);
plt.set_cmap(CMAP);
plt.style.use('seaborn-v0_8-darkgrid') # plt.style.use('ggplot')

TEST_SIZE=0.2

<Figure size 1500x600 with 0 Axes>

In [17]:
X, y = load_iris(return_X_y = True)
X.shape, y.shape

((150, 4), (150,))

In [18]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE) 

In [19]:
clf = svm.SVC(kernel = 'linear', C = 1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9666666666666667

In [26]:
# Scores for 5 CVs

clf = svm.SVC(kernel = 'linear', C = 1)

scores = cross_val_score(clf, X, y, cv = 5, scoring = 'f1_macro')      # split data in 5 folds and score

for i in range(5):
    print(f'Fold :{i}, {scores[i]:.3f} accuracy')
    
print(f"\nAverage: {scores.mean():.3f} accuracy with a standard deviation of {scores.std():.3f}")

Fold :0, 0.967 accuracy
Fold :1, 1.000 accuracy
Fold :2, 0.967 accuracy
Fold :3, 0.967 accuracy
Fold :4, 1.000 accuracy

Average: 0.980 accuracy with a standard deviation of 0.016


In [27]:
from sklearn.model_selection import ShuffleSplit

n_samples = X.shape[0]

n_splits = 5

clf = svm.SVC(kernel = 'linear', C = 1)

cv = ShuffleSplit(n_splits = n_splits, test_size = TEST_SIZE, random_state = RANDOM_STATE)

scores = cross_val_score(clf, X, y, cv = cv, scoring = 'f1_macro')      

for i in range(n_splits):
    print(f'Fold :{i}, {scores[i]:.3f} accuracy')
    
print(f"\nAverage: {scores.mean():.3f} accuracy with a standard deviation of {scores.std():.3f}")

Fold :0, 0.942 accuracy
Fold :1, 1.000 accuracy
Fold :2, 1.000 accuracy
Fold :3, 0.972 accuracy
Fold :4, 0.966 accuracy

Average: 0.976 accuracy with a standard deviation of 0.022


In [45]:
params = {'kernel': ('linear','rbf','poly'), 'C':[0.01, 0.1, 1, 10,100]}

svc = svm.SVC()

clf = GridSearchCV(svc, params)

clf.fit(X, y)

In [46]:
sorted(clf.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'param_kernel',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [47]:
clf.cv_results_

{'mean_fit_time': array([0.00100012, 0.00099955, 0.00060005, 0.00059996, 0.00059991,
        0.00039988, 0.00100613, 0.00059819, 0.00059986, 0.00059996,
        0.000599  , 0.0006    , 0.0007936 , 0.00040526, 0.00140038]),
 'std_fit_time': array([4.94257298e-06, 1.15430054e-06, 4.89940362e-04, 4.89862464e-04,
        4.89823887e-04, 4.89745640e-04, 1.21784994e-05, 4.88438409e-04,
        4.89784698e-04, 4.89862464e-04, 4.89087961e-04, 4.89902520e-04,
        3.96997723e-04, 4.96449233e-04, 7.98393255e-04]),
 'mean_score_time': array([0.00059795, 0.00100012, 0.00039992, 0.00040002, 0.00079989,
        0.00040011, 0.        , 0.00040092, 0.00020008, 0.00039997,
        0.00040107, 0.00019989, 0.00020051, 0.00060048, 0.00019956]),
 'std_score_time': array([4.88234309e-04, 6.46813391e-07, 4.89804047e-04, 4.89920847e-04,
        3.99947547e-04, 4.90038622e-04, 0.00000000e+00, 4.91052698e-04,
        4.00161743e-04, 4.89862650e-04, 4.91206952e-04, 3.99780273e-04,
        4.01020050e-04, 4.90

In [48]:
clf.best_estimator_

In [50]:
params = {'kernel': ('linear','rbf','sigmoid'),
          'C':[1,2,5,8,10],
          'gamma': [0.001, 0.05,0.1]}

svc = svm.SVC()

clf = RandomizedSearchCV(svc, params)

clf.fit(X,y)

sorted(clf.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'param_gamma',
 'param_kernel',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [51]:
clf.fit(X,y)

In [52]:
clf.cv_results_

{'mean_fit_time': array([0.00079975, 0.00100074, 0.00080042, 0.00060034, 0.00099998,
        0.00100007, 0.00079985, 0.00060005, 0.00079985, 0.00079999]),
 'std_fit_time': array([3.99876181e-04, 2.09265788e-06, 4.00210492e-04, 4.90174660e-04,
        1.78416128e-07, 2.43140197e-07, 3.99926011e-04, 4.89941453e-04,
        3.99929365e-04, 4.00023651e-04]),
 'mean_score_time': array([0.00039954, 0.00099893, 0.00040007, 0.00079932, 0.00060649,
        0.00099983, 0.00040011, 0.00040016, 0.0006001 , 0.        ]),
 'std_score_time': array([4.89336843e-04, 2.15053030e-06, 4.89979265e-04, 3.99661530e-04,
        4.95324807e-04, 6.14361702e-07, 4.90037787e-04, 4.90096251e-04,
        4.89983233e-04, 0.00000000e+00]),
 'param_kernel': masked_array(data=['rbf', 'rbf', 'linear', 'linear', 'rbf', 'sigmoid',
                    'sigmoid', 'rbf', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
   

In [53]:
clf.best_estimator_

### K-Fold

In [56]:
kf = KFold(n_splits = 3, random_state = RANDOM_STATE, shuffle = True)
print(f'Number of splits: {kf.get_n_splits(X)}')
print(kf)
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    print(f" Train: index = {train_index}")
    print(f" Test: index = {test_index}")

Number of splits: 3
KFold(n_splits=3, random_state=24, shuffle=True)
Fold 0:
 Train: index = [  0   2   3   4   5   6   7   8  10  12  13  16  18  21  23  24  27  28
  32  33  34  35  36  38  39  41  42  43  44  45  46  47  50  52  54  55
  56  60  61  62  63  64  65  66  67  68  70  71  73  74  75  76  79  80
  81  82  83  85  86  87  89  90  91  93  94  96  97  98  99 100 101 102
 106 110 115 116 117 118 119 121 122 123 124 125 126 127 128 129 130 131
 132 134 138 139 142 143 145 146 147 149]
 Test: index = [  1   9  11  14  15  17  19  20  22  25  26  29  30  31  37  40  48  49
  51  53  57  58  59  69  72  77  78  84  88  92  95 103 104 105 107 108
 109 111 112 113 114 120 133 135 136 137 140 141 144 148]
Fold 1:
 Train: index = [  0   1   3   4   5   6   7   9  10  11  12  14  15  17  18  19  20  22
  25  26  27  28  29  30  31  33  35  37  40  41  42  45  46  47  48  49
  51  53  55  56  57  58  59  60  64  65  69  70  72  73  74  77  78  81
  82  83  84  87  88  89  92  95  97 1