In [1]:
# Load Libraries
import pandas as pd
import numpy as np

# Set random seed
np.random.seed(42)

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


### Load Data

In [2]:
# Load data to a DataFrame
beanDF = pd.read_excel('DryBeanDataset/Dry_Bean_Dataset.xlsx')

In [3]:
# Get a look at the data
beanDF.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.27275,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,SEKER
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,SEKER


In [4]:
# Look at the shape of the DataFrame
beanDF.shape

(13611, 17)

In [5]:
# See the relative numbers of each target category
beanDF.value_counts('Class')

Class
DERMASON    3546
SIRA        2636
SEKER       2027
HOROZ       1928
CALI        1630
BARBUNYA    1322
BOMBAY       522
dtype: int64

## Split Training and Testing Data

In [6]:
# Load libraries
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [7]:
# Set up features target sets
X = beanDF[beanDF.columns[:-1]]
y = beanDF.Class

Let's try this without encoding the target:

In [8]:
# # Encode the target variables
# le = LabelEncoder()
# y = le.fit_transform(y)

In [9]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, 
                                                    random_state=42)

In [10]:
# Standardize Features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) 
X_test  = scaler.transform(X_test)

## Logistic Regression - Multiclass

### Multinomial Logistic Regression (MLR)

In [11]:
# Load libraries
from sklearn.linear_model import LogisticRegression

In [12]:
# Create logistic regression
logit_mlr = LogisticRegression(class_weight='balanced', multi_class='multinomial', n_jobs=-1)

In [13]:
%%time
# Train model
logit_mlr.fit(X_train, y_train)

Wall time: 2.73 s


LogisticRegression(class_weight='balanced', multi_class='multinomial',
                   n_jobs=-1)

In [14]:
# Get accuracy score
score = round(logit_mlr.score(X_test, y_test)*100, 2)
print(f"Accuracy:\t{score}")

Accuracy:	92.31


### Evaluate

In [15]:
# Load libraries
from sklearn.metrics import classification_report, confusion_matrix

In [16]:
# Get predictions
predictions = logit_mlr.predict(X_test)

#### Confusion matrix

In [17]:
# Print confusion matrix
cm = confusion_matrix(y_test, predictions)
print(cm)

[[369   0  20   0   0   1   5]
 [  0 161   0   0   0   0   0]
 [ 19   0 446   0   7   2   5]
 [  0   0   0 916   1  21 105]
 [  3   0   5   4 564   0  12]
 [  9   0   0   7   0 583  20]
 [  1   0   3  40  11  13 731]]


#### Classification report

In [18]:
# Print precision, recall, f1-score, and accuracy
cr = classification_report(y_test, predictions)
print(cr)

              precision    recall  f1-score   support

    BARBUNYA       0.92      0.93      0.93       395
      BOMBAY       1.00      1.00      1.00       161
        CALI       0.94      0.93      0.94       479
    DERMASON       0.95      0.88      0.91      1043
       HOROZ       0.97      0.96      0.96       588
       SEKER       0.94      0.94      0.94       619
        SIRA       0.83      0.91      0.87       799

    accuracy                           0.92      4084
   macro avg       0.94      0.94      0.94      4084
weighted avg       0.93      0.92      0.92      4084



### One-vs-Rest Regression (OVR)

MLR was better. Going to try to run a grid search for the best parameters.

### GridSearchCV with Logistic Regression

In [19]:
# Load libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [20]:
# Create logistic regression
logistic = LogisticRegression(class_weight='balanced', 
                              random_state=42, 
                              max_iter=200, 
                              verbose=1, 
                              n_jobs=-1)

In [21]:
# Create range of candidate penalty hyperparameter values
parameter_space = {
    'penalty': ['l1', 'l2', 'elasticnet'], 
    'C': np.logspace(3, 5, 7), 
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
    'multi_class': ['ovr', 'multinomial'],
}
grid = GridSearchCV(logistic, parameter_space, verbose=2, n_jobs=-1, cv=5)

In [22]:
%%time
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 210 candidates, totalling 1050 fits


 0.91949126 0.92190546 0.91529289 0.915293          nan        nan
        nan        nan        nan        nan        nan        nan
        nan 0.9176021  0.92159056 0.91854665        nan 0.91760199
 0.9176021         nan        nan        nan        nan        nan
        nan        nan 0.92190552        nan 0.915293   0.91949104
 0.91949115 0.92148562 0.91529289 0.915293          nan        nan
        nan        nan        nan        nan        nan        nan
        nan 0.9176021  0.92201045 0.91875646        nan 0.91770692
 0.9176021         nan        nan        nan        nan        nan
        nan        nan 0.92169554        nan 0.915293   0.91928129
 0.91970118 0.9223253  0.91529289 0.915293          nan        nan
        nan        nan        nan        nan        nan        nan
        nan 0.9176021  0.92211538 0.91896633        nan 0.91770692
 0.9176021         nan        nan        nan        nan        nan
        nan        nan 0.92180053        nan 0.915293   0.9195

[LibLinear]Wall time: 23min 22s


In [23]:
print('Best parameters found:\n', grid_result.best_params_)

Best parameters found:
 {'C': 100000.0, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'liblinear'}


In [24]:
# Get accuracy score
score = round(grid_result.score(X_test, y_test)*100, 2)
print(f"Accuracy:\t{score}")

Accuracy:	92.36


### Evaluate

In [25]:
# Load libraries
from sklearn.metrics import classification_report, confusion_matrix

In [26]:
# Get predictions
predictions = grid_result.predict(X_test)

#### Confusion matrix

In [27]:
# Print confusion matrix
cm = confusion_matrix(y_test, predictions)
print(cm)

[[366   0  22   0   0   1   6]
 [  0 161   0   0   0   0   0]
 [ 17   0 447   0   7   2   6]
 [  0   0   0 930   2  19  92]
 [  2   0   8   5 557   0  16]
 [  9   0   0   7   0 583  20]
 [  2   0   2  51   9   7 728]]


#### Classification report

In [28]:
# Print precision, recall, f1-score, and accuracy
cr = classification_report(y_test, predictions)
print(cr)

              precision    recall  f1-score   support

    BARBUNYA       0.92      0.93      0.93       395
      BOMBAY       1.00      1.00      1.00       161
        CALI       0.93      0.93      0.93       479
    DERMASON       0.94      0.89      0.91      1043
       HOROZ       0.97      0.95      0.96       588
       SEKER       0.95      0.94      0.95       619
        SIRA       0.84      0.91      0.87       799

    accuracy                           0.92      4084
   macro avg       0.94      0.94      0.94      4084
weighted avg       0.93      0.92      0.92      4084



### GridSearchCV with Random Forest

In [29]:
# Load libraries
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Create classifier
classifier = RandomForestClassifier(random_state=42,
                                    verbose=1,
                                    class_weight='balanced', 
                                    n_jobs=-1)

In [31]:
# Create range of candidate penalty hyperparameter values
parameter_space = {
    'n_estimators': [10, 30, 100, 300, 1000], 
    'criterion': ['gini', 'entropy'], 
    'max_features': ['sqrt', 'log2'], 
}
grid = GridSearchCV(classifier, parameter_space, verbose=2, n_jobs=-1, cv=5)

In [32]:
%%time
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.6s


Wall time: 5min 4s


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.7s finished


In [33]:
print('Best parameters found:\n', grid_result.best_params_)

Best parameters found:
 {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 100}


In [34]:
# Get accuracy score
score = round(grid_result.score(X_test, y_test)*100, 2)
print(f"Accuracy:\t{score}")

Accuracy:	92.41


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


### Evaluate

In [35]:
# Load libraries
from sklearn.metrics import classification_report, confusion_matrix

In [36]:
# Get predictions
predictions = grid_result.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


#### Confusion matrix

In [37]:
# Print confusion matrix
cm = confusion_matrix(y_test, predictions)
print(cm)

[[355   0  26   0   2   3   9]
 [  0 161   0   0   0   0   0]
 [ 17   0 447   0   8   2   5]
 [  0   0   0 969   2  17  55]
 [  3   0   8   3 557   0  17]
 [  4   0   0  18   0 584  13]
 [  1   0   1  78   8  10 701]]


#### Classification report

In [38]:
# Print precision, recall, f1-score, and accuracy
cr = classification_report(y_test, predictions)
print(cr)

              precision    recall  f1-score   support

    BARBUNYA       0.93      0.90      0.92       395
      BOMBAY       1.00      1.00      1.00       161
        CALI       0.93      0.93      0.93       479
    DERMASON       0.91      0.93      0.92      1043
       HOROZ       0.97      0.95      0.96       588
       SEKER       0.95      0.94      0.95       619
        SIRA       0.88      0.88      0.88       799

    accuracy                           0.92      4084
   macro avg       0.94      0.93      0.93      4084
weighted avg       0.92      0.92      0.92      4084



### GridSearchCV with Decision Tree

In [39]:
# Load libraries
from sklearn.tree import DecisionTreeClassifier

In [40]:
# Create decision tree regressor object
decisiontree = DecisionTreeClassifier(random_state=42, class_weight='balanced')

In [41]:
# Create range of candidate penalty hyperparameter values
parameter_space = {
    'criterion': ['gini', 'entropy'], 
    'splitter': ['best', 'random'], 
    'max_features': ['sqrt', 'log2'], 
}
grid = GridSearchCV(decisiontree, parameter_space, verbose=2, n_jobs=-1, cv=5)

In [42]:
%%time
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Wall time: 915 ms


In [43]:
print('Best parameters found:\n', grid_result.best_params_)

Best parameters found:
 {'criterion': 'entropy', 'max_features': 'sqrt', 'splitter': 'best'}


In [44]:
# Get accuracy score
score = round(grid_result.score(X_test, y_test)*100, 2)
print(f"Accuracy:\t{score}")

Accuracy:	88.34


### Evaluate

In [45]:
# Load libraries
from sklearn.metrics import classification_report, confusion_matrix

In [46]:
# Get predictions
predictions = grid_result.predict(X_test)

#### Confusion matrix

In [47]:
# Print confusion matrix
cm = confusion_matrix(y_test, predictions)
print(cm)

[[342   0  35   0   2   6  10]
 [  0 161   0   0   0   0   0]
 [ 38   0 428   0   9   1   3]
 [  0   0   0 918   7  17 101]
 [  5   0  15   8 537   0  23]
 [  6   0   1  34   0 557  21]
 [ 11   0   2  82  18  21 665]]


#### Classification report

In [48]:
# Print precision, recall, f1-score, and accuracy
cr = classification_report(y_test, predictions)
print(cr)

              precision    recall  f1-score   support

    BARBUNYA       0.85      0.87      0.86       395
      BOMBAY       1.00      1.00      1.00       161
        CALI       0.89      0.89      0.89       479
    DERMASON       0.88      0.88      0.88      1043
       HOROZ       0.94      0.91      0.93       588
       SEKER       0.93      0.90      0.91       619
        SIRA       0.81      0.83      0.82       799

    accuracy                           0.88      4084
   macro avg       0.90      0.90      0.90      4084
weighted avg       0.88      0.88      0.88      4084



### GridSearchCV with AdaBoost

In [49]:
# Load libraries
from sklearn.ensemble import AdaBoostClassifier

In [50]:
# Create ...
adaboost = AdaBoostClassifier(random_state=42)

In [51]:
# Create range of candidate penalty hyperparameter values
parameter_space = {
    'n_estimators': [10, 30, 100, 300, 1000], 
    'algorithm': ['SAMME', 'SAMME.R'], 
}
grid = GridSearchCV(adaboost, parameter_space, verbose=2, n_jobs=-1, cv=5)

In [52]:
%%time
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Wall time: 2min 1s


In [53]:
print('Best parameters found:\n', grid_result.best_params_)

Best parameters found:
 {'algorithm': 'SAMME', 'n_estimators': 100}


In [54]:
# Get accuracy score
score = round(grid_result.score(X_test, y_test)*100, 2)
print(f"Accuracy:\t{score}")

Accuracy:	79.29


### Evaluate

In [55]:
# Load libraries
from sklearn.metrics import classification_report, confusion_matrix

In [56]:
# Get predictions
predictions = grid_result.predict(X_test)

#### Confusion matrix

In [57]:
# Print confusion matrix
cm = confusion_matrix(y_test, predictions)
print(cm)

[[355   0  29   0   0   2   9]
 [  0  35 126   0   0   0   0]
 [281   0 180   0  12   2   4]
 [  0   0   0 920   3  74  46]
 [ 30   0   3   6 541   0   8]
 [  7   0   0  24   0 573  15]
 [ 14   0   1 122  17  11 634]]


#### Classification report

In [58]:
# Print precision, recall, f1-score, and accuracy
cr = classification_report(y_test, predictions)
print(cr)

              precision    recall  f1-score   support

    BARBUNYA       0.52      0.90      0.66       395
      BOMBAY       1.00      0.22      0.36       161
        CALI       0.53      0.38      0.44       479
    DERMASON       0.86      0.88      0.87      1043
       HOROZ       0.94      0.92      0.93       588
       SEKER       0.87      0.93      0.89       619
        SIRA       0.89      0.79      0.84       799

    accuracy                           0.79      4084
   macro avg       0.80      0.72      0.71      4084
weighted avg       0.81      0.79      0.78      4084



### 17.1 Support Vector Classifier

In [59]:
# Load libraries
from sklearn.svm import LinearSVC

In [60]:
# Create support vector classifier
svc = LinearSVC(random_state=42, dual=False, class_weight='balanced', verbose=1, max_iter=100000)

In [61]:
# Create range of candidate penalty hyperparameter values
parameter_space = {
    'penalty': ['l1', 'l2'], 
    'loss': ['hinge', 'squared_hinge'], 
    'C': np.logspace(0, 3, 5), 
    'multi_class': ['ovr', 'crammer_singer'], 
}
grid = GridSearchCV(svc, parameter_space, verbose=2, n_jobs=-1, cv=5)

In [62]:
%%time
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


 0.9191763  0.9191763         nan        nan 0.91791706 0.91791706
 0.91949076 0.91739158 0.91791706 0.91791706        nan        nan
 0.91970107 0.91970107 0.92085559 0.91917602 0.91970107 0.91970107
        nan        nan 0.92264009 0.92264009 0.92096064 0.92075061
 0.92264009 0.92264009        nan        nan 0.75953676 0.75953676
 0.92085565 0.92096064 0.75953676 0.75953676]


[LibLinear]Wall time: 27min 37s




In [63]:
print('Best parameters found:\n', grid_result.best_params_)

Best parameters found:
 {'C': 177.82794100389228, 'loss': 'hinge', 'multi_class': 'crammer_singer', 'penalty': 'l1'}


In [64]:
# Get accuracy score
score = round(grid_result.score(X_test, y_test)*100, 2)
print(f"Accuracy:\t{score}")

Accuracy:	92.41


### Evaluate

In [65]:
# Load libraries
from sklearn.metrics import classification_report, confusion_matrix

In [66]:
# Get predictions
predictions = grid_result.predict(X_test)

#### Confusion matrix

In [67]:
# Print confusion matrix
cm = confusion_matrix(y_test, predictions)
print(cm)

[[370   0  20   0   0   0   5]
 [  0 161   0   0   0   0   0]
 [ 21   0 445   0   7   1   5]
 [  0   0   0 927   5  18  93]
 [  3   0   7   3 559   0  16]
 [ 10   0   0   8   0 583  18]
 [  3   0   2  46  10   9 729]]


#### Classification report

In [68]:
# Print precision, recall, f1-score, and accuracy
cr = classification_report(y_test, predictions)
print(cr)

              precision    recall  f1-score   support

    BARBUNYA       0.91      0.94      0.92       395
      BOMBAY       1.00      1.00      1.00       161
        CALI       0.94      0.93      0.93       479
    DERMASON       0.94      0.89      0.91      1043
       HOROZ       0.96      0.95      0.96       588
       SEKER       0.95      0.94      0.95       619
        SIRA       0.84      0.91      0.88       799

    accuracy                           0.92      4084
   macro avg       0.94      0.94      0.94      4084
weighted avg       0.93      0.92      0.92      4084



### 17.2 Linearly Inseparable Classes Using Kernels (& SVC)

In [69]:
# Load libraries
from sklearn.svm import SVC

In [70]:
# Create support vector classifier
svc = SVC(random_state=42, class_weight='balanced', verbose=1, max_iter=5000)

In [71]:
# Create range of candidate penalty hyperparameter values
parameter_space = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'], 
    'C': np.logspace(0, 2, 5), 
    'shrinking': [True, False], 
    'decision_function_shape': ['ovo', 'ovr'],
}
grid = GridSearchCV(svc, parameter_space, verbose=2, n_jobs=-1, cv=5)

In [72]:
%%time
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 160 candidates, totalling 800 fits
[LibSVM]Wall time: 4min 27s


In [73]:
print('Best parameters found:\n', grid_result.best_params_)

Best parameters found:
 {'C': 31.622776601683793, 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'rbf', 'shrinking': True}


In [74]:
# Get accuracy score
score = round(grid_result.score(X_test, y_test)*100, 2)
print(f"Accuracy:\t{score}")

Accuracy:	93.22


### Evaluate

In [75]:
# Load libraries
from sklearn.metrics import classification_report, confusion_matrix

In [76]:
# Get predictions
predictions = grid_result.predict(X_test)

#### Confusion matrix

In [77]:
# Print confusion matrix
cm = confusion_matrix(y_test, predictions)
print(cm)

[[372   0  17   0   1   2   3]
 [  0 161   0   0   0   0   0]
 [ 19   0 447   0   8   2   3]
 [  0   0   0 952   1  18  72]
 [  3   0   6   6 563   0  10]
 [  4   0   0  12   0 591  12]
 [  2   0   1  58   8   9 721]]


#### Classification report

In [78]:
# Print precision, recall, f1-score, and accuracy
cr = classification_report(y_test, predictions)
print(cr)

              precision    recall  f1-score   support

    BARBUNYA       0.93      0.94      0.94       395
      BOMBAY       1.00      1.00      1.00       161
        CALI       0.95      0.93      0.94       479
    DERMASON       0.93      0.91      0.92      1043
       HOROZ       0.97      0.96      0.96       588
       SEKER       0.95      0.95      0.95       619
        SIRA       0.88      0.90      0.89       799

    accuracy                           0.93      4084
   macro avg       0.94      0.94      0.94      4084
weighted avg       0.93      0.93      0.93      4084

