In [16]:
# Load Libraries
import pandas as pd
import numpy as np

# Set random seed
np.random.seed(42)

### Load Data

In [17]:
# Load data to a DataFrame
beanDF = pd.read_excel('DryBeanDataset/Dry_Bean_Dataset.xlsx')

In [18]:
# Drop least useful features
beanDF.drop(columns=['Extent', 'Solidity', 'Eccentricity', 'ShapeFactor3'], inplace=True)

## Model Selection & Evaluation
Used GridSearchCV or RandomizedSearchCV to find the best hyperparameters for each modeling type

In [19]:
# Load libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### Split Training and Testing Data

In [20]:
# Set up features target sets
X = beanDF[beanDF.columns[:-1]]
y = beanDF.Class

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, 
                                                    random_state=42)

# Standardize Features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) 
X_test  = scaler.transform(X_test)

In [21]:
# Set up dictionary for model results
models_dict = {}

### Logistic Regression

In [24]:
# Load libraries
from sklearn.linear_model import LogisticRegression

# Create logistic regression
logistic = LogisticRegression(class_weight='balanced', 
                              random_state=42, 
                              max_iter=200, 
                              n_jobs=-1)

# Create range of candidate penalty hyperparameter values
parameter_space = {
    'penalty': ['l1', 'l2', 'elasticnet'], 
    'C': np.logspace(0, 5, 6), 
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
    'multi_class': ['ovr', 'multinomial'],
}
grid = GridSearchCV(logistic, parameter_space, n_jobs=-1, cv=5)

In [25]:
%%time
# Fit models
grid_result = grid.fit(X_train, y_train)

 0.91466319 0.91329874 0.91466319 0.91434828        nan        nan
        nan        nan        nan        nan        nan        nan
        nan 0.91476801 0.91466297 0.91455803        nan 0.91455803
 0.91466297        nan        nan        nan        nan        nan
        nan        nan 0.91938633        nan 0.91487266 0.91728676
 0.91728676 0.91728676 0.91581738 0.91508253        nan        nan
        nan        nan        nan        nan        nan        nan
        nan 0.91539771 0.91623744 0.91623738        nan 0.91508286
 0.9151879         nan        nan        nan        nan        nan
        nan        nan 0.92022606        nan 0.91508258 0.91812665
 0.91812665 0.91812654 0.91707711 0.9149776         nan        nan
        nan        nan        nan        nan        nan        nan
        nan 0.91529278 0.91749695 0.91749684        nan 0.91550275
 0.91529278        nan        nan        nan        nan        nan
        nan        nan 0.92054102        nan 0.91508258 0.9191

Wall time: 11min 6s




In [26]:
# Show best parameters
print('Best parameters found:\n', grid_result.best_params_)
# Get accuracy score
score = round(grid_result.score(X_test, y_test)*100, 2)
print(f"Accuracy:  {score}")

Best parameters found:
 {'C': 100000.0, 'multi_class': 'ovr', 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy:  92.14


### Random Forest

In [29]:
# Load libraries
from sklearn.ensemble import RandomForestClassifier

# Create classifier
rfc = RandomForestClassifier(random_state=42,
                                    class_weight='balanced', 
                                    n_jobs=-1)

# Create range of candidate penalty hyperparameter values
parameter_space = {
    'n_estimators': [10, 30, 100, 300, 1000], 
    'criterion': ['gini', 'entropy'], 
    'max_features': ['sqrt', 'log2'], 
}
grid = GridSearchCV(rfc, parameter_space, n_jobs=-1, cv=5)

In [30]:
%%time
# Fit models
grid_result = grid.fit(X_train, y_train)

Wall time: 4min 37s


In [31]:
# Show best parameters
print('Best parameters found:\n', grid_result.best_params_)
# Get accuracy score
score = round(grid_result.score(X_test, y_test)*100, 2)
print(f"Accuracy:  {score}")

Best parameters found:
 {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 300}
Accuracy:  92.19


### Decision Tree

In [34]:
# Load libraries
from sklearn.tree import DecisionTreeClassifier

# Create decision tree regressor object
decisiontree = DecisionTreeClassifier(random_state=42, class_weight='balanced')

# Create range of candidate penalty hyperparameter values
parameter_space = {
    'criterion': ['gini', 'entropy'], 
    'splitter': ['best', 'random'], 
    'max_features': ['sqrt', 'log2'], 
}
grid = GridSearchCV(decisiontree, parameter_space, verbose=2, n_jobs=-1, cv=5)

In [35]:
%%time
# Fit models
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Wall time: 845 ms


In [36]:
# Show best parameters
print('Best parameters found:\n', grid_result.best_params_)
# Get accuracy score
score = round(grid_result.score(X_test, y_test)*100, 2)
print(f"Accuracy:  {score}")

Best parameters found:
 {'criterion': 'entropy', 'max_features': 'sqrt', 'splitter': 'best'}
Accuracy:  89.03


### AdaBoost

In [42]:
# Load libraries
from sklearn.ensemble import AdaBoostClassifier

# Create classifier
adaboost = AdaBoostClassifier(random_state=42)

# Create range of candidate penalty hyperparameter values
parameter_space = {
    'n_estimators': [10, 30, 100, 300, 1000], 
    'algorithm': ['SAMME', 'SAMME.R'], 
}
grid = GridSearchCV(adaboost, parameter_space, n_jobs=-1, cv=5)

In [43]:
%%time
# Fit models
grid_result = grid.fit(X_train, y_train)

Wall time: 2min 38s


In [44]:
# Show best parameters
print('Best parameters found:\n', grid_result.best_params_)
# Get accuracy score
score = round(grid_result.score(X_test, y_test)*100, 2)
print(f"Accuracy:  {score}")

Best parameters found:
 {'algorithm': 'SAMME', 'n_estimators': 100}
Accuracy:  86.29


### Support Vector Classifier (Linear)

In [45]:
np.logspace(0,3,5)

array([   1.        ,    5.62341325,   31.6227766 ,  177.827941  ,
       1000.        ])

In [47]:
# Load libraries
from sklearn.svm import LinearSVC

# Create support vector classifier
svc = LinearSVC(random_state=42, dual=False, class_weight='balanced', max_iter=100000)

# Create range of candidate penalty hyperparameter values
parameter_space = {
    'penalty': ['l1', 'l2'], 
    'loss': ['hinge', 'squared_hinge'], 
    'C': [1, 5, 30, 200, 1000], 
    'multi_class': ['ovr', 'crammer_singer'], 
}
grid = GridSearchCV(svc, parameter_space, n_jobs=-1, cv=5)

In [48]:
%%time
# Fit models
grid_result = grid.fit(X_train, y_train)

 0.91508308 0.91508308        nan        nan 0.91644747 0.91644747
 0.91844123 0.91697163 0.91644747 0.91644747        nan        nan
 0.91812676 0.91812676 0.9183364  0.91802161 0.91812676 0.91812676
        nan        nan 0.91970123 0.91970123 0.9186512  0.91844134
 0.91970123 0.91970123        nan        nan 0.90805006 0.90805006
 0.9186512  0.91928101 0.90805006 0.90805006]


Wall time: 18min 4s




In [49]:
# Show best parameters
print('Best parameters found:\n', grid_result.best_params_)
# Get accuracy score
score = round(grid_result.score(X_test, y_test)*100, 2)
print(f"Accuracy:  {score}")

Best parameters found:
 {'C': 200, 'loss': 'hinge', 'multi_class': 'crammer_singer', 'penalty': 'l1'}
Accuracy:  92.38


### Support Vector Classifier (SVC)

In [46]:
np.logspace(0, 2, 5)

array([  1.        ,   3.16227766,  10.        ,  31.6227766 ,
       100.        ])

In [50]:
# Load libraries
from sklearn.svm import SVC

# Create support vector classifier
svc = SVC(random_state=42, class_weight='balanced', max_iter=5000)

# Create range of candidate penalty hyperparameter values
parameter_space = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'], 
    'C': [1, 3, 10, 30, 100], 
    'shrinking': [True, False], 
    'decision_function_shape': ['ovo', 'ovr'],
}
grid = GridSearchCV(svc, parameter_space, n_jobs=-1, cv=5)

In [None]:
%%time
# Fit models
grid_result = grid.fit(X_train, y_train)

In [None]:
# Show best parameters
print('Best parameters found:\n', grid_result.best_params_)
# Get accuracy score
score = round(grid_result.score(X_test, y_test)*100, 2)
print(f"Accuracy:  {score}")

### Gaussian Naive Bayes Classifier

In [None]:
# Load libraries
from sklearn.naive_bayes import GaussianNB

# Create Gaussian naive Bayes object
nBayes = GaussianNB()

# Create range of candidate hyperparameter values
parameter_space = {
    'var_smoothing': [1e-7, 1e-8, 1e-9, 1e-10]
}
grid = GridSearchCV(nBayes, parameter_space, n_jobs=-1, cv=5)

In [None]:
%%time
# Fit models
grid_result = grid.fit(X_train, y_train)

In [None]:
# Show best parameters
print('Best parameters found:\n', grid_result.best_params_)
# Get accuracy score
score = round(grid_result.score(X_test, y_test)*100, 2)
print(f"Accuracy:  {score}")

### MLPClassifier

In [None]:
# Load libraries
from sklearn.neural_network import MLPClassifier

# Create classifier
mlp = MLPClassifier(max_iter=5000)

# Create range of candidate hyperparameter values
parameter_space = {
    'hidden_layer_sizes': [(rd.randint(10,100), rd.randint(10,100), rd.randint(10,100))],
    'activation': ['relu', 'tanh', 'identity', 'logistic'], 
    'solver': ['adam', 'sgd', 'lbfgs'], 
    'alpha': [0.001, 0.0001, 0.00001],
    'learning_rate': ['constant', 'adaptive', 'invscaling'],
    'early_stopping': [True],
}
rand = RandomizedSearchCV(mlp, parameter_space, random_state=42, n_iter=1000, n_jobs=-1, cv=5)

In [None]:
%%time
# Fit models
rand_result = rand.fit(X_train, y_train)

In [None]:
# Show best parameters
print('Best parameters found:\n', rand_result.best_params_)
# Get accuracy score
score = round(rand_result.score(X_test, y_test)*100, 2)
print(f"Accuracy:  {score}")

## Neural Network - Keras

In [50]:
# Load libraries
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

In [51]:
# Encode the target variables
le = LabelEncoder()
le.fit(y)

le_y_train = le.transform(y_train)
le_y_test  = le.transform(y_test)

from keras.utils.np_utils import to_categorical
cat_y_train = to_categorical(le_y_train)
cat_y_test  = to_categorical(le_y_test)

# NOTE: Add dropout? (see M5_test2) 

In [52]:
# Set the number of features
number_of_features = X.shape[1]

# Start neural network
network = Sequential()

# Add fully connected layer w/a ReLU activation function
network.add(Dense(units=100, activation='relu', 
                  input_shape=(number_of_features,)))

# Add fully connected layer w/a ReLU activation function
network.add(Dense(units=100, activation='relu'))

# Add fully connected layer w/a softmax activation function
network.add(Dense(units=7, activation='softmax'))

# Compile neural network
network.compile(loss='categorical_crossentropy', 
                optimizer='adam', 
                metrics=['accuracy'])

In [53]:
# Fit model
history = network.fit(X_train, cat_y_train, 
                      epochs=3, 
                      batch_size=100, 
                      validation_data=(X_test, cat_y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


Note: I'm not adding this one to the dictionary of methods because I had to categorize my target variable.

## Compare Accuracies

In [93]:
# Create lists of models and accuracy scores
modelName = []
score = []
for key in models_dict:
    modelName += [key]
    model = models_dict[key]
    value = model.score(X_test, y_test)*100
    score += [round(value, 2)]

# Add Keras accuracy
modelName += ['Keras']
kerasAcc = history.history['accuracy'][np.argmin(history.history['loss'])]*100
score += [round(kerasAcc, 2)]

# Create DataFrame of results
d = {'Model': modelName, 'Accuracy': score}
results = pd.DataFrame(d).sort_values(by=['Accuracy'], ascending=False)
results

Unnamed: 0,Model,Accuracy
5,SVC,93.0
6,MLP,92.92
4,LinearSVC,92.38
1,RandomForest,92.26
0,Logistic,92.21
7,Keras,92.04
2,DecisionTree,89.03
3,AdaBoost,86.29
