In [1]:
# Load Libraries
import pandas as pd
import numpy as np

# Set random seed
np.random.seed(42)

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


### Load Data

In [2]:
# Load data to a DataFrame
beanDF = pd.read_excel('DryBeanDataset/Dry_Bean_Dataset.xlsx')

In [29]:
# Drop least useful features
beanDF.drop(columns=['Extent', 'Solidity', 'Eccentricity', 'ShapeFactor3'], inplace=True)

## Model Selection & Evaluation
Used GridSearchCV or RandomizedSearchCV to find the best hyperparameters for each modeling type

### Split Training and Testing Data

In [35]:
# Set up features target sets
X = beanDF[beanDF.columns[:-1]]
y = beanDF.Class

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, 
                                                    random_state=42)

# Standardize Features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) 
X_test  = scaler.transform(X_test)

In [37]:
# Set up dictionary for model results
models_dict = {}

### Logistic Regression

In [38]:
# Load libraries
from sklearn.linear_model import LogisticRegression

# Create classification model
logistic = LogisticRegression(C=100000, 
                              multi_class='ovr', 
                              penalty='l2', 
                              solver='liblinear', 
                              class_weight='balanced', 
                              random_state=42, 
                              max_iter=200)

In [39]:
# Fit model
models_dict['Logistic'] = logistic.fit(X_train, y_train)

### Random Forest

In [40]:
# Load libraries
from sklearn.ensemble import RandomForestClassifier

# Create classification model
rfclassifier = RandomForestClassifier(criterion='entropy', 
                                      max_features='sqrt', 
                                      n_estimators=100, 
                                      random_state=42,
                                      class_weight='balanced', 
                                      n_jobs=-1)

In [41]:
# Fit model
models_dict['RandomForest'] = rfclassifier.fit(X_train, y_train)

### Decision Tree

In [42]:
# Load libraries
from sklearn.tree import DecisionTreeClassifier

# Create classification model
decisiontree = DecisionTreeClassifier(criterion='entropy', 
                                      max_features='sqrt', 
                                      splitter='best', 
                                      random_state=42, 
                                      class_weight='balanced')

In [43]:
# Fit model
models_dict['DecisionTree'] = decisiontree.fit(X_train, y_train)

### AdaBoost

In [44]:
# Load libraries
from sklearn.ensemble import AdaBoostClassifier

# Create classification model
adaboost = AdaBoostClassifier(algorithm='SAMME', 
                              n_estimators=100, 
                              random_state=42)

In [45]:
# Fit model
models_dict['AdaBoost'] = adaboost.fit(X_train, y_train)

### Support Vector Classifier (Linear)

In [46]:
# Load libraries
from sklearn.svm import LinearSVC

# Create classification model
linSVC = LinearSVC(C=178, 
                   loss='hinge', 
                   multi_class='crammer_singer', 
                   penalty='l1', 
                   random_state=42, 
                   dual=False, 
                   class_weight='balanced', 
                   max_iter=1000000)

In [47]:
# Fit model
models_dict['LinearSVC'] = linSVC.fit(X_train, y_train)

### Support Vector Classifier (SVC)

In [48]:
# Load libraries
from sklearn.svm import SVC

# Create classification model
svc = SVC(C=32, 
          decision_function_shape='ovo', 
          gamma='scale', 
          kernel='rbf', 
          shrinking=True, 
          random_state=42, 
          class_weight='balanced', 
          max_iter=5000)

In [49]:
# Fit model
models_dict['SVC'] = svc.fit(X_train, y_train)

### MLPClassifier

In [54]:
# Load libraries
from sklearn.neural_network import MLPClassifier

# Create classification model
mlp = MLPClassifier(max_iter=5000, 
                    activation='relu', 
                    alpha=0.00025,
                    hidden_layer_sizes=(77, 73, 54),
                    learning_rate='constant', 
                    solver='adam')

In [55]:
# Fit model
models_dict['MLP'] = mlp.fit(X_train, y_train)

## Neural Network - Keras

In [50]:
# Load libraries
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

In [51]:
# Encode the target variables
le = LabelEncoder()
le.fit(y)

le_y_train = le.transform(y_train)
le_y_test  = le.transform(y_test)

from keras.utils.np_utils import to_categorical
cat_y_train = to_categorical(le_y_train)
cat_y_test  = to_categorical(le_y_test)

# NOTE: Add dropout? (see M5_test2) 

In [52]:
# Set the number of features
number_of_features = X.shape[1]

# Start neural network
network = Sequential()

# Add fully connected layer w/a ReLU activation function
network.add(Dense(units=100, activation='relu', 
                  input_shape=(number_of_features,)))

# Add fully connected layer w/a ReLU activation function
network.add(Dense(units=100, activation='relu'))

# Add fully connected layer w/a softmax activation function
network.add(Dense(units=7, activation='softmax'))

# Compile neural network
network.compile(loss='categorical_crossentropy', 
                optimizer='adam', 
                metrics=['accuracy'])

In [53]:
# Fit model
history = network.fit(X_train, cat_y_train, 
                      epochs=3, 
                      batch_size=100, 
                      validation_data=(X_test, cat_y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


Note: I'm not adding this one to the dictionary of methods because I had to categorize my target variable.

## Compare Accuracies

In [93]:
# Create lists of models and accuracy scores
modelName = []
score = []
for key in models_dict:
    modelName += [key]
    model = models_dict[key]
    value = model.score(X_test, y_test)*100
    score += [round(value, 2)]

# Add Keras accuracy
modelName += ['Keras']
kerasAcc = history.history['accuracy'][np.argmin(history.history['loss'])]*100
score += [round(kerasAcc, 2)]

# Create DataFrame of results
d = {'Model': modelName, 'Accuracy': score}
results = pd.DataFrame(d).sort_values(by=['Accuracy'], ascending=False)
results

Unnamed: 0,Model,Accuracy
5,SVC,93.0
6,MLP,92.92
4,LinearSVC,92.38
1,RandomForest,92.26
0,Logistic,92.21
7,Keras,92.04
2,DecisionTree,89.03
3,AdaBoost,86.29
