# Feature Extraction

In [None]:
# Default working directory for local runs (the Colab cell below reassigns it).
workdir = './'

In [None]:
# On Colab, mount Google Drive and point workdir at the shared project folder.
# Outside Colab the google.colab package does not exist, so fall back to the
# local working directory instead of aborting the notebook with ImportError.
try:
    from google.colab import drive
except ImportError:
    workdir = './'
else:
    drive.mount("/content/drive/", force_remount=True)
    # set workdir to the project folder on Drive
    workdir = '/content/drive/MyDrive/EP - MAC0417 5768'

Mounted at /content/drive/


In [None]:
from joblib import load
import pandas as pd
import matplotlib.pyplot as plt
from skimage.measure import label, regionprops, regionprops_table
import numpy as np
import random

# Seed both RNGs used in this notebook so a fresh top-to-bottom run is repeatable.
SEED = 12
np.random.seed(SEED)
random.seed(SEED)

# Per-image metadata; its 'Objeto' column is used below as the class label.
meta_set = pd.read_csv(f'{workdir}/classification_dataset.csv')

# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load
# files produced by this project, never pickles from an untrusted source.
# presumably labels[i] is the mask and images[i] the intensity image for row i
# of meta_set -- TODO confirm against the notebook that wrote the pickles.
labels = load(f'{workdir}/labels.pkl')
images = load(f'{workdir}/images.pkl')

# Region descriptors requested from skimage for every labelled region.
# (The original tuple listed 'inertia_tensor_eigvals' and 'eccentricity'
# twice; the repeated entries are dropped here.)
REGION_PROPERTIES = (
    'area',
    'extent',
    'perimeter',
    'solidity',
    'centroid',
    'orientation',
    'intensity_max',
    'intensity_min',
    'intensity_mean',
    'eccentricity',
    'inertia_tensor',
    'inertia_tensor_eigvals',
    'moments_normalized',
    'moments_weighted_normalized',
    'centroid_weighted',
    'centroid_weighted_local',
    'equivalent_diameter_area',
    'euler_number',
    'feret_diameter_max',
    'perimeter_crofton',
    'axis_minor_length',
    'axis_major_length',
)

# extract features from an image
# Collect one small frame per image and concatenate once at the end: calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration.
frames = []
for idx, mask in enumerate(labels):
    props = regionprops_table(
        label(mask),
        intensity_image=images[idx],
        properties=REGION_PROPERTIES,
    )
    # The scalar idx/class values are broadcast over every region row of the image.
    frames.append(
        pd.DataFrame({'idx': idx, 'class': meta_set['Objeto'].iloc[idx], **props})
    )

# this is the dataset with features
dataset = pd.concat(frames).set_index('idx', drop=True)

In [None]:
# Place the metadata columns next to the features (rows are aligned on the index).
# Guarded so that re-running this cell does not append the metadata a second
# time, which would turn lookups such as dataset['Fundo'] into multi-column frames.
if not meta_set.columns.isin(dataset.columns).all():
    dataset = pd.concat([dataset, meta_set], axis=1)

In [None]:
# Turn the merged table into a feature matrix X and an integer target vector y.
from sklearn.preprocessing import LabelEncoder

lenc = LabelEncoder()
features = [column for column in dataset.columns if column != 'class']

datasetcopy = dataset.copy(deep=True)
# Only columns that have a value in every row are kept as features.
X = datasetcopy.loc[:, features].dropna(axis=1)
y = lenc.fit_transform(dataset['class'])

# Preview the first 100 entries of the full table, the features and the targets.
for preview in (dataset, X, y):
    print(preview[:100])


      class   area  extent  perimeter  solidity  centroid-0  centroid-1  \
0   celular  12922     1.0      502.0       1.0       130.5       127.0   
1   celular   1326     1.0      142.0       1.0       157.0       129.5   
2   celular   1326     1.0      142.0       1.0       157.0       129.5   
3   celular   1326     1.0      142.0       1.0       156.0       128.5   
4   celular   1326     1.0      142.0       1.0       157.0       129.5   
..      ...    ...     ...        ...       ...         ...         ...   
95  celular   3087     1.0      220.0       1.0       140.0       154.0   
96  celular   2132     1.0      182.0       1.0       146.5       150.0   
97  celular   2132     1.0      182.0       1.0       146.5       150.0   
98  celular   2132     1.0      182.0       1.0       145.5       149.0   
99  celular   2132     1.0      182.0       1.0       146.5       150.0   

    orientation  intensity_max  intensity_min  ...  axis_minor_length  \
0           0.0          2

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import seaborn as sns
import warnings
# Silences every Python warning from this point on.
# NOTE(review): this also hides any warning raised while the models below are
# fitted -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

def filter_df_column_by_list(df, column, values):
  """Return the rows of ``df`` whose ``column`` entry is one of ``values``.

  The original index of the surviving rows is preserved.
  """
  keep = df[column].isin(values)
  return df.loc[keep]

def grid_to_df(grid):
  """Tabulate ``grid.cv_results_`` with one extra column per hyper-parameter.

  Each dict in the 'params' column is expanded so that every parameter name
  becomes its own column, appended to the right of the original columns.
  """
  results = pd.DataFrame(grid.cv_results_)
  expanded_params = results['params'].apply(pd.Series)
  return pd.concat([results, expanded_params], axis=1)

def plot_parameters(grid, parameters):
  """Box-plot the mean CV score against every hyper-parameter that was varied.

  Parameters
  ----------
  grid : fitted search object exposing ``cv_results_``.
  parameters : dict mapping parameter name -> sequence of candidate values
      (the same dict that was given to the search).

  Only parameters with more than one candidate value get a panel. When no
  parameter was varied (e.g. a single-candidate "final" grid) there is nothing
  to compare, so the function returns without creating an empty figure.
  """
  columns = 3
  varying = [name for name, values in parameters.items() if len(values) > 1]
  if not varying:
    return

  results = grid_to_df(grid)
  for name, values in parameters.items():
    results = filter_df_column_by_list(results, name, values)

  # Size the panel grid from the number of plots actually drawn, and size this
  # figure only instead of overwriting the global rcParams default.
  rows = -(-len(varying) // columns)  # ceiling division
  fig = plt.figure(figsize=(5 * columns, 5 * rows))
  for position, name in enumerate(varying, start=1):
    ax = fig.add_subplot(rows, columns, position)
    sns.boxplot(x=results[name], y=results['mean_test_score'], ax=ax)
  fig.tight_layout()
  plt.show()

def train_svm(xtrain, xtest, ytrain, ytest, parameters):
  """Grid-search a class-balanced LinearSVC and report its held-out score.

  ``parameters`` is the param_grid handed to GridSearchCV (default CV
  settings). The search object, the best parameters and the score on
  (xtest, ytest) are printed; the fitted search is returned.
  """
  search = GridSearchCV(
      LinearSVC(class_weight='balanced', random_state=12, max_iter=1500),
      parameters,
  )
  print(search)
  search.fit(xtrain, ytrain)
  print("Best model:", search.best_params_)
  print("Score:", search.score(xtest, ytest))
  return search

# Stratified 70/30 split. random_state is fixed so the split is identical on
# every execution of this cell instead of depending on how much of the global
# NumPy random stream earlier cells have already consumed.
X_train0, X_test0, y_train, y_test = train_test_split(
    X, y, train_size=.7, test_size=.3, stratify=y, random_state=12
)

# X_train0 / X_test0 keep the metadata columns (used later to break results
# down by capture condition); the models are trained on the remaining columns.
model_columns = [
    column for column in dataset.columns
    if column not in meta_set.columns and column in X_train0
]
X_train = X_train0[model_columns]
print(X_train[:10])
X_test = X_test0[model_columns]
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

       area  extent  perimeter  solidity  centroid-0  centroid-1  orientation  \
4153  29725     1.0      696.0       1.0       144.0       129.0     0.000000   
1969  64516     1.0     1012.0       1.0       126.5       126.5     0.785398   
3603  16898     1.0      518.0       1.0       131.5       128.0     0.000000   
4501   7548     1.0      348.0       1.0       150.5        81.5     0.000000   
3301   2550     1.0      198.0       1.0       141.0       124.5     0.000000   
5022  54056     1.0      926.0       1.0       119.0       115.5     0.000000   
5192  28014     1.0      678.0       1.0       133.0       127.5     0.000000   
2787  26085     1.0      648.0       1.0       136.0       102.0     1.570796   
6925   1560     1.0      160.0       1.0       125.5       144.5     0.000000   
2952    900     1.0      146.0       1.0       104.0        86.5     1.570796   

      intensity_max  intensity_min  intensity_mean  ...  centroid_weighted-0  \
4153          196.0         

In [None]:
# Wider grid, kept for reference only (commented out):
# parameters_svm = {
#     'penalty' : ('l1', 'l2'),
#     'loss' : ('hinge', 'squared_hinge'),
#     'dual' : (True, False),
#     'tol' : [1e-2, 1e-4, 1e-6, 1e-8],
#     'C' : [1, 2, 4, 8],
#     'fit_intercept' : (True, False),
#     'intercept_scaling' : [1, 2, 4]
# }

# Grid that is actually searched; GridSearchCV evaluates the full cross-product.
# NOTE(review): LinearSVC presumably rejects some penalty/loss/dual combinations
# (e.g. 'l1' with 'hinge'); with warnings silenced such candidates would fail
# quietly -- confirm against the LinearSVC documentation.
parameters_svm = {
    'penalty' : ['l1', 'l2'],
    'loss' : ['hinge', 'squared_hinge'],
    'dual': [True, False],
    'tol' : [1e-2, 1e-4],
    'C' : [1, 2],
    'fit_intercept' : [True, False],
    'intercept_scaling' : [1, 2]
}

# Single-candidate grid (same values as parameters_svm_final in the next cell),
# kept for reference only (commented out):
# parameters_svm = {
#     'penalty' : ['l1'],
#     'loss' : ['squared_hinge'],
#     'dual': [False],
#     'tol' : [1e-4],
#     'C' : [1],
#     'fit_intercept' : [True],
#     'intercept_scaling' : [2]
# }

# Run the search, then box-plot the mean CV score per varied parameter.
grid_svm = train_svm(X_train, X_test, y_train, y_test, parameters_svm)
plot_parameters(grid_svm, parameters_svm)

In [None]:
# Single-candidate grid: exactly one value per LinearSVC hyper-parameter.
parameters_svm_final = {
    'penalty' : ['l1'],
    'loss' : ['squared_hinge'],
    'dual': [False],
    'tol' : [1e-4],
    'C' : [1],
    'fit_intercept' : [True],
    'intercept_scaling' : [2]
}

# Fit that single configuration and print its test score.
# No parameter has more than one value, so plot_parameters has no box plot to draw.
grid_svm_final = train_svm(X_train, X_test, y_train, y_test, parameters_svm_final)
plot_parameters(grid_svm_final, parameters_svm_final)

GridSearchCV(estimator=LinearSVC(class_weight='balanced', max_iter=1500,
                                 random_state=12),
             param_grid={'C': [1], 'dual': [False], 'fit_intercept': [True],
                         'intercept_scaling': [2], 'loss': ['squared_hinge'],
                         'penalty': ['l1'], 'tol': [0.0001]})
Best model: {'C': 1, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 2, 'loss': 'squared_hinge', 'penalty': 'l1', 'tol': 0.0001}
Score: 0.40754541220307405


<Figure size 432x288 with 0 Axes>

In [None]:
from sklearn.metrics import classification_report, log_loss

def printErrors(grid, X_test, X_test0, y_test):
    """Print LaTeX tables summarising a fitted model's test-set errors.

    Printed, in order: the confusion matrix, the overall classification
    report, and a table of per-class F1 scores broken down by background
    ('Fundo') and by every 'Ambiente' x 'Iluminacao' combination.

    Parameters
    ----------
    grid : fitted estimator (or search object) exposing ``predict``.
    X_test : features passed to ``grid.predict``.
    X_test0 : frame row-aligned with ``X_test`` that still carries the
        'Fundo', 'Ambiente' and 'Iluminacao' columns.
    y_test : encoded true labels, row-aligned with ``X_test``.

    Relies on the module-level ``lenc`` (the LabelEncoder fitted on the
    class names) to translate encoded labels back into names.
    """
    y_test = np.asarray(y_test)
    predictions = np.asarray(grid.predict(X_test))

    # classification_report and confusion_matrix order their rows by sorted
    # label value, which is exactly the order of lenc.classes_. Building the
    # names from pd.unique(y) (order of first appearance) attached the wrong
    # class name to every row/column of the tables.
    class_ids = np.arange(len(lenc.classes_))
    target_names = [str(name).capitalize() for name in lenc.classes_]

    def report_frame(y_true, y_pred):
        # labels= is passed explicitly so that a subset lacking some class
        # still yields one row per class (and matches len(target_names)).
        report = classification_report(
            y_true, y_pred, labels=class_ids, target_names=target_names,
            output_dict=True)
        return pd.DataFrame(report).transpose()

    def classification_report_latex(y_true, y_pred):
        report_df = report_frame(y_true, y_pred)
        report_df['support'] = report_df['support'].astype(int)
        return report_df.to_latex(float_format="%.2f")

    def f1_scores(mask):
        # F1 column for the samples selected by mask; None when the mask
        # selects nothing (no report can be computed for an empty subset).
        mask = np.asarray(mask)
        if not mask.any():
            return None
        return report_frame(y_test[mask], predictions[mask])['f1-score'].to_numpy()

    df = pd.DataFrame()
    df['Classe'] = target_names + ['Accuracy', 'Macro Avg', 'Weighted Avg']

    # One column of F1 scores per background.
    for value in pd.unique(X_test0['Fundo']):
        scores = f1_scores(X_test0['Fundo'] == value)
        if scores is not None:
            df[f'Fundo {value}'] = scores

    # One column of F1 scores per environment/lighting combination.
    for value in pd.unique(X_test0['Ambiente']):
        for value2 in pd.unique(X_test0['Iluminacao']):
            mask = np.logical_and(X_test0['Ambiente'] == value,
                                  X_test0['Iluminacao'] == value2)
            scores = f1_scores(mask)
            if scores is not None:
                df[f'{value} {value2}'] = scores

    print("\nMatriz de confusão\n")
    # Axis labels are set at construction time; set_axis(..., inplace=True)
    # is no longer accepted by current pandas releases.
    confusion = pd.DataFrame(
        confusion_matrix(y_test, predictions, labels=class_ids),
        index=target_names, columns=target_names)
    print(confusion.to_latex(float_format="%.2f"))
    print(classification_report_latex(y_test, predictions))

    print(df.to_latex(float_format="%.2f"))

    
# Print the confusion matrix, classification report and per-condition F1 table for the final SVM.
printErrors(grid_svm_final, X_test, X_test0, y_test)


Matriz de confusão

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &  Celular &  Sapato &  Chinelo &  Caneca &  Tesoura &  Livro &  Portacopo &  Garrafa &  Prato &  Chave \\
\midrule
Celular   &       82 &      12 &       40 &       1 &        6 &     11 &         31 &       13 &     11 &      9 \\
Sapato    &       17 &      32 &       35 &       3 &       15 &      7 &         21 &       30 &     13 &     30 \\
Chinelo   &        7 &       2 &      160 &       0 &        0 &     10 &          5 &       17 &      0 &     15 \\
Caneca    &       23 &       7 &       17 &       0 &       29 &     18 &         44 &       43 &     31 &      4 \\
Tesoura   &       23 &       4 &        5 &       1 &       50 &     22 &         28 &       49 &     22 &     12 \\
Livro     &       12 &       7 &       23 &       1 &        6 &    103 &          9 &       21 &     32 &      2 \\
Portacopo &        7 &       8 &       23 &       0 &        0 &      4 &        164 &        3 &      7 &      0 \\
Gar

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def train_lda(xtrain, xtest, ytrain, ytest, parameters):
  """Grid-search a LinearDiscriminantAnalysis model and report its held-out score.

  ``parameters`` is the param_grid handed to GridSearchCV (default CV
  settings). The search object, the best parameters and the score on
  (xtest, ytest) are printed; the fitted search is returned.
  """
  search = GridSearchCV(LinearDiscriminantAnalysis(), parameters)
  print(search)
  search.fit(xtrain, ytrain)
  print("Best model:", search.best_params_)
  print("Score:", search.score(xtest, ytest))
  return search

In [None]:
# LDA search space: 3 solvers x 3 tolerances = 9 candidates.
parameters_lda = {
'solver' : ['svd', 'lsqr', 'eigen'],
'tol' : [1e-2, 1e-4, 1e-6]
}

# Run the search, then box-plot the mean CV score per varied parameter.
grid_lda = train_lda(X_train, X_test, y_train, y_test, parameters_lda)
plot_parameters(grid_lda, parameters_lda)

In [None]:
# Single-candidate grid: one solver and one tolerance.
parameters_lda_final = {
'solver' : ['svd'],
'tol' : [1e-06]
}

# Fit that single configuration and print its test score.
# No parameter has more than one value, so plot_parameters has no box plot to draw.
grid_lda_final = train_lda(X_train, X_test, y_train, y_test, parameters_lda_final)
plot_parameters(grid_lda_final, parameters_lda_final)

GridSearchCV(estimator=LinearDiscriminantAnalysis(),
             param_grid={'solver': ['svd'], 'tol': [1e-06]})
Best model: {'solver': 'svd', 'tol': 1e-06}
Score: 0.46343735444806705


<Figure size 1080x360 with 0 Axes>

In [None]:
# Print the confusion matrix, classification report and per-condition F1 table for the final LDA.
printErrors(grid_lda_final, X_test, X_test0, y_test)


Matriz de confusão

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &  Celular &  Sapato &  Chinelo &  Caneca &  Tesoura &  Livro &  Portacopo &  Garrafa &  Prato &  Chave \\
\midrule
Celular   &      105 &       2 &       29 &       7 &        3 &     16 &         28 &       11 &      8 &      7 \\
Sapato    &       20 &      54 &       14 &      17 &       21 &      8 &          3 &       21 &     15 &     30 \\
Chinelo   &        5 &       1 &      161 &       2 &        0 &      6 &          9 &       17 &      0 &     15 \\
Caneca    &       20 &      11 &       11 &      40 &       32 &     20 &         17 &       38 &     25 &      2 \\
Tesoura   &       34 &       4 &        2 &      11 &       60 &     30 &          9 &       43 &     13 &     10 \\
Livro     &       17 &       2 &       13 &       4 &        9 &    120 &          2 &       22 &     24 &      3 \\
Portacopo &       14 &       2 &        5 &       4 &        1 &      6 &        178 &        4 &      2 &      0 \\
Gar

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Quadratic discriminant analysis with its default settings. Despite the
# grid_ prefix of the variable name, no grid search is run here: the
# estimator is fitted directly on the training split.
grid_qda = QuadraticDiscriminantAnalysis()
grid_qda.fit(X_train, y_train)
print("Score:", grid_qda.score(X_test, y_test))

Score: 0.4527247321844434


In [None]:
# Print the confusion matrix, classification report and per-condition F1 table for the QDA model.
printErrors(grid_qda, X_test, X_test0, y_test)


Matriz de confusão

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &  Celular &  Sapato &  Chinelo &  Caneca &  Tesoura &  Livro &  Portacopo &  Garrafa &  Prato &  Chave \\
\midrule
Celular   &       83 &       2 &        9 &       0 &        3 &      4 &         58 &       46 &      8 &      3 \\
Sapato    &        2 &     109 &        1 &       0 &        1 &      2 &         29 &       56 &      3 &      0 \\
Chinelo   &        3 &      19 &       97 &       0 &        0 &      2 &         20 &       69 &      1 &      5 \\
Caneca    &       16 &      33 &        1 &      27 &       24 &      1 &         41 &       62 &     10 &      1 \\
Tesoura   &        8 &      13 &        2 &       0 &      111 &      9 &         27 &       37 &      8 &      1 \\
Livro     &       24 &       8 &        3 &       1 &        7 &     78 &          9 &       76 &      9 &      1 \\
Portacopo &        5 &       3 &        0 &       0 &        0 &      2 &        194 &        8 &      4 &      0 \\
Gar

In [None]:
from sklearn.linear_model import LogisticRegression

def logistic_regression(xtrain, xtest, ytrain, ytest, parameters):
    """Grid-search a class-balanced multinomial LogisticRegression.

    ``parameters`` is the param_grid handed to GridSearchCV (default CV
    settings). The best parameters and the score on (xtest, ytest) are
    printed; the fitted search is returned.
    """
    search = GridSearchCV(
        LogisticRegression(class_weight='balanced', max_iter=500, multi_class='multinomial'),
        parameters,
    )
    search.fit(xtrain, ytrain)
    print("Best model:", search.best_params_)
    print("Score:", search.score(xtest, ytest))
    return search

In [None]:
# Logistic-regression search space: 4 tolerances x 7 C values x 2 intercept
# settings = 56 candidates.
# NOTE(review): the output saved with this cell is a KeyboardInterrupt, so this
# search apparently did not run to completion in the recorded session.
parameters_lr1 = {
    'tol': [1e-2, 1e-4, 1e-6, 1e-8],
    'C': [0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
    'fit_intercept': (False, True)
}

# Run the search, then box-plot the mean CV score per varied parameter.
grid_lr = logistic_regression(X_train, X_test, y_train, y_test, parameters_lr1)
plot_parameters(grid_lr, parameters_lr1)

KeyboardInterrupt: 

In [None]:
# Single-candidate grid: one value per logistic-regression hyper-parameter.
parameters_lr_final = {
    'tol': [0.01],
    'C': [1.75],
    'fit_intercept': [True]
}

# Fit that single configuration and print its test score.
# No parameter has more than one value, so plot_parameters has no box plot to draw.
grid_lr_final = logistic_regression(X_train, X_test, y_train, y_test, parameters_lr_final)
plot_parameters(grid_lr_final, parameters_lr_final)

Best model: {'C': 1.75, 'fit_intercept': True, 'tol': 0.01}
Score: 0.3055426176059618


<Figure size 1080x720 with 0 Axes>

In [None]:
# Print the confusion matrix, classification report and per-condition F1 table for the final logistic regression.
printErrors(grid_lr_final, X_test, X_test0, y_test)


Matriz de confusão

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &  Celular &  Sapato &  Chinelo &  Caneca &  Tesoura &  Livro &  Portacopo &  Garrafa &  Prato &  Chave \\
\midrule
Celular   &       63 &      45 &       19 &       4 &       23 &     13 &         19 &       21 &      4 &      5 \\
Sapato    &       31 &      47 &       39 &       2 &       12 &     10 &         38 &       15 &      0 &      9 \\
Chinelo   &        3 &      12 &      130 &       2 &        0 &     18 &         27 &        9 &      2 &     13 \\
Caneca    &       30 &      17 &       13 &       5 &       30 &     27 &         45 &       32 &     16 &      1 \\
Tesoura   &       35 &      16 &        7 &       9 &       39 &     29 &         14 &       48 &     14 &      5 \\
Livro     &       18 &       7 &       19 &       3 &        8 &    108 &          6 &       22 &     21 &      4 \\
Portacopo &       14 &      43 &       20 &       6 &        0 &      7 &         91 &        1 &      7 &     27 \\
Gar

In [None]:
from sklearn.neural_network import MLPClassifier
def neural_network(xtrain, xtest, ytrain, ytest, parameters):
  """Grid-search an MLPClassifier and report its held-out score.

  ``parameters`` is the param_grid handed to GridSearchCV, which is asked to
  use 12 parallel jobs. The best parameters and the score on (xtest, ytest)
  are printed; the fitted search is returned.
  """
  search = GridSearchCV(
      MLPClassifier(random_state=12, max_iter=100),
      parameters,
      n_jobs=12,
  )
  search.fit(xtrain, ytrain)
  print("Best model:", search.best_params_)
  print("Score:", search.score(xtest, ytest))
  return search

# Single-candidate grid: one value per MLPClassifier hyper-parameter.
# NOTE(review): 'momentum', 'nesterovs_momentum' and 'learning_rate' presumably
# only take effect with solver='sgd', not with 'adam' -- confirm against the
# MLPClassifier documentation.
parameters_nn_final = {
    'activation': ['tanh'],
    'alpha': [0.0001],
    'beta_1': [0.8],
    'beta_2': [0.9],
    'early_stopping': [True],
    'epsilon': [1e-08],
    'hidden_layer_sizes': [(100, 50, 25)],
    'learning_rate': ['constant'],
    'momentum': [0.8],
    'nesterovs_momentum': [True],
    'solver': ['adam'],
    'tol': [0.1]
}

# Fit that single configuration and print its test score.
# No parameter has more than one value, so plot_parameters has no box plot to draw.
grid_nn_final = neural_network(X_train, X_test, y_train, y_test, parameters_nn_final)
plot_parameters(grid_nn_final, parameters_nn_final)

Best model: {'activation': 'tanh', 'alpha': 0.0001, 'beta_1': 0.8, 'beta_2': 0.9, 'early_stopping': True, 'epsilon': 1e-08, 'hidden_layer_sizes': (100, 50, 25), 'learning_rate': 'constant', 'momentum': 0.8, 'nesterovs_momentum': True, 'solver': 'adam', 'tol': 0.1}
Score: 0.35165346995808106


<Figure size 1080x1800 with 0 Axes>

In [None]:
# Print the confusion matrix, classification report and per-condition F1 table for the final MLP.
printErrors(grid_nn_final, X_test, X_test0, y_test)


Matriz de confusão

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &  Celular &  Sapato &  Chinelo &  Caneca &  Tesoura &  Livro &  Portacopo &  Garrafa &  Prato &  Chave \\
\midrule
Celular   &       35 &       0 &       17 &      19 &        3 &     58 &         56 &       23 &      0 &      5 \\
Sapato    &       37 &      28 &       20 &       0 &        4 &     54 &          6 &       27 &      0 &     27 \\
Chinelo   &        4 &       4 &      108 &       0 &        0 &     64 &          6 &       17 &      0 &     13 \\
Caneca    &       13 &      13 &        8 &      19 &        0 &     66 &         36 &       60 &      0 &      1 \\
Tesoura   &       15 &       0 &        8 &      10 &       32 &     53 &         20 &       73 &      0 &      5 \\
Livro     &       18 &       0 &        7 &       9 &        3 &    141 &          1 &       37 &      0 &      0 \\
Portacopo &       12 &       1 &        9 &       4 &        1 &      2 &        176 &       11 &      0 &      0 \\
Gar