# Feature Extraction

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive/ (force_remount=True is passed to the mount call)
drive.mount("/content/drive/", force_remount=True)

# Define the working directory: base path used to build the data file paths in later cells
workdir = '/content/drive/MyDrive/EP - MAC0417 5768'

Mounted at /content/drive/


In [2]:
from joblib import load
import pandas as pd
import matplotlib.pyplot as plt
from skimage.measure import label, regionprops, regionprops_table

# Metadata table; its 'Objeto' column supplies the class of each image (looked up by position).
meta_set = pd.read_csv(f'{workdir}/classification_dataset.csv')

# Sequence of images to measure, loaded with joblib.
labels = load(f'{workdir}/labels.pkl')

# Region properties measured for every connected component of every image.
# NOTE(review): 'centroid' is expected to expand into one column per image axis
# ('centroid-0', 'centroid-1' for 2-D input) - confirm against the produced table.
# NOTE(review): the 'intensity_*' spellings presumably need scikit-image >= 0.19
# (older releases used 'max_intensity', 'min_intensity', 'mean_intensity'); an
# AttributeError raised by this cell may come from that - confirm the installed version.
region_properties = (
    'area',
    'extent',
    'perimeter',
    'solidity',
    'centroid',
    'orientation',
    'intensity_max',
    'intensity_min',
    'intensity_mean'
)

# Build one small frame per image and concatenate once after the loop.
# Calling pd.concat inside the loop would re-copy every previous row on each iteration.
frames = []
for idx, img in enumerate(labels):

    # Scalars ('idx', 'class') are broadcast over the per-region arrays, so an
    # image with k regions contributes k rows sharing the same idx and class.
    rowset = {'idx': idx, 'class': meta_set['Objeto'].iloc[idx]}
    rowset.update(
        regionprops_table(
            label(img),
            intensity_image=img,
            properties=region_properties
        )
    )
    frames.append(pd.DataFrame(rowset))

# pd.concat rejects an empty list, so fall back to an empty table with the key columns.
if frames:
    dataset = pd.concat(frames)
else:
    dataset = pd.DataFrame(columns=['idx', 'class'])

# this is the dataset with features, indexed by the source image position
dataset = dataset.set_index('idx')

AttributeError: ignored

In [None]:
# Prepare the model inputs: feature matrix X and integer-encoded class labels y
from sklearn.preprocessing import LabelEncoder

# Encode the string class names as integers for the classifier.
lenc = LabelEncoder()

# Every column except the target is used as a feature.
features = [x for x in dataset.columns if x != 'class']

X = dataset[features]
y = lenc.fit_transform(dataset['class'])

# Preview the first 100 rows WITHOUT rebinding `dataset`: reassigning the
# truncated slice would make a second run of this cell build X and y from
# only 100 rows.
print(dataset[:100])


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def filter_df_column_by_list(df, column, values):
  """Return the rows of `df` whose `column` entry is one of `values`.

  Args:
    df: DataFrame to filter.
    column: name of the column to test.
    values: list-like of accepted entries for that column.

  Returns:
    The matching rows, with their original index labels and column order.
  """
  keep = df[column].isin(values)
  return df[keep]

def grid_to_df(grid):
  """Turn a search object's `cv_results_` into a DataFrame with one extra column per parameter.

  Args:
    grid: object exposing a `cv_results_` mapping that contains a 'params'
      entry holding one dict per evaluated candidate.

  Returns:
    DataFrame with every `cv_results_` column followed by the columns obtained
    by expanding each 'params' dict (one column per key).
  """
  results = pd.DataFrame(grid.cv_results_)
  # Expand each params dict into its own row of columns, aligned on the same index.
  expanded_params = results['params'].apply(pd.Series)
  return pd.concat([results, expanded_params], axis=1)

def plot_parameters(grid, parameters):
  """Boxplot the mean cross-validation test score against each varied hyper-parameter.

  One subplot is drawn for every parameter in `parameters` that has more than
  one candidate value; single-valued parameters are skipped.

  Args:
    grid: fitted search object exposing `cv_results_` (converted via grid_to_df).
    parameters: mapping of parameter name -> sized collection of candidate values.
  """
  columns = 3

  # Only parameters with several candidates get a subplot, so the grid of axes
  # is sized from that count (ceiling division) rather than from every entry of
  # `parameters`; otherwise the figure gains empty rows.
  varying = [p for p in parameters.keys() if len(parameters[p]) > 1]
  lines = max(1, -(-len(varying) // columns))

  # NOTE: this changes the global default figure size for later plots as well.
  plt.rcParams['figure.figsize'] = (5*columns, 5*lines)

  # Keep only the result rows whose parameter values belong to the requested grid.
  df = grid_to_df(grid)
  for p in parameters.keys():
    df = filter_df_column_by_list(df, p, parameters[p])

  for counter, p in enumerate(varying, start=1):
    plt.subplot(lines, columns, counter)
    sns.boxplot(x = df[p], y=df['mean_test_score'])
  plt.tight_layout()
  plt.show()

def train_svm(xtrain, xtest, ytrain, ytest, parameters):
  """Grid-search a class-balanced LinearSVC and report how it scores on the test split.

  Args:
    xtrain, ytrain: data the grid search is fitted on.
    xtest, ytest: data used only for the final printed score.
    parameters: parameter grid handed to GridSearchCV.

  Returns:
    The fitted GridSearchCV object.
  """
  grid = GridSearchCV(
      LinearSVC(class_weight = 'balanced', random_state = 12, max_iter = 1500),
      parameters,
  )
  print(grid)
  grid.fit(xtrain, ytrain)

  best_params = grid.best_params_
  print("Best model:", best_params)

  test_score = grid.score(xtest, ytest)
  print("Score:", test_score)
  return grid

# Hyper-parameter grid for the LinearSVC search.
# Every value must be a real sequence: ('l2') is just the string 'l2' and
# (True) is just the bool True, so single candidates are written as
# one-element tuples with a trailing comma.
parameters_svm = {
    # wider alternative: 'penalty' : ('l1', 'l2'),
    'penalty' : ('l2',),
    'loss' : ('hinge', 'squared_hinge'),
    # wider alternative: 'dual' : (True, False),
    'dual' : (True,),
    'tol' : [1e-2, 1e-4, 1e-6, 1e-8],
    # wider alternative: 'C' : [1, 2, 4, 8],
    'C' : [1],
    'fit_intercept' : (True, False),
    'intercept_scaling' : [1, 2, 4]
}

# Stratified 70/30 split. random_state pins the shuffle so the split, and
# therefore the scores printed below, are reproducible across runs
# (same seed value as the LinearSVC built in train_svm).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size = .7, test_size = .3, stratify=y, random_state = 12
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Run the grid search, then plot the score distribution per varied parameter.
grid_svm = train_svm(X_train, X_test, y_train, y_test, parameters_svm)
plot_parameters(grid_svm, parameters_svm)