# **Machine Learning from Data**

## Lab 2: Feature selection using PCA and MDA


2021 - 2024 Veronica Vilaplana - [GPI @ IDEAI](https://imatge.upc.edu/web/) Research group

-----------------

## Part 3: The Phoneme dataset
## Classification using all the features or a manually selected subset of features

In [1]:
import pandas as pd             #import pandas with the alias pd
import numpy as np              #import numpy with the alias np
import seaborn as sns           #import seaborn with the alias sns
import scipy.stats as ss
import matplotlib.pyplot as plt #import matplotlib.pyplot with the alias plt
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

###Read dataset
Load the file "BD_phoneme.csv" to Colab

In [2]:
# Mount Google Drive inside the Colab runtime so files stored there can be read.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

ValueError: mount failed

In [None]:
# Load the phoneme dataset; header=None because the CSV has no header row.
df = pd.read_csv("drive/MyDrive/MLEARN/Lab/Lab2/BD_phoneme.csv", header=None)
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly: wrapping it in print() appended a stray "None" line to the output.
df.info()

In [None]:
# Preview the first five rows of the raw table.
df.head(n=5)

### Prepare data: center each feature by subtracting its mean

In [None]:
# Problem dimensions used by the following cells
nclass = 5
nfft = 256

# Columns 0..255 hold the features, column 256 holds the class label
X = df.iloc[:, 0:256].to_numpy(copy=True)
print(X.shape)
y = df.iloc[:, 256].to_numpy(copy=True)
print(y.shape)

# Centre the data: the per-feature mean (a vector with one entry per column)
# is broadcast against every row of X and subtracted.
# NOTE(review): the mean is computed on the whole dataset, i.e. before the
# train/test split done further below - confirm this is intended.
Xm = np.mean(X, axis=0)
X = np.subtract(X, Xm)

### Plot spectrum

In [None]:
# Class names and the colour used for each class; class number idc (0-based)
# is selected below with the label value idc+1 in y.
classes = ['aa','ao','dc','iy','sh']
colors = ['r','g','b','c','m']

# Frequency axis: nfft points spaced fmax/nfft apart, starting at 0
fmax = 8*nfft/256
freqax = np.arange(nfft)*fmax/nfft

# Per-class mean of the (centred) features; reused by the next plotting cell
xdm = np.zeros((nclass,nfft))

fig, ax = plt.subplots(figsize=(10,10), nrows= 3, ncols=2)
fig.suptitle('Log-periodograms', fontsize=14)
fig.subplots_adjust(hspace=0.4, wspace=0.2)
panels = ax.ravel()   # row-major order, same numbering as plt.subplot(3,2,k)

# One panel per class showing every sample of that class
for idc, nc in enumerate(classes):
  index = y== (idc+1)
  xd = X[index,0:nfft]
  xdm[idc,:] = xd.mean(axis=0)
  panel = panels[idc]
  if len(xd) > 0:
    # A 2D y is drawn column by column, so one call plots all the samples
    # (same lines and colour cycle as one plot call per sample).
    panel.plot(freqax, xd.T)
  # Labels are set once per panel instead of once per plotted sample
  panel.set_xlabel('frec(kHz)', fontsize=8)
  panel.set_ylabel('class ' + nc)

# Last used panel: the mean of every class, one colour per class
mean_panel = panels[nclass]
for idc, nc in enumerate(classes):
  mean_panel.plot(np.arange(nfft), xdm[idc,:], color=colors[idc])
mean_panel.set_xlabel('features', fontsize=8)
mean_panel.set_ylabel('average')


plt.show()

In [None]:
# Enlarged view of the per-class mean curves with a dense grid, so the feature
# indices where the classes differ can be read off the x axis.
fig, ax = plt.subplots(figsize=(20,10), nrows= 1, ncols=1)
for idc, nc in enumerate(classes):
  # label each curve with its class name so the legend identifies the colours
  ax.plot(np.arange(nfft), xdm[idc,:], color=colors[idc], label=nc)

# Axis decoration does not depend on the class: apply it once, after the loop
ax.set_xlabel('features', fontsize=8)
ax.set_ylabel('average')
ax.set_xticks(np.arange(0, nfft, 5.0))
ax.grid(color='grey', linestyle='dashed', linewidth=1)
ax.legend()
plt.show()

###Feature selection
Initially select the first 64 features

In [None]:
# Indices of the features to keep (to start with, the first 64)
V_coor = np.arange(64)
nfeat = V_coor.size
print(nfeat)

# Keep only the chosen columns.
# NOTE: X is overwritten here, so the columns dropped are no longer available
# in X after this cell has run.
X = X[:, V_coor]
print(X.shape)

###Dataset partition into training and test sets

In scikit-learn a random split into training and test sets can be computed with the `train_test_split` helper function.
We define the `train_size`, a value between 0.0 and 1.0 that gives the proportion of the dataset to include in the train split. The `test_size` value is set to the complement of the train size. `shuffle` is a boolean indicating whether or not to shuffle the data before splitting. If `stratify` is not None, the data is split in a stratified fashion, using this array as the class labels.

In [None]:
# Partition the feature vectors and their labels into a training and a test subset.
# The samples are shuffled first and the split is stratified on y, so both
# subsets keep the class proportions; the fixed random_state makes it repeatable.
trainsize = 0.7
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=trainsize,
    stratify=y,
    shuffle=True,
    random_state=5,
)

###LDA and QDA classifiers
We train a linear and a quadratic classifier and show error and confusion matrices on the train and on the test sets.

In [None]:
# ---- Linear Discriminant Analysis ----
# Fit on the training subset, then predict both subsets
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
ldamodel = lda.fit(X_train, y_train)
y_tpred_lda = ldamodel.predict(X_train)
y_testpred_lda = ldamodel.predict(X_test)

# Error rate = 1 - accuracy, on each subset
lda_train_error = 1. - accuracy_score(y_train, y_tpred_lda)
lda_test_error = 1. - accuracy_score(y_test, y_testpred_lda)

print('LDA train error: %f' % lda_train_error)
print('LDA train confusion matrix:')
print(confusion_matrix(y_train, y_tpred_lda))

print('LDA test error: %f' % lda_test_error)
print('LDA test confusion matrix:')
print(confusion_matrix(y_test, y_testpred_lda))

# ---- Quadratic Discriminant Analysis ----
# Same procedure with the quadratic classifier
qda = QuadraticDiscriminantAnalysis(store_covariance=True)
qdamodel = qda.fit(X_train, y_train)
y_tpred_qda = qdamodel.predict(X_train)
y_testpred_qda = qdamodel.predict(X_test)

qda_train_error = 1. - accuracy_score(y_train, y_tpred_qda)
qda_test_error = 1. - accuracy_score(y_test, y_testpred_qda)

print('QDA train error: %f' % qda_train_error)
print('QDA train confusion matrix:')
print(confusion_matrix(y_train, y_tpred_qda))

print('QDA test error: %f' % qda_test_error)
print('QDA test confusion matrix:')
print(confusion_matrix(y_test, y_testpred_qda))


###Now we repeat the previous analysis using only two features manually selected
Manually select two features, you can use the spectrum plots to try to find the most discriminative ones.

In [None]:

def report_split(model_name, split_name, y_true, y_pred):
    """Print the error rate and the confusion matrix of one model on one subset.

    Parameters
    ----------
    model_name : str
        Prefix printed in front of the results (e.g. 'LDA').
    split_name : str
        Name of the subset being evaluated (e.g. 'train' or 'test').
    y_true, y_pred : array-like
        True and predicted labels of the subset.

    Returns
    -------
    float
        The error rate, computed as 1 - accuracy.
    """
    error = 1. - accuracy_score(y_true, y_pred)
    print('%s %s error: %f' % (model_name, split_name, error))
    print('%s %s confusion matrix:' % (model_name, split_name))
    print(confusion_matrix(y_true, y_pred))
    return error


# Rebuild the full feature matrix from df (X was reduced by the earlier
# feature-selection cell), then centre it again.
X = np.array(df.iloc[:,0:256])
print(np.shape(X))
y = np.array(df.iloc[:,256])
print(np.shape(y))
# we use broadcasting for this:
# compute a vector of feature means and subtract it from each row in X
Xm = X.mean(axis=0)
X = X - Xm
nclass = 5
nfft = 256

# Indices of the two manually selected features
V_coor = [30, 140]
nfeat  = len(V_coor)
print(nfeat)
# Feature selection
X = X[:,V_coor]
print(X.shape)

# Split dataset (feature vectors and labels) into training and testing subsets
# shuffle data first, and do stratified sampling, keeping class proportions in the subsets
trainsize= 0.7
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size= trainsize, random_state=5, shuffle=True, stratify = y)

# Linear Discriminant Analysis
lda = LinearDiscriminantAnalysis(solver="svd",store_covariance=True)
ldamodel = lda.fit(X_train, y_train)
y_tpred_lda = ldamodel.predict(X_train)
y_testpred_lda = ldamodel.predict(X_test)

lda_train_error = report_split('LDA', 'train', y_train, y_tpred_lda)
lda_test_error = report_split('LDA', 'test', y_test, y_testpred_lda)

# Quadratic Discriminant Analysis
qda = QuadraticDiscriminantAnalysis(store_covariance=True)
qdamodel = qda.fit(X_train, y_train)
y_tpred_qda = qdamodel.predict(X_train)
y_testpred_qda = qdamodel.predict(X_test)

qda_train_error = report_split('QDA', 'train', y_train, y_tpred_qda)
qda_test_error = report_split('QDA', 'test', y_test, y_testpred_qda)


###Scatter plot and decision boundaries for 2 features

In [None]:
# Linear model only.
#
# lda.coef_      has shape (n_classes, n_features)
# lda.intercept_ has shape (n_classes,)
#
# For every class i the line  intercept_[i] + coef_[i] . x = 0  is drawn
# (one hyperplane per class, "one vs all").
fig, ax = plt.subplots(figsize=(8,12))

nclass = 5
classes = ['aa','ao','dc','iy','sh']
colors = ['r','g','b','c','m']
markers = ['s','x','o','+','s']

# Scatter of all the samples, one colour/marker per class label found in y
for l, c, m in zip(np.unique(y), colors, markers):
    members = X[y==l]
    ax.scatter(members[:,0], members[:,1],
               c=c, marker=m, label='class %d' %l, alpha=0.3)

# Two abscissas (extremes of the first feature) are enough to draw a straight line
x1 = np.array([X[:,0].min(), X[:,0].max()])

# One line per class, in the colour of that class
for i, c in enumerate(colors):
    b = lda.intercept_[i]
    w1 = lda.coef_[i][0]
    w2 = lda.coef_[i][1]
    # solve b + w1*x + w2*y = 0 for y
    y1 = -(b+x1*w1)/w2
    ax.plot(x1, y1, color=c)

ax.set_title("Scatter plot with decision boundaries")
ax.legend()
plt.show()

###Now plot the region boundaries


In [None]:
# Decision regions of the linear (LDA) and quadratic (QDA) models, side by side

h = .1 # step size in the mesh

colors = ['r','g','b','c','m']
classes = ['aa','ao','dc','iy','sh']

# create a mesh that covers the data with a margin of 1 on every side
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
# every mesh node as a (feature 1, feature 2) row, shared by both models
grid_points = np.c_[xx.ravel(), yy.ravel()]

fig, ax = plt.subplots(figsize=(14,7),nrows=1, ncols=2)

# Same drawing steps for both classifiers: left panel LDA, right panel QDA
for panel, model, title in zip(ax, (lda, qda), ('LDA boundaries', 'QDA boundaries')):
  # Predicted class at every mesh node, reshaped back to the mesh layout
  Z = model.predict(grid_points).reshape(xx.shape)

  # Plot also the data points (all the samples in X, one colour per class).
  # Labels are 1-based, matching the values stored in y.
  for idc, nc in enumerate(classes):
    idx = y== (idc+1)
    panel.scatter(X[idx,0], X[idx,1], color = colors[idc], label='class %d' %(idc+1), alpha=0.7)

  # Put the predictions into a translucent colour plot on top of the points
  panel.contourf(xx, yy, Z, cmap=plt.cm.tab10, alpha=0.2)

  panel.set_xlim(xx.min(), xx.max())
  panel.set_ylim(yy.min(), yy.max())
  panel.set_xticks(())
  panel.set_yticks(())
  panel.set_title(title)
  # fixed location: avoids the slow 'best' placement search with many points
  panel.legend(loc='upper right')

plt.show()
