# Problem
* Choose any dataset for multiclass classification on [Kaggle](https://www.kaggle.com/) (go to "Datasets" section, choose "Filter" and enter "multiclass classification" into the "Tags" field). 
* Perform classification with a few methods. I expect you to use at least SVM (linear and rbf) and random forest.
* Try getting the best result with each of the methods. I expect you to use at least GridSearch for hyperparameters tuning.
* Try feature engineering. I expect you to use at least PCA for dimensionality reduction.
* Calculate accuracy and confusion matrix for each of the methods.
* Draw conclusions. Which method is the best? Why? If the dataset has any articles linked, compare your results with the state of the art.

# Grading criteria:
* I expect a confident usage of sklearn methods.
* I expect understanding of basics of models assessment.
* I expect you to be able to learn PCA method on your own.
* I expect the ability of succinct, cohesive, and coherent expression of your thoughts, i.e. clearly state (in a few sentences) what is the problem you are solving, what approaches do you propose, and what conclusions can be drawn regarding these approaches in the context of the problem.

Machine Learning problem solving includes several steps:
+ Training
+ Evaluation
+ Analysis

These steps are applied to each chosen method in order to evaluate it and select the right solution.

In general, the overall solution follows this structure:
+ Data exploration
+ Feature engineering 
+ Selection of metrics
+ Choosing models
+ Train/Validation/Hyperparameter tuning
+ Comparison and model selection

In [None]:
# !pip install opendatasets
from IPython.display import Markdown as md
from IPython.display import display
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np, re
import scipy.stats as sts
import seaborn as sns
import os, opendatasets as od

# Downloading

In [None]:
# One-time download of the Fashion-MNIST dataset from Kaggle. It is left
# commented out, so the './fashionmnist' directory is expected to exist already.
# od.download("https://www.kaggle.com/datasets/zalando-research/fashionmnist")
# List the files of the dataset directory (raises if the directory is missing).
os.listdir('./fashionmnist')

In [None]:
# Load the train and test splits from the CSV files of the dataset directory.
df_train = pd.read_csv('./fashionmnist/fashion-mnist_train.csv')
df_test = pd.read_csv('./fashionmnist/fashion-mnist_test.csv')

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would additionally emit "None None".
df_train.info()
df_test.info()

# Summary statistics and the first rows of both splits.
print(df_train.describe(), df_test.describe())
print(df_train.head(), df_test.head())

# Data exploration

In [None]:
# Image geometry: every sample is a flattened img_h x img_w picture.
img_w, img_h = 28, 28

# Names of the pixel feature columns ('pixel1' ... 'pixel784') and of the target column.
_xcol = [f'pixel{k}' for k in range(1, 28 * 28 + 1)]
_ycol = ['label']

# Feature matrices and label arrays of both splits.
# Selecting with the list `_ycol` keeps the labels two-dimensional, shape (n, 1).
x_train = df_train[_xcol].to_numpy()
x_test = df_test[_xcol].to_numpy()
y_train = df_train[_ycol].to_numpy()
y_test = df_test[_ycol].to_numpy()

# Report shapes and the set of classes present in each split.
print('Train:', x_train.shape, y_train.shape)
print('Test:', x_test.shape, y_test.shape)
print('Train classes:', np.unique(y_train))
print('Test classes:', np.unique(y_test))

In [None]:
# Bar charts of the number of samples per class: train split on top, test split below.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(6, 4))
for _axis, _title, _labels in zip(ax, ('Train', 'Test'), (y_train, y_test)):
    # Distinct labels of the split and how often each one occurs.
    names, counts = np.unique(_labels, return_counts=True)
    _axis.bar(names, counts)
    _axis.set_title(_title)
    _axis.set_xticks(names)
    _axis.set_ylabel('Counts')
    _axis.set_xlabel('Label')
fig.suptitle('Classes distribution')
plt.tight_layout()

In [None]:
# Show the first sample of every class, once for the train split and once for
# the test split. Each figure is titled after the split it actually displays.
for _split_x, _split_y, _split_title in (
    (x_train, y_train, 'Train data'),
    (x_test, y_test, 'Test data'),
):
    fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(6, 4))
    ax = ax.flatten()
    # Labels are stored as a column; flatten once so they can be used as a row mask.
    _flat_labels = _split_y.flatten()
    for i, iax in enumerate(ax):
        # The panel index doubles as the class label: take the first sample of that class.
        sample, label = _split_x[_flat_labels == i][0], i
        iax.imshow(sample.reshape(img_h, img_w), cmap='binary')
        iax.set_xlabel(label)
        iax.set_xticks([])
        iax.set_yticks([])
    fig.suptitle(_split_title)
    plt.tight_layout()

In [None]:
# Per-pixel mean and standard deviation over the whole training set.
mean = x_train.mean(axis=0)
std = x_train.std(axis=0)

# Draw both statistics side by side as images.
fig, ax = plt.subplots(ncols=2, figsize=(10, 4))
for _axis, _stat, _title in zip(ax, (mean, std), ("Mean", "Standard deviation")):
    # Cast to uint8 (drops the fractional part) and restore the 2-D image layout.
    _axis.imshow(_stat.astype(np.uint8).reshape(img_w, img_h), cmap='binary')
    _axis.set_title(_title)
plt.tight_layout()

In [None]:
# Per-class pixel statistics: for each label 0..9, the mean and the standard
# deviation of every pixel over the training samples of that class.
# NOTE: the loop rebinds the notebook-level names `mean` and `std`; after it
# they hold the statistics of the last class (9), not of the whole training set.
px_means=[]
px_stds=[]
for label in range(10):
    # Rows of x_train whose label equals the current class.
    label_data = x_train[y_train.flatten()==label]
    mean = np.mean(label_data, axis=0)
    std = np.std(label_data, axis=0)
    px_means.append(mean)
    px_stds.append(std)
    
# One panel per class showing the per-pixel mean image.
fig, ax = plt.subplots (nrows = 2, ncols = 5, figsize = (10,5))
ax = ax.flatten()
for i, iax in enumerate(ax):
    # Values are cast to uint8 before display, which drops the fractional part.
    iax.imshow(px_means[i].astype(np.uint8).reshape(img_w,img_h), cmap='binary')
    iax.set_xlabel(i)
fig.suptitle('Means by labels')
plt.tight_layout()

# One panel per class showing the per-pixel standard-deviation image.
fig, ax = plt.subplots (nrows = 2, ncols = 5, figsize = (10,5))
ax = ax.flatten()
for i, iax in enumerate(ax):
    iax.imshow(px_stds[i].astype(np.uint8).reshape(img_w,img_h), cmap='binary')
    iax.set_xlabel(i)
fig.suptitle('Standart deviation by labels')
plt.tight_layout()

In [None]:
from scipy import fftpack
# Compute the two-dimensional Fourier transform of an image and visualize its power spectrum.
# Each row is one class; every example takes two columns: the image and its
# log-scaled, centre-shifted power spectrum.

num_examples = 3
fig, ax = plt.subplots(nrows=10, ncols=num_examples*2, figsize=(10, 20))

# Labels are stored as a column; flatten once instead of once per class.
flat_labels = y_train.flatten()

for iclass in range(10):
    img_idxs = np.where(flat_labels == iclass)[0]

    # Title the row once, on a central column. The column index is derived from
    # num_examples so that it stays inside the grid for any num_examples >= 1.
    ax[iclass][num_examples].set_title(str(iclass))

    for j, i in enumerate(img_idxs[:num_examples]):
        image = x_train[i].reshape(img_w, img_h)
        img_fft = fftpack.fft2(image)
        # Move the zero-frequency component to the centre of the spectrum.
        img_fft_shift = fftpack.fftshift(img_fft)
        power_spectrum = np.abs(img_fft_shift) ** 2

        ax[iclass][j*2].imshow(image, cmap='binary')
        # Log scale, otherwise the largest coefficients dominate the picture.
        ax[iclass][j*2+1].imshow(np.log10(power_spectrum), cmap='gray')
        ax[iclass][j*2].axis('off')
        ax[iclass][j*2+1].axis('off')
plt.tight_layout()

# Feature engineering

In [None]:
from sklearn.preprocessing import Normalizer
# Normalize: scikit-learn's Normalizer rescales every sample (row) independently
# to unit norm (L2 with the default settings). This is per-image scaling, not
# per-pixel standardization across the dataset.
_norm = Normalizer()
x_train = _norm.fit_transform(x_train)
x_test = _norm.transform(x_test)
# Print the resulting value range of the training features as a sanity check.
print(x_train.min(), x_train.max())

In [None]:
# First training sample of every class, drawn from the normalized features.
fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(6, 4))
ax = ax.flatten()
for i, iax in enumerate(ax):
    # The panel index doubles as the class label.
    label = i
    _class_mask = y_train.flatten() == label
    sample = x_train[_class_mask][0]
    iax.imshow(sample.reshape(img_h, img_w), cmap='binary')
    iax.set_xlabel(label)
    iax.set_xticks([])
    iax.set_yticks([])
fig.suptitle('Normalized data')
plt.tight_layout()


In [None]:
from sklearn.decomposition import PCA
# Fit a full PCA (all components) and plot the cumulative explained variance.
pca = PCA().fit(x_train)
_evr = pca.explained_variance_ratio_
plt.grid()
plt.plot(np.cumsum(_evr))
plt.xlabel('Number of components')
plt.ylabel('Explained variance')

# Candidate component counts: for each quantile q, the first component index
# whose explained-variance ratio falls below the q-quantile of all ratios.
# (Presumably the ratios are sorted in decreasing order, so that index is the
# number of components kept - verify against the PCA documentation.)
quantiles = np.arange(0.25, 0.75, 0.1)
n_components = np.zeros(len(quantiles))
for i, quantile in enumerate(quantiles):
    _threshold = np.quantile(_evr, quantile)
    n_components[i] = np.flatnonzero(_evr < _threshold).min()

In [None]:
# Compare one random training image with its PCA reconstruction for every
# candidate number of components: original | restored | absolute difference.
test_sample = np.random.randint(len(x_train))
original_ = x_train[test_sample]

fig, ax = plt.subplots(nrows=len(n_components), ncols=3, figsize=(5,8))

# NOTE: `pca` is rebound on every iteration; after the loop it is the model
# fitted for the LAST entry of `n_components`, which later cells reuse.
for i, in_components in enumerate(n_components):
    pca = PCA(int(in_components)).fit(x_train)
    transformed_ = pca.transform(original_.reshape(1,-1))
    restored_ = pca.inverse_transform(transformed_)
    # The reconstruction may leave the [0, 1] range of the normalized data.
    restored_ = np.clip(restored_, 0, 1)
    print(restored_.min(), restored_.max())
    print(transformed_.shape)
    print(f'Explained variance for n_components = {in_components}:', np.sum(pca.explained_variance_ratio_))

    ax[i][0].imshow(original_.reshape(img_w, img_h), cmap='binary')
    ax[i][0].axis('off')

    ax[i][1].imshow(restored_.reshape(img_w, img_h), cmap='binary')
    ax[i][1].axis('off')
    ax[i][1].set_title(f'Original/Restored/Difference of sample {test_sample} for n_components = {in_components}')

    # Absolute error: clipping the signed difference at 0 would hide every
    # pixel where the reconstruction is brighter than the original.
    difference_ = np.abs(original_ - restored_)
    ax[i][2].imshow(difference_.reshape(img_w, img_h), cmap='binary')
    ax[i][2].axis('off')

plt.tight_layout()

In [None]:
# Project both splits onto the principal components of `pca`.
# NOTE(review): `pca` here is whatever the preceding reconstruction loop fitted
# last, i.e. the model for the final entry of `n_components` - confirm that this
# is the intended dimensionality.
x_train = pca.transform(x_train)
x_test = pca.transform(x_test)

# Classifiers

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

def print_evaluate(y_test, y_pred):
    """Print the accuracy and plot the confusion matrix of a set of predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        The accuracy computed by ``accuracy_score``.
    """
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    # NOTE: precision_score / recall_score default to average='binary' and need an
    # explicit `average=` (e.g. 'macro') for multiclass labels, so they are not
    # reported here.
    print('Accuracy', accuracy)

    # Confusion-matrix cells are integer counts: format them with 'd' so that
    # large counts are not shown in scientific notation ('.3g' does so from 1000 up).
    heat_ax = sns.heatmap(cm, annot=True, cmap='coolwarm', fmt='d')
    # confusion_matrix puts the true labels on rows and the predictions on columns.
    heat_ax.set_xlabel('Predicted label')
    heat_ax.set_ylabel('True label')
    plt.show()

    return accuracy

In [None]:
# Container for evaluation results.
# NOTE(review): presumably meant to map a model name to its score - confirm.
results = {}

## SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Kernels compared by the grid search.
kernels = ['rbf', 'sigmoid', 'linear']

param_grid = {
    'kernel': kernels,
    # Further hyperparameters, currently disabled:
    # 'C': [0.1, 1, 10, 100],
    # 'gamma': ['scale', 'auto'],
}

# Cross-validated search over the grid, scored by accuracy, using all CPU cores.
# SVC expects a 1-D label vector, hence ravel().
grid_search = GridSearchCV(SVC(), param_grid, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train.ravel())
print('best param.:', grid_search.best_params_)

# GridSearchCV refits the best parameter combination on the whole training set
# by default (refit=True), so the fitted model is reused instead of training a
# second, identical SVC from scratch.
final_model_svc = grid_search.best_estimator_
y_pred = final_model_svc.predict(x_test)

# Keep the test accuracy so that the models can be compared later.
results['svc'] = print_evaluate(y_test, y_pred)

## Random Forest

## CNN