# We will be working with the wine dataset distributed as part of sklearn

In [26]:
import numpy as np

from sklearn.datasets import load_wine

wdata = load_wine()

In [27]:
type(wdata)  # dataset is structured as a "Bunch"

sklearn.utils.Bunch

In [28]:
wdata.keys() # column names

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

In [29]:
type(wdata.data)

pandas.core.frame.DataFrame

In [30]:
wdata.data.shape # number of samples by number of features

(178, 13)

In [31]:
wdata.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [32]:
list(wdata.target_names)

['class_0', 'class_1', 'class_2']

In [33]:
wdata.target # class 0, 1 and 2

0      0
1      0
2      0
3      0
4      0
      ..
173    2
174    2
175    2
176    2
177    2
Name: target, Length: 178, dtype: int64

In [34]:
wdata.data[0]

KeyError: 0

In [10]:
wdata.data[0,0]

14.23

# Normalization example

* MinMax scaling is a commonly used method of normalizing data 

* MinMax scaling transforms the dataset to values that are between 0 and 1

In [11]:
from sklearn import preprocessing

minmax_scaler = preprocessing.MinMaxScaler()

In [15]:
scaled_wdata = minmax_scaler.fit_transform(wdata.data)

In [16]:
print(f'min changed from {np.min(wdata.data)} to {np.min(scaled_wdata)}, max changed from {np.max(wdata.data)} to {np.max(scaled_wdata)}')

min changed from 0.13 to 0.0, max changed from 1680.0 to 1.0


In [17]:
print(f'mean changed from {np.mean(wdata.data)} to {np.mean(scaled_wdata)}, and std changed from {np.std(wdata.data)} to {np.std(scaled_wdata)}')

mean changed from 69.13366292091617 to 0.4084913360554177, and std changed from 215.74620420485243 to 0.21688619487069796


# Standard Scaler

* Also very widely used
* Transforms the dataset to have the same properties as a normal distribution with 0 mean and standard deviation of 1

In [17]:
standard_scaler = preprocessing.StandardScaler()

In [18]:
st_wdata = standard_scaler.fit_transform(wdata.data)

In [19]:
print(f'min changed from {np.min(wdata.data)} to {np.min(st_wdata)}, max changed from {np.max(wdata.data)} to {np.max(st_wdata)}')

min changed from 0.13 to -3.6791622340370145, max changed from 1680.0 to 4.371372139554767


In [20]:
print(f'mean changed from {np.mean(wdata.data)} to {np.mean(st_wdata)}, and std changed from {np.std(wdata.data)} to {np.std(st_wdata)}')

mean changed from 69.13366292091617 to 4.66735072755122e-16, and std changed from 215.74620420485243 to 1.0


# Exercise 1

* Using the formula for min-max scaling, implement it yourself (as opposed to using the sklearn MinMaxScaler) and apply to the wine data
* Using the formula for standardization, implement it yourself (as opposed to using the sklearn StandardScaler) and apply to the wine data


# Exercise 2

* Use PCA to reduce the dimensionality of the wine dataset
* How many components do you need to capture 90% of the variability of the data if you first standardize the data vs. if you don't standardize the data prior to applying PCA?

# Dimensionality Reduction 

### As a starting point, take a look at the contribution of the max number of components

In [34]:
from sklearn.decomposition import PCA

In [35]:
nfeatures = wdata.data.shape[1]
pca_model = PCA(n_components=nfeatures) # return all components (no dimensionality reduction)