In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import SparsePCA

df = pd.read_csv('winequality-red-reduced.csv')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,chlorides,free sulfur dioxide,total sulfur dioxide,pH,sulphates,alcohol,quality
0,7.4,0.7,0.076,11.0,34.0,3.51,0.56,9.4,5
1,7.8,0.88,0.098,25.0,67.0,3.2,0.68,9.8,5
2,7.8,0.76,0.092,15.0,54.0,3.26,0.65,9.8,5
3,11.2,0.28,0.075,17.0,60.0,3.16,0.58,9.8,6
4,7.4,0.7,0.076,11.0,34.0,3.51,0.56,9.4,5


# Using Sparce principal component analysis :
### based on 3 latent factors namely content, properties and quality

In [8]:
transformer = SparsePCA(n_components=3, alpha=5, random_state=42)
transformer.fit(df)
# Fitting the data to the sparse PCA model
transformer.components_
# plt.bar(range(1,4), X_transformed, color='red')
# plt.title("PCA visualized")
# plt.ylabel("Explained variance ratio")
# plt.xlabel("Principal Component")
# plt.xticks([1,2,3])
# plt.show()

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  9.99995592e-01,  0.00000000e+00,
         0.00000000e+00, -2.87406144e-03, -7.46111426e-04],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

### Since standard scaler is not used the values obtained cannot clearly explain the degree of influence on the features.
### Here the only noticable inference we can take from this is, it indicates the features of interest

In [11]:
from sklearn.preprocessing import StandardScaler


df = pd.read_csv('winequality-red-reduced.csv')
scaler = StandardScaler(with_mean=0, with_std=1)
scaler.fit(df)
df = scaler.transform(df)
scaler.mean_
df

array([[ 4.25152529,  3.9105336 ,  1.61528287, ...,  3.30473231,
         8.8235214 ,  6.19335498],
       [ 4.48133747,  4.91609938,  2.08286475, ...,  4.01288923,
         9.1989904 ,  6.19335498],
       [ 4.48133747,  4.24572219,  1.95534242, ...,  3.83585   ,
         9.1989904 ,  6.19335498],
       ...,
       [ 3.6195418 ,  2.84910305,  1.61528287, ...,  4.42598077,
        10.32539739,  7.43202598],
       [ 3.38972962,  3.60327739,  1.59402915, ...,  4.18992846,
         9.57445939,  6.19335498],
       [ 3.44718267,  1.73180774,  1.42399937, ...,  3.89486308,
        10.32539739,  7.43202598]])

# Standard scaler:
### Scales the features to have mean 0 and variance 1 to give it a feel and properties of "standard" normal distribution

In [10]:
transformer = SparsePCA(n_components=3, alpha=5, random_state=42)
transformer.fit(df)

transformer.components_

array([[-0.6099484 ,  0.13974952, -0.28749865,  0.        ,  0.        ,
         0.66393343, -0.25644816,  0.13857955,  0.        ],
       [ 0.        ,  0.45608769,  0.07217093,  0.        ,  0.        ,
         0.        , -0.19758447, -0.56528422, -0.6543619 ],
       [ 0.        ,  0.        ,  0.        ,  0.70710678,  0.70710678,
         0.        ,  0.        ,  0.        ,  0.        ]])

### The data obtained clearly captures the relationship between the features and we are able to distinguish the magnitude to which the latent variable has dependence on that feature better than when done without scaling.

### This also aligns with the colleagues theory that the non quality aspects of the dataset can be modelled using the three latent variables that is; content, properties and quality. Since we can observe from the components attribute that all the features have a dependency to atleast one latent variable i support the colleagues suggestion.