In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#PCA on Wine Quality Dataset by Sinan Gok https://goksinan.github.io/machine/pca-on-wine-data/

Unsupervised learning (principal component analysis)

In [None]:
# Load the semicolon-separated wine data, report its shape and preview the first rows.
nRowsRead = 1000  # row cap for the read; set to None to read the whole file
df = pd.read_csv(
    '../input/cusersmarildownloadswinecsv/wine.csv',
    sep=';',
    encoding="utf8",
    nrows=nRowsRead,
)
df.dataframeName = 'wine.csv'
nRow, nCol = len(df.index), len(df.columns)
print(f'There are {nRow} rows and {nCol} columns')
df.head()

#Slice from first column to one before the last.

In [None]:
# Features: every column from 'fixed_acidity' through 'sulphates' (label-based .loc
# slices include both end labels).
# The column just before the last one was 'alcohol', but slicing up to it did not work
# here (put down to an encoding problem), so the slice stops at 'sulphates', two
# columns before the last.

X = df.loc[:,'fixed_acidity':'sulphates']
# Target: the 'quality' column.
y = df['quality']

#Double check:

In [None]:
# Display the column labels of the feature frame to confirm the slice above.
X.columns

In [None]:
# Display the name of the target column selected into y.
y.name

In [None]:
from sklearn.preprocessing import LabelEncoder

# Impute missing values in `df`, then integer-encode its text columns.
# NOTE(review): X and y were sliced from `df` in an earlier cell, so nothing done
# here reaches them unless they are re-derived afterwards - confirm that is intended.

# Fill missing values in float columns with the column mean.
# fillna() returns a new Series, so the result must be assigned back; the previous
# version discarded it, and float NaNs fell through to the -999 fill below.
for c in df.columns:
    if df[c].dtype in ('float16', 'float32', 'float64'):
        df[c] = df[c].fillna(df[c].mean())

# Anything still missing (i.e. in non-float columns) gets the sentinel value -999.
df = df.fillna(-999)

# Label-encode every object-dtype column into integer codes.
for f in df.columns:
    if df[f].dtype == 'object':
        lbl = LabelEncoder()
        df[f] = lbl.fit_transform(list(df[f].values))

print('Labelling done.')

In [None]:
# One-hot encode any remaining categorical columns of `df`.
# NOTE(review): object columns were already label-encoded in the previous cell, so this
# presumably leaves `df` unchanged - confirm whether the step is still needed.
df = pd.get_dummies(df)

#The dataset has to be standardized (i.e. subtracting the mean and dividing by the standard deviation). Note that scikit-learn's PCA only centers the data internally; it does not scale the features to unit variance, so the standardization has to be done explicitly, as the author (Sinan Gok) did.

In [None]:
from sklearn import preprocessing

from sklearn.preprocessing import StandardScaler

# Fit a standard scaler on the feature matrix, then show the fitted estimator.
scaler = StandardScaler()
scaler.fit(X)
scaler

In [None]:
# NOTE(review): this builds a new StandardScaler that is never assigned or fitted; it
# looks like the repr printed by the previous cell pasted back in as code. The fitted
# `scaler` used by later cells is not affected by this line.
StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
# Report the per-feature statistics held by the fitted scaler, one heading per array.
print('Mean of each variable:', scaler.mean_, sep='\n')
print('\nStd of each variable:', scaler.scale_, sep='\n')

#Perform transformation

In [None]:
# Standardize the features with the fitted `scaler`.
# NOTE: X is overwritten with the transformed values, so re-running this cell on its
# own would transform already-transformed data; re-run from the slicing cell instead.

X = scaler.transform(X)

#Use PCA and take a closer look at the latent variables.

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA (default settings) on the predictor variables and project them.
pca = PCA()
pca.fit(X)
results = pca          # fit() returns the estimator itself, so both names refer to the fitted PCA
Z = pca.transform(X)   # array of latent variables (principal-component scores)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()


# Scree plot: percentage of the total variance explained by each principal component.
# Components are numbered from 1 on the x-axis so the figure matches the prose, which
# counts "the first 6" components; the figure also gets a title and axis labels so it
# can be read on its own.
explained_pct = results.explained_variance_ratio_ * 100
component_no = list(range(1, len(explained_pct) + 1))

fig, ax = plt.subplots()
ax.plot(component_no, explained_pct, marker='o')
ax.set(
    title='Scree plot',
    xlabel='Principal component',
    ylabel='Explained variance (%)',
    xticks=component_no,
)
plt.show()

The above is called a scree plot. It shows the variances explained by each latent variable. The first component explains approx. 28% of the variance in the whole dataset.

Ideally, we would like to see an elbow shape in order to decide which PCs to keep and which ones to disregard. In practice, this rarely happens. Most of the time, we use enough PCs so that they explain 95% or 99% of the variation in the data.

By examining the above figure, we can conclude that the first 6 principal components contain most of the information inside the data.
https://goksinan.github.io/machine/pca-on-wine-data/

#Interpreting the results

Once the author applied PCA, he was no longer in his familiar domain but in a different one, in which the latent variables are linear combinations of the original variables and do not represent any meaningful properties on their own. Thus, it is impossible to interpret them by themselves.

One approach is to look at the correlation between the latent variables and the original variables. If any of the original variables correlate well with the first few PCs, we can conclude that those PCs are mainly influenced by the said variables, so they must be the important ones.

Another approach is to look at the PCA coefficients. These coefficients tell how much of each original variable is used in creating the PCs. The larger the absolute value of a coefficient, the more important the related variable.
https://goksinan.github.io/machine/pca-on-wine-data/

In [None]:
# Show the PCA coefficients as a table. Per scikit-learn's PCA documentation,
# `components_` has one row per principal component and one column per input feature.
pd.DataFrame(results.components_)

In [None]:
# Preview the first ten rows of the scores on the first six principal components,
# using the hand-picked descriptive labels below as column names.
pc_labels = ['Acidity', 'Sulfides', 'More alcohol', 'Chlorides', 'More residual sugar', 'Less pH']
pd.DataFrame(Z[:, :6], columns=pc_labels).head(10)

#Predictive model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Baseline: 3-nearest-neighbours classifier fitted on all standardized features.
# NOTE(review): the model is scored on the same rows it was fitted on, so the accuracy
# printed below is an in-sample (optimistic) figure, not a generalization estimate.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, y)
pred = neigh.predict(X)

# scikit-learn metrics take (y_true, y_pred) in that order. The arguments used to be
# passed swapped, which transposed the confusion matrix (rows must be the true labels).
print('Confusion matrix:')
print(confusion_matrix(y, pred))
print('\nAccuracy:')
print(accuracy_score(y, pred))

#Using the first 6 PCs

In [None]:
# Same 3-nearest-neighbours classifier, fitted on the first six principal components only.
# NOTE(review): the model is scored on the same rows it was fitted on, so the accuracy
# printed below is an in-sample (optimistic) figure, not a generalization estimate.
Z_top6 = Z[:, :6]
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(Z_top6, y)
pred = neigh.predict(Z_top6)

# scikit-learn metrics take (y_true, y_pred) in that order. The arguments used to be
# passed swapped, which transposed the confusion matrix (rows must be the true labels).
print('Confusion matrix:')
print(confusion_matrix(y, pred))
print('\nAccuracy:')
print(accuracy_score(y, pred))

#Using 6 variables instead of 11, he achieves almost the same accuracy in his prediction.

In [None]:
#Code by Olga Belitskaya https://www.kaggle.com/olgabelitskaya/sequential-data/comments
from IPython.display import display,HTML
# Default colours, Google Fonts family names and pixel sizes used by dhtml().
c1, c2 = '#eb3434', '#eb3446'
f1, f2 = 'Akronim', 'Smokum'
fs1, fs2 = 30, 15


def dhtml(string, fontcolor=c1, font=f1, fontsize=fs1):
    """Display `string` as a styled <h1> heading in the notebook output.

    Args:
        string: Text placed inside the heading (inserted with ``%s`` formatting).
        fontcolor: CSS colour for the heading text.
        font: Font family name; it is used both in the Google Fonts ``@import``
            URL and in the heading's ``font-family`` style.
        fontsize: Font size in pixels (converted with ``str``).
    """
    fragments = [
        "<style>\n    @import 'https://fonts.googleapis.com/css?family=",
        font,
        "&effect=3d-float';</style>\n    <h1 class='font-effect-3d-float' style='font-family:",
        font,
        "; color:",
        fontcolor,
        "; font-size:",
        str(fontsize),
        "px;'>%s</h1>" % string,
    ]
    display(HTML(''.join(fragments)))
    
    
# Show the closing acknowledgement as a styled heading, using dhtml's default styling.
dhtml('Thanks Sinan Gok for the script' )