# Intelligent Systems
# Exercise 04 / Representation
### Working Group: Intelligent Systems
### Lecture: Prof. Dr.-Ing. habil.  Sven Tomforde
### Exercise: Ma. Sc. Simon Reichhuber (mail to: [Simon.Reichhuber@informatik.uni-kiel.de](mailto:Simon.Reichhuber@informatik.uni-kiel.de))

# Exercise 04 - Task 3

Implement the PCA procedure with the help of numpy. Compare your results with the results of sklearn's PCA.

In [None]:
from numpy import array
from numpy import mean
from numpy import cov
from numpy.linalg import eig

import pandas as pd
from pandas.plotting import parallel_coordinates
import numpy as np
from sklearn import preprocessing
from sklearn import datasets
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

---

In [None]:
def make_3D_plot(dataFrame, x_label="sepal_len", y_label="sepal_wid", z_label="petal_len"):
    """Draw a 3D scatter plot of three feature columns, coloured by class.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain a 'class' column plus the three columns named by
        ``x_label``, ``y_label`` and ``z_label``.
    x_label, y_label, z_label : str
        Columns plotted on the x, y and z axis; also used as axis labels.

    Raises
    ------
    KeyError
        If a value of the 'class' column has no entry in the colour table.
    """
    colors = {"Iris-setosa": 'blue', "Iris-versicolor": "red", "Iris-virginica": 'yellow'}

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, projection='3d')

    # One plot call per class so that each class gets its own legend entry.
    for key, group in dataFrame.groupby('class'):
        ax.plot(group[x_label], group[y_label], group[z_label], 'o',
                markersize=8, color=colors[key], alpha=0.5, label=key)

    ax.set_title('Iris Dataset')
    # Pass the font size to this legend only instead of writing to
    # plt.rcParams, which would change the global setting for every figure
    # created afterwards.
    ax.legend(loc='upper right', fontsize=10)

    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_zlabel(z_label)

    plt.show()

In [None]:
def make_2D_plot(dataFrame):
    """Scatter the first column of *dataFrame* against its second column.

    The rows are grouped by the 'class' column and every group is drawn
    onto the same axes in its own colour, labelled with the class name.
    """
    palette = {"Iris-setosa":'blue', "Iris-versicolor":"red", "Iris-virginica":'yellow'}
    _, axes = plt.subplots()
    columnNames = dataFrame.columns
    for className, members in dataFrame.groupby('class'):
        members.plot(
            ax=axes,
            kind='scatter',
            x=columnNames[0],
            y=columnNames[1],
            label=className,
            color=palette[className],
        )
    plt.show()

## Import Iris Dataset 

You can use the following link to download the data: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data

In [None]:
# Read the Iris table straight from the UCI repository. No header row is
# parsed (header=None), so the column names are supplied explicitly.
irisData = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    sep=',',
    header=None,
    names=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class'],
)

In [None]:
# Bare expression at the end of a cell: shows the loaded table as cell output.
irisData

## Plot Data to get an overview

In [None]:
# Overview 1: parallel-coordinates plot of all feature columns, with the
# lines coloured by the 'class' column.
plt.figure()
parallel_coordinates(irisData, 'class', colormap="plasma")
# Overview 2: 3D scatter using make_3D_plot's default columns
# (sepal_len, sepal_wid, petal_len).
make_3D_plot(irisData)
# NOTE(review): make_3D_plot already calls plt.show() itself, so this
# trailing call looks redundant — confirm before removing.
plt.show()

---

## Step 1: Standardize Data 

In [None]:
# Only the first four columns are measurements; the trailing "class" column
# is a label, so it is left out of the scaling and re-attached afterwards.
featureNames = irisData.columns[:4]
standardScaler = preprocessing.StandardScaler()
standardizedData = standardScaler.fit_transform(irisData.iloc[:, :4])

irisStandardizedData = pd.DataFrame(data=standardizedData, columns=featureNames)
irisStandardizedData["class"] = irisData["class"]

In [None]:
# Bare expression at the end of a cell: shows the standardized table as cell output.
irisStandardizedData

In [None]:
# Plain NumPy matrix of the four standardized feature columns (one row per
# sample); the "class" label column is excluded.
irisValues = irisStandardizedData.iloc[:, :4].to_numpy()

## Step 2: Create Covariance Matrix

In [None]:
# Covariance between the feature columns. rowvar=False tells numpy.cov that
# each column (not each row) is one variable, which replaces the explicit
# transpose of the data matrix.
covarianceMatrix = cov(irisValues, rowvar=False)
covarianceMatrix

## Step 3: Find the Eigenvectors and Eigenvalues of the Covariance Matrix

In [None]:
# Eigendecomposition of the covariance matrix.
eigenValues, eigenVectors = eig(covarianceMatrix)

# numpy.linalg.eig does not promise any particular order of the eigenvalues,
# but the later steps take the leading columns of eigenVectors as the
# principal components. Sort the (value, vector) pairs by descending
# eigenvalue so that column i always belongs to the i-th largest eigenvalue.
descendingOrder = np.argsort(eigenValues)[::-1]
eigenValues = eigenValues[descendingOrder]
eigenVectors = eigenVectors[:, descendingOrder]

print(eigenValues)
print(eigenVectors)

### Plot Eigenvalue Distribution to find vectors with the highest variance

In [None]:
# Scree plot: share of the total variance carried by each principal component
# (bars) together with the cumulative share (dots).

# Largest eigenvalue first, independent of the order eig returned them in.
sortedEigenValues = np.sort(eigenValues)[::-1]

# Normalize by the sum of the eigenvalues (the total variance) instead of the
# number of features: the eigenvalues of a sample covariance matrix need not
# add up to the feature count, so only this makes the shares total exactly 1.
performance = sortedEigenValues / sortedEigenValues.sum()

# Bar positions / tick labels 1..k are derived from the data rather than
# hard-coded for four features.
y_pos = list(range(1, len(sortedEigenValues) + 1))
objects = list(y_pos)

plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.scatter(x=y_pos, y=np.cumsum(performance))
plt.ylabel('Variance')
plt.title('Variance of PCs')

plt.show()

## Step 4: Project Original Data onto new PC coordinates (new feature space)

In [None]:
# Keep the eigenvectors that belong to the two largest eigenvalues. They are
# selected by eigenvalue rank rather than by column position because
# numpy.linalg.eig does not guarantee any ordering of its output.
topTwo = np.argsort(eigenValues)[::-1][:2]
pc_Matrix = eigenVectors[:, topTwo]
print(eigenVectors)
print("----------")
print(pc_Matrix)

In [None]:
# Project every standardized sample (row) onto the selected principal axes
# (columns of pc_Matrix).
transformedData = np.dot(irisValues, pc_Matrix)
# Same result as pc_Matrix.T.dot(irisValues.T).T

### Plot new data and feature space

In [None]:
# Name the two projected coordinates and re-attach the class labels so the
# scatter plot can colour the points per class.
transformedFrame = pd.DataFrame(
    {
        "PC1": transformedData[:, 0],
        "PC2": transformedData[:, 1],
        "class": irisData["class"],
    }
)

make_2D_plot(transformedFrame)

---

## Step 0: Do it with sklearn :)

In [None]:
from sklearn.decomposition import PCA as sklearnPCA

# Reference result: let scikit-learn reduce the same standardized matrix to
# two components, for comparison with the manual projection.
pca = sklearnPCA(n_components=2)
pcaTransformedData = pca.fit_transform(X=irisValues)

In [None]:
# Same labelling as for the manual result: two component columns plus the
# class labels, then the 2D scatter plot.
pcaTransformedFrame = pd.DataFrame(
    {
        "PC1": pcaTransformedData[:, 0],
        "PC2": pcaTransformedData[:, 1],
        "class": irisData["class"],
    }
)

make_2D_plot(pcaTransformedFrame)

In [None]:
# Print the fitted estimator object's text representation.
print(pca)