In [3]:
%load_ext autoreload
%autoreload 2

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
import numpy as np
from code.pca import customPCA

# Small 2-feature toy dataset (10 samples) used to sanity-check customPCA.
X = np.array([
    [2.5, 2.4],
    [0.5, 0.7],
    [2.2, 2.9],
    [1.9, 2.2],
    [3.1, 3.0],
    [2.3, 2.7],
    [2, 1.6],
    [1, 1.1],
    [1.5, 1.6],
    [1.1, 0.9],
])

# Fit the project's PCA implementation and project the data.
# NOTE(review): `threshold` is presumably the explained-variance fraction to
# retain — confirm against code.pca.customPCA.reduce_dim.
pca = customPCA()
X_transformed = pca.reduce_dim(X, threshold=0.99)

# Report what the fitted model exposes. Iterating over the transpose prints
# one column of `pca.components` per line.
print("Principal Components:")
for component in pca.components.T:
    print(f"{component}")
print("Principal Components Eigenvalues:\n", pca.eigenvalues)
print("Principal Components Explained Variance:\n", pca.explained_variance)
print("Transformed Data:\n", X_transformed)

In [None]:
from code.preprocessing import get_dataset
from code.pca import customPCA

# Step 1: load the dataset and append the labels as a "target" column so they
# take part in the correlation matrix computed below.
dataset = "vowel"
x, y = get_dataset(dataset)
ds = x.copy()
ds["target"] = y

# Step 2: keep the three features with the largest absolute correlation with
# the target. The target's own entry is removed by label rather than skipped
# by position, so the selection stays correct even if a feature ties with (or
# sorts ahead of) the target's self-correlation.
correlation_with_target = ds.corr()["target"]
top_features = (
    correlation_with_target
    .drop("target")
    .abs()
    .sort_values(ascending=False)
    .head(3)
    .index
    .tolist()
)
ds = ds.drop('target', axis=1)  # from here on `ds` holds the features only

# First two selected features on the axes, the third as the colour gradient.
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    ds[top_features[0]],
    ds[top_features[1]],
    c=ds[top_features[2]],
    cmap='viridis',
    alpha=0.7,
)
plt.colorbar(scatter, label=f'{top_features[2]}')
plt.xlabel(top_features[0])
plt.ylabel(top_features[1])
plt.title(f'Dataset {dataset} only representing the 3 components with more correlation with the target')
plt.show()

# Step 3 & 4 & 5 & 6 & 7 & 9: run the project's PCA on the feature matrix.
# NOTE(review): both `threshold` and `n_components` are passed; how they
# interact is decided inside customPCA.reduce_dim — confirm there.
pca = customPCA(verbose=True)
X_transformed = pca.reduce_dim(ds, threshold=0.90, n_components=3)

# Step 8: scatter plot of the projected data, with the first two output
# columns on the axes and the third as the colour gradient.
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    X_transformed[:, 0],
    X_transformed[:, 1],
    c=X_transformed[:, 2],
    cmap='viridis',  # `viridis` is a colour map, change as needed
    alpha=0.7,
)

# Colour bar so the gradient of the third column can be read off.
plt.colorbar(scatter, label='Third Dimension (Color Gradient)')

plt.xlabel('First Component (X)')
plt.ylabel('Second Component (Y)')
plt.title(f'Dataset {dataset} after applying PCA and leaving 3 principal components')
plt.show()


In [None]:
# Re-fit the project's PCA quietly on the feature frame `ds` left by the
# previous cell, then map the projection back to the original feature space.
pca = customPCA(verbose=False)
X_transformed = pca.reduce_dim(ds, threshold=0.90, n_components=3)

# Store the fitted mean as a plain NumPy array before reconstructing.
# np.asarray accepts both a pandas Series and an ndarray, so this line keeps
# working whichever type customPCA stores, whereas `.values` exists only on
# pandas objects.
pca.mean = np.asarray(pca.mean)

X_reconstructed = pca.reconstruct(X_transformed)

print(X_reconstructed.shape)

In [None]:
from sklearn import decomposition
# Step 1: load the dataset and append the labels as a "target" column so they
# take part in the correlation matrix computed below.
dataset = "sick"
x, y = get_dataset(dataset)
ds = x.copy()
ds["target"] = y

# Step 2: keep the three features with the largest absolute correlation with
# the target. The target's own entry is removed by label rather than skipped
# by position, so the selection stays correct even if a feature ties with (or
# sorts ahead of) the target's self-correlation.
correlation_with_target = ds.corr()["target"]
top_features = (
    correlation_with_target
    .drop("target")
    .abs()
    .sort_values(ascending=False)
    .head(3)
    .index
    .tolist()
)

ds = ds.drop('target', axis=1)  # from here on `ds` holds the features only
# Positional indices of the selected columns; needed later because the
# reconstructed data is indexed by position, not by column label.
feature_indices = [ds.columns.get_loc(feature) for feature in top_features]

# Original data: first two selected features on the axes, third as colour.
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    ds.iloc[:, feature_indices[0]],
    ds.iloc[:, feature_indices[1]],
    c=ds.iloc[:, feature_indices[2]],
    cmap='viridis',
    alpha=0.7
)
plt.colorbar(scatter, label=f'{top_features[2]}')
plt.xlabel(top_features[0])
plt.ylabel(top_features[1])
plt.title(f'Dataset {dataset} only representing the 3 components with more correlation with the target')
plt.show()

# Step 3 & 4 & 5 & 6 & 7: scikit-learn PCA, keeping 3 components.
pca = decomposition.PCA(n_components=3)
X_transformed = pca.fit_transform(ds)

# Step 8: scatter plot of the projected data, with the first two components
# on the axes and the third as the colour gradient.
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    X_transformed[:, 0],
    X_transformed[:, 1],
    c=X_transformed[:, 2],
    cmap='viridis',  # `viridis` is a colour map, change as needed
    alpha=0.7,
)

# Colour bar so the gradient of the third component can be read off.
plt.colorbar(scatter, label='Third Dimension (Color Gradient)')

plt.xlabel('First Component (X)')
plt.ylabel('Second Component (Y)')
plt.title(f'Dataset {dataset} after applying PCA and leaving 3 principal components')
plt.show()

# Step 9: map the 3-component projection back to the original feature space.
# (The method is `inverse_transform`; the previous spelling raised
# AttributeError.)
X_reconstructed = pca.inverse_transform(X_transformed)

# Same three features as the first figure, now taken from the reconstruction.
plt.figure(figsize=(8, 6))

scatter = plt.scatter(
    X_reconstructed[:, feature_indices[0]],
    X_reconstructed[:, feature_indices[1]],
    c=X_reconstructed[:, feature_indices[2]],
    cmap='viridis',
    alpha=0.7
)
plt.colorbar(scatter, label=f'{top_features[2]}')
plt.xlabel(top_features[0])
plt.ylabel(top_features[1])
# Title distinguishes this figure from the raw-data plot above, which it
# previously duplicated word for word.
plt.title(f'Dataset {dataset} reconstructed from 3 principal components (3 features most correlated with the target)')
plt.show()

In [None]:
# Step 1: load the dataset and append the labels as a "target" column so they
# take part in the correlation matrix computed below.
dataset = "sick"
x, y = get_dataset(dataset)
ds = x.copy()
ds["target"] = y

# Step 2: keep the three features with the largest absolute correlation with
# the target. The target's own entry is removed by label rather than skipped
# by position, so the selection stays correct even if a feature ties with (or
# sorts ahead of) the target's self-correlation.
correlation_with_target = ds.corr()["target"]
top_features = (
    correlation_with_target
    .drop("target")
    .abs()
    .sort_values(ascending=False)
    .head(3)
    .index
    .tolist()
)

# Positional indices of the selected columns; needed later because the
# reconstructed data is indexed by position, not by column label.
ds = ds.drop('target', axis=1)
feature_indices = [ds.columns.get_loc(feature) for feature in top_features]

# Step 3: Incremental PCA, fitted and applied in mini-batches of rows.
batch_size = 100  # Set a batch size suitable for your system's memory
incremental_pca = decomposition.IncrementalPCA(n_components=3)

# Fit one mini-batch at a time.
# NOTE(review): scikit-learn rejects a batch with fewer rows than
# n_components, so a dataset whose final batch has 1 or 2 rows would fail
# here — confirm for datasets other than the one used.
for start in range(0, ds.shape[0], batch_size):
    batch = ds.iloc[start:start + batch_size]
    incremental_pca.partial_fit(batch)

# Project the dataset batch by batch and stack the pieces back together in
# their original row order.
X_transformed = np.vstack([
    incremental_pca.transform(ds.iloc[start:start + batch_size])
    for start in range(0, ds.shape[0], batch_size)
])

# Step 4: scatter plot of the projected data, with the first two components
# on the axes and the third as the colour gradient.
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    X_transformed[:, 0],
    X_transformed[:, 1],
    c=X_transformed[:, 2],
    cmap='viridis',
    alpha=0.7
)
plt.colorbar(scatter, label='Third Principal Component (Color Gradient)')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title(f'Dataset {dataset} after Incremental PCA (3 Principal Components)')
plt.show()

# Step 5: map the projection back to the original feature space, again in
# mini-batches that are stacked in row order.
X_reconstructed = np.vstack([
    incremental_pca.inverse_transform(X_transformed[start:start + batch_size])
    for start in range(0, X_transformed.shape[0], batch_size)
])

# Step 6: the three selected features, taken from the reconstruction.
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    X_reconstructed[:, feature_indices[0]],
    X_reconstructed[:, feature_indices[1]],
    c=X_reconstructed[:, feature_indices[2]],
    cmap='viridis',
    alpha=0.7
)
plt.colorbar(scatter, label=f'Reconstructed {top_features[2]}')
plt.xlabel(top_features[0])
plt.ylabel(top_features[1])
plt.title(f'Reconstructed Dataset {dataset} using Incremental PCA')
plt.show()

In [7]:
from code.preprocessing import get_dataset
from code.main import compare_pca_models

# Load only the feature matrix of the chosen dataset; the second value
# returned by get_dataset is not needed here and is discarded.
dataset = "sick"
x, _ = get_dataset(dataset)
ds = x.copy()  # work on a copy so `x` itself is left untouched

# Bare last expression so the notebook renders the returned comparison of
# customPCA, PCA and IncrementalPCA.
compare_pca_models(ds)

Unnamed: 0,PCA Type,Components,Explained Variance,Projected Data Shape,Reconstruction Error,Time Taken (s)
0,customPCA,"[[-0.04673187939546602, 0.053177242823952114, ...","[0.25485785610097883, 0.18243388209079922, 0.1...","(3770, 3)",0.165977,0.045793
1,PCA,"[[-0.04673187939546549, 0.11412097278747298, 5...","[0.2548578561009787, 0.18243388209079928, 0.11...","(3770, 3)",0.165977,0.011564
2,IncrementalPCA,"[[-0.04735912361362391, 0.11218357646820691, 6...","[0.25474102113739994, 0.1815884883046357, 0.11...","(3770, 3)",0.166257,0.583613
