# Principal component analysis (PCA)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
raw_data = "../data/raw.csv"
dataset = pd.read_csv(raw_data)
dataset.head()

Unnamed: 0,ckpt_name,checkpoint_params.load_checkpoint,training_hyperparams.lr_warmup_epochs,training_hyperparams.lr_warmup_steps,training_hyperparams.lr_cooldown_epochs,training_hyperparams.warmup_initial_lr,training_hyperparams.cosine_final_lr_ratio,training_hyperparams.optimizer,training_hyperparams.optimizer_params.weight_decay,training_hyperparams.ema,...,mAP@0.50:0.95,F1@0.50:0.95,AP@0.50:0.95_Architectural distortion,AP@0.50:0.95_Mass,AP@0.50:0.95_Calcification,Best_score_threshold,Best_score_threshold_Architectural distortion,Best_score_threshold_Mass,Best_score_threshold_Calcification,Target
0,RUN_20240612_100027_359642,False,0,100,0,1e-06,0.1,Adam,1e-05,True,...,7e-06,0.000182,0,1e-05,4e-06,0.11,0,0.11,0.22,0
1,RUN_20240617_163510_293224,False,0,100,0,1e-06,0.1,Adam,1e-05,True,...,7e-06,0.000235,0,6e-06,9e-06,0.11,0,0.11,0.22,0
2,RUN_20240620_104658_182467,False,0,100,0,1e-06,0.1,AdamW,1e-05,True,...,0.005795,0.001406,0,8e-05,0.011509,0.25,0,0.1,0.25,0
3,RUN_20240624_175224_278149,False,0,100,0,1e-06,0.1,AdamW,1e-05,True,...,0.011001,0.002431,0,0.00034,0.021661,0.25,0,0.18,0.25,0
4,RUN_20240625_113055_125920,False,0,100,0,1e-06,0.1,AdamW,1e-05,True,...,0.015902,0.003215,0,0.000385,0.031419,0.25,0,0.18,0.25,0


### Drop label columns

Since PCA is unsupervised, we don't want non-feature columns influencing the results

In [10]:
non_features = ['ckpt_name', 'Precision@0.50:0.95', 'Recall@0.50:0.95', 'mAP@0.50:0.95', 'F1@0.50:0.95', 'AP@0.50:0.95_Architectural distortion', 
                'AP@0.50:0.95_Mass', 'AP@0.50:0.95_Calcification', 'Best_score_threshold', 'Best_score_threshold_Architectural distortion', 
                'Best_score_threshold_Mass', 'Best_score_threshold_Calcification', 'Target']

x = dataset.drop(columns=non_features)
y = dataset['Target'] # this is stored in variable 'y' for later (training a ML model)

x.head()

Unnamed: 0,checkpoint_params.load_checkpoint,training_hyperparams.lr_warmup_epochs,training_hyperparams.lr_warmup_steps,training_hyperparams.lr_cooldown_epochs,training_hyperparams.warmup_initial_lr,training_hyperparams.cosine_final_lr_ratio,training_hyperparams.optimizer,training_hyperparams.optimizer_params.weight_decay,training_hyperparams.ema,training_hyperparams.batch_accumulate,...,additional_log_items.installed_packages.tzlocal,additional_log_items.installed_packages.uritools,additional_log_items.installed_packages.urllib3,additional_log_items.installed_packages.wcwidth,additional_log_items.installed_packages.webencodings,additional_log_items.installed_packages.werkzeug,additional_log_items.installed_packages.wheel,additional_log_items.installed_packages.wrapt,additional_log_items.installed_packages.xhtml2pdf,additional_log_items.installed_packages.zipp
0,False,0,100,0,1e-06,0.1,Adam,1e-05,True,1,...,5.2,4.0.3,2.2.1,0.2.13,0.5.1,3.0.3,0.43.0,1.16.0,0.2.11,3.19.2
1,False,0,100,0,1e-06,0.1,Adam,1e-05,True,1,...,5.2,4.0.3,2.2.1,0.2.13,0.5.1,3.0.3,0.43.0,1.16.0,0.2.11,3.19.2
2,False,0,100,0,1e-06,0.1,AdamW,1e-05,True,1,...,5.2,4.0.3,2.2.1,0.2.13,0.5.1,3.0.3,0.43.0,1.16.0,0.2.11,3.19.2
3,False,0,100,0,1e-06,0.1,AdamW,1e-05,True,1,...,5.2,4.0.3,2.2.1,0.2.13,0.5.1,3.0.3,0.43.0,1.16.0,0.2.11,3.19.2
4,False,0,100,0,1e-06,0.1,AdamW,1e-05,True,1,...,5.2,4.0.3,2.2.1,0.2.13,0.5.1,3.0.3,0.43.0,1.16.0,0.2.11,3.19.2


### Identify column types

In [4]:
numericals = x.select_dtypes(include=['int64', 'float64']).columns
categoricals = x.select_dtypes(include=['bool', 'object']).columns

### Preprocessing pipelines

* StandardScaler for preprocessing numerical features so each will have a mean of 0 and a stardard deviation of 1. This helps avoiding bias. 
* OneHotEncoder converts categorical features into a format that can be provided to ML algorithms to do a better job in prediction. This transformation creates a binary column for each category

In [5]:
numericals_transformer = StandardScaler()
categoricals_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers= [
        ('num', numericals_transformer, numericals),
        ('cat', categoricals_transformer, categoricals)
    ])

### Create a pipeline that includes PCA

In [6]:
pca = PCA(n_components=2) # Define PCA instance

# Chain preprocessing and PCA into a single workflow
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', pca)
])

In [7]:
# Fit and transform the data using the pipeline
x_pca = pipeline.fit_transform(x)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

### Determine the number of principal components

Explained Variance Ratio can help determine how many components to retain, since the purpose of PCA is reducing dimensions while preserving the most importnt variance in the data. 

In [8]:
pca = PCA()
x_pca = pca.fit_transform(x)

# Plot cumulative explained variance
plt.plot(range(1, len(pca.explained_variance_ratio_)+1),
         np.cumsum(pca.explained_variance_ratio_))

plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance Ratio vs. Number of Components')
plt.show()

ValueError: could not convert string to float: 'Adam'