# K-Nearest Neighbors (KNN)

In [1]:
import logging

import pandas as pd
import sklearn
import sklearn.impute

import vaep
import vaep.model
import vaep.models as models
import vaep.nb
from vaep import sampling
from vaep.io import datasplits
from vaep.models import ae

# Configure the package logger named 'vaep' via the project helper and keep a handle to it.
logger = vaep.logging.setup_logger(logging.getLogger('vaep'))
# NOTE(review): the message talks about "Experiment 03 - latent spaces", which does not
# describe this KNN notebook; it looks copied from another notebook -- confirm and update.
logger.info("Experiment 03 - Analysis of latent spaces and performance comparisions")

figures = {}  # collection of ax or figures

vaep - INFO     Experiment 03 - Analysis of latent spaces and performance comparisions


In [2]:
# catch passed parameters
# `args` is bound first so that the name 'args' itself is contained in the snapshot below.
# presumably vaep.nb.get_params later treats every global name missing from this snapshot
# as a parameter -- TODO confirm; if so, the seemingly redundant `args = None` must stay.
args = None
# keys of a copy of the global namespace as defined up to this point
args = dict(globals()).keys()

Papermill script parameters:

In [3]:
# files and folders
folder_experiment: str = 'runs/example'  # Datasplit folder with data for experiment
folder_data: str = ''  # specify data directory if needed
file_format: str = 'csv'  # file format of created splits, e.g. csv or pickle (pkl)
# Machine parsed metadata from rawfile workflow
fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'
# training
epochs_max: int = 50  # Maximum number of epochs
# early_stopping:bool = True # Whether to use early stopping or not
batch_size: int = 64  # Batch size for training (and evaluation)
cuda: bool = True  # Whether to use a GPU for training
# model
neighbors: int = 3  # number of nearest neighbors to use
force_train: bool = True  # Force training when saved model could be used. Per default re-train model
sample_idx_position: int = 0  # position of index which is sample ID
model: str = 'KNN'  # model name
model_key: str = 'KNN'  # potentially alternative key for model (grid search)
save_pred_real_na: bool = True  # Save all predictions for missing values
# metadata -> defaults for metadata extracted from machine data
meta_date_col: str = None  # date column in meta data
meta_cat_col: str = None  # category column in meta data

In [4]:
# Parameters
# Re-assigns a subset of the defaults declared in the parameters cell above;
# in this run the values are identical to those defaults.
neighbors = 3
file_format = "csv"
fn_rawfile_metadata = "data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv"
folder_experiment = "runs/example"
model_key = "KNN"


Some argument transformations

In [5]:
# Gather the notebook parameters (presumably every global name that is not part of
# the earlier snapshot held in `args` -- TODO confirm) and wrap them directly into
# the configuration object used by the rest of the notebook.
args = vaep.nb.args_from_dict(
    vaep.nb.get_params(args, globals=globals())
)
args

{'batch_size': 64,
 'cuda': True,
 'data': Path('runs/example/data'),
 'epochs_max': 50,
 'file_format': 'csv',
 'fn_rawfile_metadata': 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv',
 'folder_data': '',
 'folder_experiment': Path('runs/example'),
 'force_train': True,
 'meta_cat_col': None,
 'meta_date_col': None,
 'model': 'KNN',
 'model_key': 'KNN',
 'neighbors': 3,
 'out_figures': Path('runs/example/figures'),
 'out_folder': Path('runs/example'),
 'out_metrics': Path('runs/example'),
 'out_models': Path('runs/example'),
 'out_preds': Path('runs/example/preds'),
 'sample_idx_position': 0,
 'save_pred_real_na': True}

Some naming conventions

In [6]:
# File name pattern containing a `{}` placeholder.
# NOTE(review): not referenced in the visible part of this notebook -- confirm it is still needed.
TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'

## Load data in long format

In [7]:
# Load the data splits stored in the folder `args.data`, using the configured file format.
data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format)

vaep.io.datasplits - INFO     Loaded 'train_X' from file: runs/example/data/train_X.csv


vaep.io.datasplits - INFO     Loaded 'val_y' from file: runs/example/data/val_y.csv


vaep.io.datasplits - INFO     Loaded 'test_y' from file: runs/example/data/test_y.csv


data is loaded in long format

In [8]:
# Peek at five randomly drawn training entries. No random_state is passed,
# so the rows shown differ between runs.
data.train_X.sample(5)

Sample ID                                       Gene Names
2020_03_07_18_15_Q-Exactive-HF-X-Orbitrap_6070  IGF2BP3      30.434
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070  RCN2         28.781
2020_02_05_20_55_Q-Exactive-HF-X-Orbitrap_6070  FLOT2        29.859
2020_05_12_18_10_Q-Exactive-HF-X-Orbitrap_6070  CASK         21.404
2020_01_03_20_10_Q-Exactive-HF-X-Orbitrap_6070  TRIP12       27.696
Name: intensity, dtype: float64

load meta data for splits

In [9]:
# Metadata is optional: without a configured metadata file there is nothing to load.
if not args.fn_rawfile_metadata:
    df_meta = None
else:
    # the first column of the csv file becomes the index of the metadata table
    df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0)
    # show the metadata rows for the first index level of the training data
    display(df_meta.loc[data.train_X.index.levels[0]])

Unnamed: 0_level_0,Version,Content Creation Date,Thermo Scientific instrument model,instrument attribute,instrument serial number,Software Version,firmware version,Number of MS1 spectra,Number of MS2 spectra,MS min charge,...,injection volume setting,Row,dilution factor,electron transfer dissociation,Comment,collision-induced dissociation,sample name,sample volume,Type,Number of MS3 spectra
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070,66,2019-12-18 14:35:26,Q Exactive HF-X Orbitrap,Q Exactive HF-X Orbitrap,Exactive Series slot #6070,2.9-290204ph16/2.9.0.2926,rev. 1,12336.0,114830.0,2,...,2.5,,1.0,,,,,,,
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070,66,2019-12-19 19:48:33,Q Exactive HF-X Orbitrap,Q Exactive HF-X Orbitrap,Exactive Series slot #6070,2.9-290204ph16/2.9.0.2926,rev. 1,12312.0,117156.0,2,...,2.5,2.0,1.0,,,,,,QC,
2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070,66,2019-12-20 14:15:00,Q Exactive HF-X Orbitrap,Q Exactive HF-X Orbitrap,Exactive Series slot #6070,2.9-290204ph16/2.9.0.2926,rev. 1,12263.0,118074.0,2,...,2.5,,1.0,,,,,,QC,
2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070,66,2019-12-27 12:29:44,Q Exactive HF-X Orbitrap,Q Exactive HF-X Orbitrap,Exactive Series slot #6070,2.9-290204ph16/2.9.0.2926,rev. 1,11862.0,115333.0,2,...,2.5,,1.0,,,,,,QC,
2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070,66,2019-12-29 15:06:50,Q Exactive HF-X Orbitrap,Q Exactive HF-X Orbitrap,Exactive Series slot #6070,2.9-290204ph16/2.9.0.2926,rev. 1,10776.0,127143.0,2,...,2.5,,1.0,,,,,,,
2019_12_29_18_18_Q-Exactive-HF-X-Orbitrap_6070,66,2019-12-29 18:18:31,Q Exactive HF-X Orbitrap,Q Exactive HF-X Orbitrap,Exactive Series slot #6070,2.9-290204ph16/2.9.0.2926,rev. 1,10957.0,129268.0,2,...,2.5,,1.0,,,,,,,
2020_01_02_17_38_Q-Exactive-HF-X-Orbitrap_6070,66,2020-01-02 17:38:26,Q Exactive HF-X Orbitrap,Q Exactive HF-X Orbitrap,Exactive Series slot #6070,2.9-290204ph16/2.9.0.2926,rev. 1,12243.0,120125.0,2,...,2.5,,1.0,,,,,,,
2020_01_03_11_17_Q-Exactive-HF-X-Orbitrap_6070,66,2020-01-03 11:17:32,Q Exactive HF-X Orbitrap,Q Exactive HF-X Orbitrap,Exactive Series slot #6070,2.9-290204ph16/2.9.0.2926,rev. 1,13189.0,118093.0,2,...,2.5,,1.0,,,,,,,
2020_01_03_16_58_Q-Exactive-HF-X-Orbitrap_6070,66,2020-01-03 16:58:20,Q Exactive HF-X Orbitrap,Q Exactive HF-X Orbitrap,Exactive Series slot #6070,2.9-290204ph16/2.9.0.2926,rev. 1,11172.0,126830.0,2,...,2.5,,1.0,,,,,,,
2020_01_03_20_10_Q-Exactive-HF-X-Orbitrap_6070,66,2020-01-03 20:10:03,Q Exactive HF-X Orbitrap,Q Exactive HF-X Orbitrap,Exactive Series slot #6070,2.9-290204ph16/2.9.0.2926,rev. 1,11000.0,128023.0,2,...,2.5,,1.0,,,,,,,


## Initialize Comparison

In [10]:
# Frequency table computed on the (long format) training data.
# The index position passed to the helper is taken from the notebook parameter
# `sample_idx_position` ("position of index which is sample ID") instead of a
# hard-coded 0, so the parameter is honoured; with the default value 0 the
# result is unchanged.
# presumably the helper counts observations per remaining index level (features)
# -- TODO confirm against vaep.sampling.frequency_by_index
freq_feat = sampling.frequency_by_index(data.train_X, args.sample_idx_position)
freq_feat.head()  # training data

Gene Names
AAR2     40
ABCB6    32
ABHD11   25
ACAD9    41
ACTL6A   44
Name: intensity, dtype: int64

### Simulated missing values

The simulated missing values (fake NAs) of the validation split are used by all models to evaluate training performance.

In [11]:
# Start the validation results table: the held-out validation values form the
# 'observed' column; a prediction column is added to this frame further below.
val_pred_fake_na = data.val_y.to_frame(name='observed')
val_pred_fake_na

Unnamed: 0_level_0,Unnamed: 1_level_0,observed
Sample ID,Gene Names,Unnamed: 2_level_1
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070,GEMIN5,30.946
2020_02_28_12_27_Q-Exactive-HF-X-Orbitrap_6070,AAR2,27.248
2020_02_13_03_11_Q-Exactive-HF-X-Orbitrap_6070,ADAM10,29.310
2020_01_04_14_59_Q-Exactive-HF-X-Orbitrap_6070,NDUFAF4,24.352
2020_05_27_13_57_Q-Exactive-HF-X-Orbitrap_6070,EPN1,29.250
...,...,...
2020_03_01_23_00_Q-Exactive-HF-X-Orbitrap_6070,PRRC2B,24.294
2020_01_04_04_23_Q-Exactive-HF-X-Orbitrap_6070,ARGLU1,28.471
2020_02_18_18_55_Q-Exactive-HF-X-Orbitrap_6070,GBF1,29.470
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070,DENR,27.902


In [12]:
# Start the test results table: the held-out test values form the 'observed'
# column; a prediction column is added to this frame further below.
test_pred_fake_na = data.test_y.to_frame(name='observed')
# summary statistics only, instead of displaying the whole frame
test_pred_fake_na.describe()

Unnamed: 0,observed
count,859.0
mean,28.27
std,2.543
min,22.254
25%,26.477
50%,28.068
75%,30.064
max,35.919


## Data in wide format

In [13]:
# Switch the splits held by `data` to wide format.
# presumably converts in place (the return value is not used) -- TODO confirm
data.to_wide_format()
# M: size of the last axis of the wide training table, i.e. its number of columns
args.M = data.train_X.shape[-1]
data.train_X.head()

Gene Names,AAR2,ABCB6,ABHD11,ACAD9,ACTL6A,ACTN1,ACTR3,ADAM10,AHSA1,AK3,...,WBSCR16,WDR18,WDR33,WDR36,WDR61,WNK1,YBX1,YTHDF1,YWHAH,ZNF326
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070,27.248,24.472,25.714,28.542,29.335,30.837,29.713,26.702,31.696,26.337,...,26.36,28.258,25.507,30.009,28.345,27.701,32.374,24.925,29.609,28.756
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070,27.211,23.775,,28.437,29.692,30.635,30.34,25.929,31.296,25.415,...,,,23.62,29.348,28.728,,,,29.027,28.393
2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070,27.377,26.852,26.046,28.907,30.369,31.252,30.715,26.403,,25.554,...,27.147,28.335,,30.563,28.859,28.405,32.328,25.498,30.17,29.208
2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070,,24.047,24.788,27.894,29.359,29.996,29.686,,30.882,22.498,...,,28.228,,29.497,27.716,27.349,31.794,24.701,29.232,28.139
2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070,26.52,,,,30.164,33.143,30.985,25.824,31.145,29.21,...,,,26.818,29.374,29.407,28.9,32.443,,30.615,29.189


## Train
model = 'sklearn_knn'

In [14]:
# Create the imputer with the configured number of neighbors, then fit it on the
# wide training data. `fit` returns the estimator itself, so `knn_imputer` ends up
# referring to the same fitted object as with a chained call; the trailing `;`
# keeps the cell free of output.
knn_imputer = sklearn.impute.KNNImputer(n_neighbors=args.neighbors)
knn_imputer.fit(data.train_X);

### Predictions

- the data used to create predictions for the training and validation data sets is the training data itself.
- predictions include missing values (which are not further compared)

create predictions and select for split entries

In [15]:
# Impute the wide training table, re-attach its row and column labels to the
# returned array and stack the result into a long Series.
pred = pd.DataFrame(
    knn_imputer.transform(data.train_X),
    index=data.train_X.index,
    columns=data.train_X.columns,
).stack()
pred

Sample ID                                       Gene Names
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070  AAR2         27.248
                                                ABCB6        24.472
                                                ABHD11       25.714
                                                ACAD9        28.542
                                                ACTL6A       29.335
                                                              ...  
2020_06_02_09_41_Q-Exactive-HF-X-Orbitrap_6070  WNK1         27.444
                                                YBX1         32.605
                                                YTHDF1       25.548
                                                YWHAH        30.419
                                                ZNF326       29.720
Length: 18800, dtype: float64

In [16]:
# Add the predictions as a column named after the model key; pandas aligns the
# Series `pred` on the index of the frame, so only matching entries are taken over.
val_pred_fake_na[args.model_key] = pred
val_pred_fake_na

Unnamed: 0_level_0,Unnamed: 1_level_0,observed,KNN
Sample ID,Gene Names,Unnamed: 2_level_1,Unnamed: 3_level_1
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070,GEMIN5,30.946,31.027
2020_02_28_12_27_Q-Exactive-HF-X-Orbitrap_6070,AAR2,27.248,26.595
2020_02_13_03_11_Q-Exactive-HF-X-Orbitrap_6070,ADAM10,29.310,28.650
2020_01_04_14_59_Q-Exactive-HF-X-Orbitrap_6070,NDUFAF4,24.352,26.630
2020_05_27_13_57_Q-Exactive-HF-X-Orbitrap_6070,EPN1,29.250,29.169
...,...,...,...
2020_03_01_23_00_Q-Exactive-HF-X-Orbitrap_6070,PRRC2B,24.294,24.616
2020_01_04_04_23_Q-Exactive-HF-X-Orbitrap_6070,ARGLU1,28.471,28.777
2020_02_18_18_55_Q-Exactive-HF-X-Orbitrap_6070,GBF1,29.470,29.269
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070,DENR,27.902,27.889


In [17]:
# Add the predictions as a column named after the model key; pandas aligns the
# Series `pred` on the index of the frame, so only matching entries are taken over.
test_pred_fake_na[args.model_key] = pred
test_pred_fake_na

Unnamed: 0_level_0,Unnamed: 1_level_0,observed,KNN
Sample ID,Gene Names,Unnamed: 2_level_1,Unnamed: 3_level_1
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070,ARAF,23.248,25.862
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070,CENPH,25.045,25.215
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070,CLPP,27.745,27.337
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070,GNAI1,24.335,26.067
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070,KIAA1143,26.901,27.182
...,...,...,...
2020_06_02_09_41_Q-Exactive-HF-X-Orbitrap_6070,NUDT9,27.658,27.114
2020_06_02_09_41_Q-Exactive-HF-X-Orbitrap_6070,PTPN1,29.024,28.058
2020_06_02_09_41_Q-Exactive-HF-X-Orbitrap_6070,RDH13,28.720,28.008
2020_06_02_09_41_Q-Exactive-HF-X-Orbitrap_6070,RNASEH2C,26.742,26.708


save missing values predictions

In [18]:
# Optional export, controlled by the `save_pred_real_na` parameter.
if args.save_pred_real_na:
    # presumably selects the predictions for entries that are missing in the training
    # data and belong to neither the validation nor the test split -- TODO confirm
    pred_real_na = ae.get_missing_values(
        df_train_wide=data.train_X,
        val_idx=val_pred_fake_na.index,
        test_idx=test_pred_fake_na.index,
        pred=pred,
    )
    display(pred_real_na)
    pred_real_na.to_csv(args.out_preds / f"pred_real_na_{args.model_key}.csv")

Sample ID                                       Gene Names 
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070  ARMCX3        24.581
                                                BDH2          24.366
                                                CCBL2         26.531
                                                COMMD5        25.023
                                                DNAJC10       26.055
                                                               ...  
2020_06_02_09_41_Q-Exactive-HF-X-Orbitrap_6070  TMEM261       27.061
                                                TRMT2A        26.794
                                                TSNAX;DISC1   26.710
                                                UTRN          24.297
                                                VPS11         26.436
Name: intensity, Length: 1616, dtype: float64

### Plots

- validation data

## Comparisons

> Note: The interpolated values have fewer predictions for comparison than the ones based on models (CF, DAE, VAE).
> The comparison is therefore not 100% fair, as the interpolated samples will have more common ones (especially the sparser the data).
> Could be changed.

### Validation data

- all measured (identified, observed) peptides in validation data

> It does not make too much sense to compare collab and AEs,
> as the setup of training and validation data differs.

In [19]:
# papermill_description=metrics
d_metrics = models.Metrics()

The fake NAs for the validation step are real test data (used neither for training nor for early stopping).

In [20]:
# Evaluate the validation predictions and register the result under the key 'valid_fake_na'.
added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na')
added_metrics

Selected as truth to compare to: observed


{'KNN': {'MSE': 0.6689810919307538,
  'MAE': 0.5435059298987669,
  'N': 859,
  'prop': 1.0}}

### Test Datasplit

Fake NAs: artificially created NAs. Some data was sampled and set
explicitly to missing before it was fed to the model for
reconstruction.

In [21]:
# Evaluate the test predictions and register the result under the key 'test_fake_na'.
# Note: re-uses the name `added_metrics` from the validation cell.
added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na')
added_metrics

Selected as truth to compare to: observed


{'KNN': {'MSE': 0.5338583901486527,
  'MAE': 0.5130756085821084,
  'N': 859,
  'prop': 1.0}}

Save all metrics as json

In [22]:
# Write the collected metrics to a JSON file named after the model key in the metrics folder.
vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json')
d_metrics

{ 'test_fake_na': { 'KNN': { 'MAE': 0.5130756085821084,
                             'MSE': 0.5338583901486527,
                             'N': 859,
                             'prop': 1.0}},
  'valid_fake_na': { 'KNN': { 'MAE': 0.5435059298987669,
                              'MSE': 0.6689810919307538,
                              'N': 859,
                              'prop': 1.0}}}

In [23]:
# Turn the nested metrics dictionary into a table and transpose it, so that the
# ('model', 'metric_name') levels end up on the rows and the subsets on the columns.
metrics_df = models.get_df_from_nested_dict(d_metrics.metrics,
                                            column_levels=['model', 'metric_name']).T
metrics_df

Unnamed: 0_level_0,subset,valid_fake_na,test_fake_na
model,metric_name,Unnamed: 2_level_1,Unnamed: 3_level_1
KNN,MSE,0.669,0.534
KNN,MAE,0.544,0.513
KNN,N,859.0,859.0
KNN,prop,1.0,1.0


## Save predictions

In [24]:
# save simulated missing values for both splits
val_pred_fake_na.to_csv(args.out_preds / f"pred_val_{args.model_key}.csv")
test_pred_fake_na.to_csv(args.out_preds / f"pred_test_{args.model_key}.csv")

## Config

In [25]:
# Display the figure collection; no cell in the visible notebook adds an entry to it.
figures  # switch to fnames?

{}

In [26]:
# Record the parameter count for this model and write the full configuration
# to a YAML file named after the model key in the models folder.
args.n_params = 1  # the number of neighbors to consider
args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml")
args

{'M': 376,
 'batch_size': 64,
 'cuda': True,
 'data': Path('runs/example/data'),
 'epochs_max': 50,
 'file_format': 'csv',
 'fn_rawfile_metadata': 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv',
 'folder_data': '',
 'folder_experiment': Path('runs/example'),
 'force_train': True,
 'meta_cat_col': None,
 'meta_date_col': None,
 'model': 'KNN',
 'model_key': 'KNN',
 'n_params': 1,
 'neighbors': 3,
 'out_figures': Path('runs/example/figures'),
 'out_folder': Path('runs/example'),
 'out_metrics': Path('runs/example'),
 'out_models': Path('runs/example'),
 'out_preds': Path('runs/example/preds'),
 'sample_idx_position': 0,
 'save_pred_real_na': True}