### <font color='blue'>**Data Exploration**</font>

Pulled from Eric's notebook.


In [1]:
import random
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [2]:
class Dataset:
    """
    Per-patient collection of peptide intensity vectors, plus a sampler of vector pairs.

    `self.pep_df` is the prepared data frame and `self.data` maps a patient index to the
    list of that patient's timepoint vectors (one 1-D array per `Patient_XX.Timepoint_Y`
    column found in the data frame).
    """

    def __init__(self, pep_df, min_timepoints_per_patient=2, max_patient_idx=58, max_timepoint_idx=7):
        '''
        pep_df should be a data frame with `Patient_XX.Timepoint_Y` columns, such as the one
        returned by `prepare_df`.
        This dataset will only include patients with at least `min_timepoints_per_patient` timepoints.

        `max_patient_idx` and `max_timepoint_idx` are the (inclusive) upper bounds of the
        1-based patient / timepoint indices that are looked up in the column names.
        '''
        self.pep_df = pep_df

        self.data = {}  # patient id -> [array of timepoint vectors]
        for patient_idx in range(1, max_patient_idx + 1):
            patient_timepoint_vectors = []
            for timepoint in range(1, max_timepoint_idx + 1):
                column_name = f"Patient_{patient_idx:02d}.Timepoint_{timepoint:01d}"
                try:
                    column = self.pep_df[column_name]
                except KeyError:
                    # This patient timepoint doesn't exist in the data.
                    # Only the missing-column case is skipped; any other error is a real problem.
                    continue

                # We replace all nans with zeros using np.nan_to_num
                patient_timepoint_vectors.append(np.nan_to_num(column.values))

            # If this patient had at least `min_timepoints_per_patient` timepoints, then include this patient id
            if len(patient_timepoint_vectors) >= min_timepoints_per_patient:
                self.data[patient_idx] = patient_timepoint_vectors

    @staticmethod
    def prepare_df(filepath, num_columns=667, num_metadata_columns=32):
        """
        Read the tab-separated intensity file and return a cleaned data frame.

        Only the first `num_columns` columns are kept, every column whose name matches
        `._unmod` is dropped, and every column from position `num_metadata_columns` onwards
        is converted to float (string values may use "," as a thousands separator).
        """
        df = pd.read_csv(filepath, sep="\t")

        # only keep the Patient_x.Timepoint_y formatted columns (and the leading metadata columns)
        df = df.iloc[:, :num_columns]

        # Remove all _unmod columns (drop returns a new frame, so the assignments below are safe)
        unmod_columns = list(df.filter(regex=r'._unmod'))
        df = df.drop(columns=unmod_columns)

        # Convert the string abundance numbers into floats for all patient timepoint cols
        def convert_to_number(val):
            if isinstance(val, str):
                # float() accepts everything int() accepted, plus decimal strings
                return float(val.replace(",", "").strip())
            return val

        for patient_timepoint_col in df.columns[num_metadata_columns:]:
            df[patient_timepoint_col] = df[patient_timepoint_col].apply(convert_to_number).astype(float)

        return df

    @classmethod
    def from_file(cls, filepath, *args, **kwargs):
        """Build a dataset from `filepath`; extra arguments are forwarded to the constructor."""
        pep_df = cls.prepare_df(filepath)
        return cls(pep_df, *args, **kwargs)

    def data_generator(self, patient_ids, num_samples, peptide_indices=None, same_patient_pair_probability=0.5):
        """
        This will return a generator that will yield `num_samples` sample pairs of peptide vectors
        (limited to only the `patient_ids` given, and the `peptide_indices` given (if None, then all peptides will be included))
        such that, `same_patient_pair_probability` of the time, the pair of vectors will be from the same patient
        (the two vectors are drawn independently, so they may also be the same timepoint)

        Note: when same_patient_pair_probability is None, it will model the true data distribution by putting all timepoint vectors into 1 bucket
        and randomly sampling from this (this will make it such that it is always way more likely for the timepoint vectors to be from different patients than the same one)

        Data samples yielded by the returned generator will be of the form:
        {
            "first_patient_id" : ...,
            "first_patient_timepoint_vector" : ...,
            "second_patient_id" : ...,
            "second_patient_timepoint_vector" : ...,
            "is_same_patient" : True/False
        }

        Raises AssertionError immediately if any of `patient_ids` is not in the dataset.
        While iterating, an IndexError is raised by `random.choice` if there is nothing to choose from
        (no patient ids at all, or a different patient is required but only one patient id was given).
        """
        # Materialise once so that iterators / arrays can be checked and sampled from repeatedly
        patient_ids = list(patient_ids)

        # First ensure all of the given patient ids are in our prepared dataset
        assert all(pid in self.data for pid in patient_ids), "Not all of the given patient ids are in our dataset"

        def _build_sample(first_patient_id, first_vector, second_patient_id, second_vector):
            # if peptide_indices is given, filter the vectors to only include the `peptide_indices` peptides.
            # `is not None` (rather than truthiness) so that numpy index arrays are accepted.
            if peptide_indices is not None:
                first_vector = first_vector[peptide_indices]
                second_vector = second_vector[peptide_indices]

            return {
                "first_patient_id" : first_patient_id,
                "first_patient_timepoint_vector" : first_vector,
                "second_patient_id" : second_patient_id,
                "second_patient_timepoint_vector" : second_vector,
                "is_same_patient" : first_patient_id == second_patient_id
            }

        def true_data_dist_gen():
            # This generator is used when same_patient_pair_probability is None
            all_patient_timepoint_vectors = [
                (patient_idx, patient_timepoint_vector)
                for patient_idx in patient_ids
                for patient_timepoint_vector in self.data[patient_idx]
            ]

            for _ in range(num_samples):
                first_patient_id, first_vector = random.choice(all_patient_timepoint_vectors)
                second_patient_id, second_vector = random.choice(all_patient_timepoint_vectors)
                yield _build_sample(first_patient_id, first_vector, second_patient_id, second_vector)

        def data_gen():
            # This generator is used when same_patient_pair_probability is NOT None
            for _ in range(num_samples):

                # pick a random first patient id
                first_patient_id = random.choice(patient_ids)

                # pick a random timepoint vector for this first patient id
                first_vector = random.choice(self.data[first_patient_id])

                # decide if the second patient in our pair should be the same as the first.
                # random.random() is in [0, 1), so a strict `<` makes 0 mean "never" and 1 mean "always".
                is_second_patient_same = random.random() < same_patient_pair_probability

                if is_second_patient_same:
                    second_patient_id = first_patient_id
                else:
                    second_patient_id = random.choice([pid for pid in patient_ids if pid != first_patient_id])

                # Note: In this approach, it is possible that, when the second patient = first patient,
                # They both return the same timepoint vector. but that should be okay, since this will help the model learn
                # to detect identical inputs, especially when the input to the model is the concatenated vector
                second_vector = random.choice(self.data[second_patient_id])

                yield _build_sample(first_patient_id, first_vector, second_patient_id, second_vector)

        # Based on same_patient_pair_probability, return the correct data generator
        if same_patient_pair_probability is None:
            return true_data_dist_gen()
        else:
            return data_gen()

In [3]:
# Root folder for input data, relative to the notebook's working directory.
DATA_DIR = "./Data"
# Tab-separated intensity table that Dataset.from_file reads.
DATA_TSV_FILEPATH = os.path.join(DATA_DIR, "peptidoforms_intensity", "data.tsv")

In [4]:
# Establish all random seeds from a single constant so every library agrees.
SEED = 2021
random.seed(SEED)     # drives the patient shuffle and the pair sampling in Dataset.data_generator
np.random.seed(SEED)  # legacy global NumPy RNG, in case any numpy-based randomness is added

# TODO: seed any additional libraries used later (e.g. torch), and pass `random_state=SEED`
# to sklearn / xgboost estimators.

### <font color='blue'>**Creating Main Dataset**</font>

In [5]:
# Read and clean the TSV, then group the Patient_XX.Timepoint_Y columns per patient.
peptidoforms_intensity_dataset = Dataset.from_file(DATA_TSV_FILEPATH)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
# The cleaned data frame held by the dataset; the shape is displayed as the cell output.
peptidoforms_intensity_df = peptidoforms_intensity_dataset.pep_df
peptidoforms_intensity_df.shape

(40921, 350)

In [7]:
# Per peptide (row): how many patient-timepoint columns hold a real value (i.e. are not NaN).
# Columns before position 32 are skipped; Dataset.prepare_df converts the ones from 32 onwards to float.
_intensity_matrix = peptidoforms_intensity_df.values[:, 32:].astype(float)
_has_value = ~np.isnan(_intensity_matrix)
peptide_nonempty_count_across_patient_timepoints = np.count_nonzero(_has_value, axis=-1)
peptide_nonempty_count_across_patient_timepoints

array([318, 318, 318, ...,   1,   7,   1])

In [8]:
# Summary statistics of the per-peptide non-NaN counts computed above.
pd.Series(peptide_nonempty_count_across_patient_timepoints).describe()

count    40921.000000
mean        25.752572
std         68.130665
min          0.000000
25%          0.000000
50%          0.000000
75%          8.000000
max        318.000000
dtype: float64

In [9]:
# Number of peptides that have a value in at most 100 patient timepoints.
int(np.count_nonzero(peptide_nonempty_count_across_patient_timepoints <= 100))

37461

---

### <font color='blue'>**Creating Train/Val/Test Datasets**</font>
    
Train and val datasets will enforce the `same_patient_pair_probability = 0.5`, while the test set will use the true data distribution

In [10]:
# Fractions of the patient ids (not of the sample pairs) assigned to each split.
# Only train_ratio and val_ratio are used to compute the cut points below;
# the test split receives whatever patients remain.
train_ratio = 0.6
val_ratio = 0.2
test_ratio = 0.2

In [11]:
# For now, we don't select a particular subset of peptides, and just use all of them.
# This is passed as `peptide_indices` to Dataset.data_generator, where None means "no filtering".
selected_peptide_indices = None

In [12]:
# Probability that a sampled pair comes from the same patient.
# None selects Dataset.data_generator's "true data distribution" mode, where both vectors
# are drawn from one pooled bucket of all timepoint vectors.
train_same_patient_pair_probability = 0.5
val_same_patient_pair_probability = 0.5
test_same_patient_pair_probability = None

In [13]:
# Number of vector pairs to draw for each split.
num_train_samples = 10000
num_val_samples = 10000
num_test_samples = 10000

In [14]:
# Patient ids that made it into the dataset (those with enough timepoints).
all_patient_ids = list(peptidoforms_intensity_dataset.data)
num_patient_ids = len(all_patient_ids)
num_patient_ids

52

In [15]:
# Shuffle the patient ids in place, then cut the shuffled list into three consecutive
# pieces: [0, train_end) -> train, [train_end, val_end) -> val, [val_end, end) -> test.
random.shuffle(all_patient_ids)

train_end = int(train_ratio * num_patient_ids)
val_end = int((train_ratio + val_ratio) * num_patient_ids)

train_patient_ids = list(all_patient_ids[:train_end])
val_patient_ids = list(all_patient_ids[train_end:val_end])
test_patient_ids = list(all_patient_ids[val_end:])

len(train_patient_ids), len(val_patient_ids), len(test_patient_ids)

(31, 10, 11)

In [16]:
def create_data_arrays_from_generator(gen):
    """
    Turn the samples yielded by `gen` into model inputs and labels.

    Each input is the concatenation of the sample's two patient vectors, cast to 64-bit
    integers (fractional parts are truncated); each label is 1 if the vectors are from the
    same patient and 0 otherwise.

    Returns a tuple `(X, Y)` of two lists with one entry per sample.
    """
    X, Y = [], []

    for data_input in tqdm(gen):
        pair_vector = np.hstack((
            data_input['first_patient_timepoint_vector'],
            data_input['second_patient_timepoint_vector'],
        ))
        # Explicit int64: plain `int` maps to a 32-bit integer on some platforms / NumPy
        # versions, which would overflow for intensities above ~2.1e9.
        X.append(pair_vector.astype(np.int64))
        Y.append(1 if data_input['is_same_patient'] else 0)

    return X, Y

In [17]:
def _make_split_generator(patient_ids, num_samples, same_patient_pair_probability):
    """Pair generator for one split; every split shares the same peptide selection."""
    return peptidoforms_intensity_dataset.data_generator(
        patient_ids=patient_ids,
        num_samples=num_samples,
        peptide_indices=selected_peptide_indices,
        same_patient_pair_probability=same_patient_pair_probability,
    )


# Train pairs
train_data_gen = _make_split_generator(
    train_patient_ids, num_train_samples, train_same_patient_pair_probability)
trainX, trainY = create_data_arrays_from_generator(train_data_gen)

# Validation pairs
val_data_gen = _make_split_generator(
    val_patient_ids, num_val_samples, val_same_patient_pair_probability)
valX, valY = create_data_arrays_from_generator(val_data_gen)

# Test pairs
test_data_gen = _make_split_generator(
    test_patient_ids, num_test_samples, test_same_patient_pair_probability)
testX, testY = create_data_arrays_from_generator(test_data_gen)

10000it [00:04, 2322.45it/s]
10000it [00:06, 1615.45it/s]
10000it [00:06, 1658.54it/s]


---

## NOTE on using scalers: 
Current approach of just using standard scaling on concatenated input feature vectors is likely not optimal/correct.

This is because the first half and the second half of each concatenated feature vector are semantically the same (both hold the same peptides), yet the scaler will scale them differently.

A better approach may be to create a data matrix with one row per patient timepoint and one column per peptide, i.e. of shape (num patient timepoints, num peptides), replacing all NaNs with 0s, and fit the standard scaler on this (using only the training patient ids). Then, when transforming the data, we transform the first half and the second half of each vector separately (using the same scaler), and concatenate these 2 scaled halves to create a new scaled & concatenated feature vector.

However, tree-based methods (including XGBoost) are invariant to feature scaling, so they don't need any scaling: https://datascience.stackexchange.com/questions/60950/is-it-necessary-to-normalise-data-for-xgboost

In [19]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def metrics(y_true, y_pred):
    """Print the confusion matrix, then the classification report, for the given labels."""
    matrix = confusion_matrix(y_true, y_pred)
    print('Confusion matrix:\n', matrix)

    report = classification_report(y_true, y_pred)
    print('\nReport:\n', report)

### <font color='blue'>**XGBoost Classifier**</font>

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Stack the per-sample lists into 2-D feature matrices and 1-D label arrays.
# No scaler is applied here (see the note on scalers above).
# NOTE(review): with all peptides selected, each row holds 2 * 40921 values, so each
# 10000-row matrix is large (several GB at 8 bytes per value) — confirm memory headroom.
train_dataX = np.array(trainX) # unscaled features
val_dataX = np.array(valX) # unscaled features
test_dataX = np.array(testX) # unscaled features

train_dataY = np.array(trainY)
val_dataY = np.array(valY)
test_dataY = np.array(testY)

In [None]:
# Fit an XGBoost classifier on the training pairs; apart from use_label_encoder, every
# hyperparameter is left at its default, and the validation split is not passed to fit().
# NOTE(review): `use_label_encoder` is reportedly deprecated in newer xgboost releases —
# confirm against the installed version.
model = XGBClassifier(use_label_encoder=False)
model.fit(train_dataX, train_dataY)



In [None]:
# Train metrics: predict on the training pairs, round each prediction,
# then print the confusion matrix / report and the accuracy.
train_ypred = model.predict(train_dataX)
train_predictions = list(map(round, train_ypred))

metrics(train_dataY, train_predictions)
train_accuracy = accuracy_score(train_dataY, train_predictions)

print("Train accuracy - ", train_accuracy)

In [None]:
# Validation metrics: predict on the validation pairs, round each prediction,
# then print the confusion matrix / report and the accuracy.
val_ypred = model.predict(val_dataX)
val_predictions = list(map(round, val_ypred))

metrics(val_dataY, val_predictions)
val_accuracy = accuracy_score(val_dataY, val_predictions)

print("Val accuracy - ", val_accuracy)

In [None]:
# Test metrics: predict on the test pairs, round each prediction,
# then print the confusion matrix / report and the accuracy.
test_ypred = model.predict(test_dataX)
test_predictions = list(map(round, test_ypred))

metrics(test_dataY, test_predictions)
test_accuracy = accuracy_score(test_dataY, test_predictions)

print("Test accuracy - ", test_accuracy)