# Playground to implement W&B as well as start hyperparameter-tuning

# WanDB

In [127]:
#%pip install wandb

In [128]:
import subprocess
import wandb
from wandb.keras import WandbCallback

In [129]:
subprocess.call(['wandb', 'login', '4e8d3dcb1584ad129b3b49ccc34f65b20116ae54'])

0

In [130]:
wandb.init(project='precursor_charge_prediction')

AssertionError: 

## Setup

In [None]:
#%pip install seaborn

In [None]:
import re
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.utils import class_weight
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import wandb
from wandb.keras import WandbCallback

# Dataset batches + split

In [None]:
import re
# the dictionary
aa_syntax_dictionary = dict()
for index, i in enumerate(list('XACDEFGHIKLMNPQRSTVWY')): # added X for 0 value
    aa_syntax_dictionary[i] = index * 100
    if i != 'X': # ignore 0 value for X
        for count in range(0, 100):
            aa_syntax_dictionary[i + "[UNIMOD:" + str(count) + "]"] = index * 100 + count

def seq_translator(sequence, dictionary=aa_syntax_dictionary, print_result=False):
    """
    Translates a sequence into a vector of integers
    :param sequence: string
    :param dictionary: dictionary
    :return: list
    """
    pattern = r'[A-Z]\[[^\]]*\]|.' # regex pattern to match amino acids and modifications

    result = [match for match in re.findall(pattern, sequence)]

    if print_result:
        print(result)
    # Fill the list with "X" characters until it reaches a length of 40
    result += ['X'] * (40 - len(result))

    return [dictionary[aa] for aa in result]


def one_hot_precursor(int_value, max_charge_included=6):
    """
    One-hot encodes the precursor charge
    :param df: dataframe
    :param max_charge_included: int
    :return: dataframe
    """
    one_hot = [1 if x == int_value else 0 for x in range(1, max_charge_included+1)]

    return one_hot

In [None]:
print(seq_translator('AAC[UNIMOD:4]LLVAW', print_result=True))
print(one_hot_precursor(2))

In [None]:
file_list = ["data/"+file for file in os.listdir('data') if file.endswith('.parquet')]
print(file_list[0])

### import parquet files, drop all columns we dont need for training, one-hot encode precursor charge, translate sequence

In [161]:
batches_parquet = dict()
for file in file_list:
    df = pd.read_parquet(file, engine='fastparquet')
    # drop all columns we dont need for training
    for column in df.columns:
        if column not in ["modified_sequence","precursor_intensity", "precursor_charge"]:
            df.drop(column, axis=1, inplace=True)
    df = df[df["precursor_charge"].isin([2, 3, 4])] # remove 7+ charge states // only take charge states 2,3,4
    df["modified_sequence_vector"] = df["modified_sequence"].apply(seq_translator)
    df["precursor_charge_vector"] = df["precursor_charge"].apply(one_hot_precursor)
    batches_parquet[file] = df
    break # TODO REMOVE BREAK FOR FIRST FILE IN IN LIST

In [162]:
batches_parquet[file_list[0]].head()

Unnamed: 0,modified_sequence,precursor_charge,precursor_intensity,modified_sequence_vector,precursor_charge_vector
0,LPGSLETYVEQEQGENANDR,2,29525630.0,"[1000, 1300, 600, 1600, 1000, 400, 1700, 2000,...","[0, 1, 0, 0, 0, 0]"
1,HGSLQEYLQNDTGSK,2,13188580.0,"[700, 600, 1600, 1000, 1400, 400, 2000, 1000, ...","[0, 1, 0, 0, 0, 0]"
2,VEEEEEINSELTAR,2,20663460.0,"[1800, 400, 400, 400, 400, 400, 800, 1200, 160...","[0, 1, 0, 0, 0, 0]"
3,LPGSLETYVEQEQGENANDR,2,19884630.0,"[1000, 1300, 600, 1600, 1000, 400, 1700, 2000,...","[0, 1, 0, 0, 0, 0]"
4,NSSTAEINETTTSSTDFLAR,2,12804420.0,"[1200, 1600, 1600, 1700, 100, 400, 800, 1200, ...","[0, 1, 0, 0, 0, 0]"


In [176]:
batches_parquet[file_list[0]]["precursor_charge"].value_counts()

2    4410530
3    1548959
4     117028
Name: precursor_charge, dtype: int64

# Multi-label-dictionary

In [172]:
def multi_label_one_hot_precursor(charge_list, charges_included=[2,3,4]):

    one_hot_encoded = [1 if label in charge_list else 0 for label in charges_included]

    return one_hot_encoded

In [196]:
# Grouping by "modified_sequence" and aggregating precursor_charge into a list
grouped_df = batches_parquet[file_list[0]].groupby("modified_sequence")["precursor_charge"].agg(list).reset_index()
grouped_df

Unnamed: 0,modified_sequence,precursor_charge
0,AAAASAAEAGIATTGTEGER,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 3, ..."
1,AAAC[UNIMOD:4]FFEQPPR,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
2,AAADFATHGK,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
3,AAADLMAYC[UNIMOD:4]EAHAK,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]"
4,AAADSDPNLDPLMNPHIR,"[3, 3, 3, 3]"
...,...,...
70910,YYVYWYQQLPGTTPK,"[2, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]"
70911,YYYENSDQPIDLTK,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]"
70912,YYYGHYLDDYHTK,"[2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 4, ..."
70913,YYYSDNFFDGQR,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."


In [200]:
multi_label_df = grouped_df.copy()
multi_label_df["precursor_charge"] = grouped_df["precursor_charge"].apply(multi_label_one_hot_precursor)
multi_label_df["modified_sequence_vector"] = grouped_df["modified_sequence"].apply(seq_translator)


In [201]:
multi_label_df

Unnamed: 0,modified_sequence,precursor_charge,modified_sequence_vector
0,AAAASAAEAGIATTGTEGER,"[1, 1, 0]","[100, 100, 100, 100, 1600, 100, 100, 400, 100,..."
1,AAAC[UNIMOD:4]FFEQPPR,"[1, 0, 0]","[100, 100, 100, 204, 500, 500, 400, 1400, 1300..."
2,AAADFATHGK,"[1, 0, 0]","[100, 100, 100, 300, 500, 100, 1700, 700, 600,..."
3,AAADLMAYC[UNIMOD:4]EAHAK,"[1, 0, 0]","[100, 100, 100, 300, 1000, 1100, 100, 2000, 20..."
4,AAADSDPNLDPLMNPHIR,"[0, 1, 0]","[100, 100, 100, 300, 1600, 300, 1300, 1200, 10..."
...,...,...,...
70910,YYVYWYQQLPGTTPK,"[1, 1, 0]","[2000, 2000, 1800, 2000, 1900, 2000, 1400, 140..."
70911,YYYENSDQPIDLTK,"[1, 0, 0]","[2000, 2000, 2000, 400, 1200, 1600, 300, 1400,..."
70912,YYYGHYLDDYHTK,"[1, 1, 1]","[2000, 2000, 2000, 600, 700, 2000, 1000, 300, ..."
70913,YYYSDNFFDGQR,"[1, 0, 0]","[2000, 2000, 2000, 1600, 300, 1200, 500, 500, ..."


### import preprocessed df's

In [144]:
# Make validation data
sample_df = batches_parquet[file_list[0]][["precursor_charge", "modified_sequence_vector", "precursor_intensity"]].copy()

In [145]:
sample_df

Unnamed: 0,precursor_charge,modified_sequence_vector,precursor_intensity
0,2,"[1000, 1300, 600, 1600, 1000, 400, 1700, 2000,...",29525630.0
1,2,"[700, 600, 1600, 1000, 1400, 400, 2000, 1000, ...",13188580.0
2,2,"[1800, 400, 400, 400, 400, 400, 800, 1200, 160...",20663460.0
3,2,"[1000, 1300, 600, 1600, 1000, 400, 1700, 2000,...",19884630.0
4,2,"[1200, 1600, 1600, 1700, 100, 400, 800, 1200, ...",12804420.0
...,...,...,...
6080601,3,"[2000, 1600, 1400, 1000, 500, 204, 600, 1000, ...",573697.7
6080602,3,"[2000, 1600, 1400, 1000, 500, 204, 600, 1000, ...",573697.7
6080603,3,"[400, 1100, 400, 1200, 500, 1800, 1400, 1600, ...",114742.8
6080604,3,"[400, 1100, 400, 1200, 500, 1800, 1400, 1600, ...",114742.8


In [146]:
# Split the data into train, validation, and test sets
X_2 = np.array(sample_df['modified_sequence_vector'].tolist())
y_2 = np.array(sample_df['precursor_charge'])
max_len = max(sample_df.loc[:, 'modified_sequence_vector'].apply(len))  # Find the maximum length


# Create an instance of StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Perform the split
train_val_indices, test_indices = next(sss.split(X_2, y_2))
X_2_train_val, X_2_test = X_2[train_val_indices], X_2[test_indices]
y_2_train_val, y_2_test = y_2[train_val_indices], y_2[test_indices]

# Create another instance of StratifiedShuffleSplit for train-validation split
sss_train_val = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Perform the train-validation split
train_indices, val_indices = next(sss_train_val.split(X_2_train_val, y_2_train_val))
X_2_train, X_2_val = X_2_train_val[train_indices], X_2_train_val[val_indices]
y_2_train, y_2_val = y_2_train_val[train_indices], y_2_train_val[val_indices]

num_classes = 8  # Number of precursor charge classes (1 to 7, plus an extra class for 'None' charge)
y_2_train_encoded = tf.keras.utils.to_categorical(y_2_train, num_classes)
y_2_val_encoded = tf.keras.utils.to_categorical(y_2_val, num_classes)
y_2_test_encoded = tf.keras.utils.to_categorical(y_2_train, num_classes)

  X_2 = np.array(sample_df['modified_sequence_vector'].tolist())


In [151]:
X_2.shape

(6080604,)

### KerasTuner

In [148]:
import datetime
logdir = os.path.join("logs_kerastuner", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback_kerastuner = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

In [150]:
#%pip install keras-tuner
import keras_tuner
from tensorflow import keras

def build_model(hp):
    model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=max_len, output_dim=20, input_length=X_2.shape[1]),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=hp.Int('dense_1_units', min_value=16, max_value=256, step=8),
                          activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
    #hp.Choice('unknown', values=[1, 10])
    ])
    # Compile the model
    learning_rate = hp.Float("lr", min_value=0.00001, max_value=0.001, sampling="log"),
    model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=learning_rate), metrics=['accuracy'])
    return model

tuner = keras_tuner.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,
    overwrite=True,
    executions_per_trial=3,
    directory='output_tuner',)

tuner.search(train_ds, epochs=30, validation_data=val_ds, callbacks=[tensorboard_callback_kerastuner, wandb_callback])

best_model = tuner.get_best_models()[0]

IndexError: tuple index out of range

In [None]:
from tensorboard import program

tracking_address = "logs_kerastuner" # the path of your log file.

if __name__ == "__main__":
    tb = program.TensorBoard()
    tb.configure(argv=[None, '--logdir', tracking_address])
    url = tb.launch()
    print(f"Tensorflow listening on {url}")

### Model

In [15]:
# Define model
model_cce = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=max_len, output_dim=20, input_length=X_2.shape[1]),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
model_cce.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

checkpoint_callback = ModelCheckpoint('precursor_charge_prediction_model_v1/cce_wo7_allSequences.h5', monitor='val_accuracy', save_best_only=True, mode='max')

# Define early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

IndexError: tuple index out of range

In [14]:
# Train the model
history_cce = model_cce.fit(X_2_train, y_2_train_encoded, epochs=10, batch_size=32, validation_data=(X_2_val, y_2_val_encoded), callbacks=[checkpoint_callback, early_stopping, wandb_callback]) #, wandb_callback])

NameError: name 'model_cce' is not defined

In [None]:
# Access the loss, validation loss, and accuracy from the history object
loss = history_cce.history['loss']
val_loss = history_cce.history['val_loss']
accuracy = history_cce.history['accuracy']
val_accuracy = history_cce.history['val_accuracy']

# Plot the loss, validation loss, and accuracy curves
epochs = range(1, len(loss) + 1)

# Create subplots
fig2, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# Plot loss and validation loss
ax1.plot(epochs, loss, 'b', label='Training Loss')
ax1.plot(epochs, val_loss, 'r', label='Validation Loss')
ax1.set_title('Training and Validation Loss')
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Loss')
ax1.legend()

# Plot accuracy and validation accuracy
ax2.plot(epochs, accuracy, 'b', label='Training Accuracy')
ax2.plot(epochs, val_accuracy, 'r', label='Validation Accuracy')
ax2.set_title('Training and Validation Accuracy')
ax2.set_xlabel('Epochs')
ax2.set_ylabel('Accuracy')
ax2.legend()

# Adjust spacing between subplots
plt.tight_layout()

# Show the plots
plt.show()

## Check in with Franzi's group for reporting

### Multilable Model
#### WIP in precursor_charge_predictor

## Model Testing
### check if models only predict charge 2 or also other charges. Due to 'overrepresentation' the best bet for the model could be to only output charge state 2

## Hyperparameter Tuning