# **Load train data**

Load train data

In [61]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Constantes
RNA_BASES = [["A"], ["U"], ["C"], ["G"]]
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(RNA_BASES)

In [62]:
train_data = pd.read_csv("./data/SN_filtered_train.csv", header=0)

# **Exploratory Data Analysis (EDA)**

To be completed: add analysis of shape of the data, pycharts, boxplots, pehaps PCA, univariate analysis, multivariate analysis...

In [63]:
# Display the first 5 rows of the DataFrame
train_data.head()

Unnamed: 0,sequence_id,sequence,experiment_type,dataset_name,reads,signal_to_noise,SN_filter,reactivity_0001,reactivity_0002,reactivity_0003,...,reactivity_error_0197,reactivity_error_0198,reactivity_error_0199,reactivity_error_0200,reactivity_error_0201,reactivity_error_0202,reactivity_error_0203,reactivity_error_0204,reactivity_error_0205,reactivity_error_0206
0,51e61fbde94d,GGGAACGACUCGAGUAGAGUCGAAAAACAUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,5326.0,1.933,1,,,,...,,,,,,,,,,
1,25ce8d5109cd,GGGAACGACUCGAGUAGAGUCGAAAAACCUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,4647.0,2.347,1,,,,...,,,,,,,,,,
2,07dcfb6d1965,GGGAACGACUCGAGUAGAGUCGAAAAACUUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,102843.0,11.824,1,,,,...,,,,,,,,,,
3,e561cc042a4c,GGGAACGACUCGAGUAGAGUCGAAAAACGAUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,7665.0,3.519,1,,,,...,,,,,,,,,,
4,aa948762535f,GGGAACGACUCGAGUAGAGUCGAAAAACGCUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,14018.0,3.219,1,,,,...,,,,,,,,,,


# **Feature engeneering**

Remove "sequence_id", "dataset_name", "reads", "SN_filter" and "reactivity_error" columns

In [72]:
# Function to keep rows with maximum signal_to_noise within identical sequences
def filter_identical_sequences(df):
    # Group by 'sequence' and keep the row with max 'signal_to_noise'
    filtered_df = df.groupby('sequence').apply(lambda x: x.loc[x['signal_to_noise'].idxmax()])
    return filtered_df

In [298]:
# Get columns name for X, and Y
x_columns = ["sequence_id", "sequence"]
conditional_columns = ["experiment_type", "signal_to_noise"]
y_columns = [colname for colname in train_data.columns if re.match("^reactivity_[0-9]{4}$", colname)]

# Keep the necessary columns from the DataFrame
cleaned_train_data = train_data[x_columns + conditional_columns + y_columns]

For each group of identical sequences, keep only the sequence with the highest signal to noise value

In [299]:
# Create two separate DataFrames based on "experiment_type"
df_2A3_MaP = cleaned_train_data[cleaned_train_data['experiment_type'] == '2A3_MaP']
df_DMS_MaP = cleaned_train_data[cleaned_train_data['experiment_type'] == 'DMS_MaP']

# Delete cleaned_train_data to free space memory
del cleaned_train_data

df_2A3_MaP = filter_identical_sequences(df_2A3_MaP)  # Filter df_2A3_MaP
df_DMS_MaP = filter_identical_sequences(df_DMS_MaP)  # Filter df_DMS_MaP

In [300]:
# Concatenate the two data frames
mask_2A3 = df_2A3_MaP["sequence"].isin(df_DMS_MaP["sequence"])
mask_DMS = df_DMS_MaP["sequence"].isin(df_2A3_MaP["sequence"])

cleared_train_data = pd.concat([df_2A3_MaP[mask_2A3], df_DMS_MaP[mask_DMS]], ignore_index=True)
cleared_train_data.drop(columns=['signal_to_noise'], inplace=True)

Save the cleared train data into a csv file

In [59]:
# columns type
cleared_train_data[y_columns] = cleared_train_data[y_columns].astype(np.float32)

In [301]:
# Save cleared_train_data as a CSV file
csv_path = './data/cleared_train_data.csv'
cleared_train_data.to_csv(csv_path, index=False)

In [268]:
def dict_from_data(data, keys_name=["2A3_MaP", "DMS_MaP"]):
    n_duplicate = sum(data.duplicated(subset=["sequence", "experiment_type"]))
    if n_duplicate > 0:
        return None

    seq_reactivity = dict()
    for seq, group in data.groupby("sequence"):
        seq_reactivity[seq] = dict()
        for key in keys_name:
            mask = group["experiment_type"] == key
            seq_reactivity[seq][key] = group[mask].drop(
                labels=["sequence", "experiment_type"], axis=1
            ).values.reshape(-1)
            seq_reactivity[seq][key] = np.expand_dims(seq_reactivity[seq][key], axis=1)

    return seq_reactivity

In [270]:
dict_data = dict_from_data(cleared_train_data)

In [334]:
def XY_from_dict(dict_data, encoder):
    x_list = []
    y_list = []
    for i, (sequence, reactivities) in enumerate(dict_data.items()):
        y = np.hstack([reactivities["2A3_MaP"], reactivities["DMS_MaP"]])
        x_list.append(onehot_from_sequence(sequence, encoder))
        y_list.append(y)
        if i == 2:
            break
    return np.array(x_list), np.array(y_list)


In [330]:
def onehot_from_sequence(sequence, encoder, to_add="0", maxlen=457):
    """sequence: str"""
    proccessed_sequence = sequence.upper()
    proccessed_sequence += to_add * (maxlen - len(sequence))
    proccessed_sequence = [[nbase] for nbase in proccessed_sequence]
    onehot_sequence = encoder.transform(proccessed_sequence)
    return onehot_sequence

In [338]:
x, y = XY_from_dict(d, encoder)


(3, 206, 2)

In [289]:
np.array(y_list)

array([[[nan, nan],
        [nan, nan],
        [nan, nan],
        ...,
        [nan, nan],
        [nan, nan],
        [nan, nan]],

       [[nan, nan],
        [nan, nan],
        [nan, nan],
        ...,
        [nan, nan],
        [nan, nan],
        [nan, nan]],

       [[nan, nan],
        [nan, nan],
        [nan, nan],
        ...,
        [nan, nan],
        [nan, nan],
        [nan, nan]]])

# **Model**
(To be corrected)

Load cleared train data

In [None]:
import pandas as pd

# Define the path of the CSV file
csv_path = '/content/drive/My Drive/M2BI_DRIVE/cleared_train_data.csv'

# Load the CSV file as a Dask DataFrame
cleared_train_data = pd.read_csv(csv_path)

Define features and targets

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking, Reshape
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack

# Extract RNA sequences and experiment_type
rna_sequences = cleared_train_data['sequence']
experiment_type = cleared_train_data['experiment_type']

# Initialize the OneHotEncoder with sparse_output=True and dtype=np.int64
encoder = OneHotEncoder(sparse_output=True, dtype=np.int64)  # Fix the warning here
# Fit and transform RNA sequences
rna_sequences_encoded = encoder.fit_transform(rna_sequences.values.reshape(-1, 1))

# One-hot encode experiment_type using pandas
experiment_type_encoded = pd.get_dummies(experiment_type)

# Combine one-hot encoded features (RNA sequences and experiment_type)
# Use hstack to concatenate sparse matrices
features = hstack((rna_sequences_encoded, experiment_type_encoded))

# Extract reactivity columns as targets (excluding 'sequence' and 'experiment_type')
reactivity_columns = cleared_train_data.columns[~cleared_train_data.columns.isin(['sequence', 'experiment_type'])]
targets = cleared_train_data[reactivity_columns]

# Handle NaN values by creating a mask
reactivity_mask = ~np.isnan(targets.values)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val, mask_train, mask_val = train_test_split(
    features, targets, reactivity_mask, test_size=0.2, random_state=42)

# Convert sparse matrices to dense arrays
X_train_dense = X_train.toarray()
X_val_dense = X_val.toarray()

# Reshape input data to include the time step dimension
timesteps = 1  # Number of time steps (since since we have masked sequences)
input_dim = X_train_dense.shape[1]
X_train_reshaped = X_train_dense.reshape(X_train_dense.shape[0], timesteps, input_dim)
X_val_reshaped = X_val_dense.reshape(X_val_dense.shape[0], timesteps, input_dim)

RNN model

In [None]:
# Define RNN model with Masking layer
model = Sequential()
model.add(Masking(mask_value=0.0, input_shape=(timesteps, input_dim)))  # Masking layer to handle NaN values
model.add(LSTM(64))
model.add(Dense(y_train.shape[1], activation='linear'))  # Linear activation for regression

# Compile the model (adjust the loss function and optimizer as needed)
model.compile(loss='mean_squared_error', optimizer='adam')

# Print model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking (Masking)           (None, 1, 242731)         0         
                                                                 
 lstm (LSTM)                 (None, 64)                62155776  
                                                                 
 dense (Dense)               (None, 206)               13390     
                                                                 
Total params: 62169166 (237.16 MB)
Trainable params: 62169166 (237.16 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


Train the model using "mini-batch" training loop to avoid RAM issu



In [None]:
# Define mini-batch size and number of epochs
mini_batch_size = 32
num_epochs = 10

# Get the number of mini-batches
num_mini_batches = X_train.shape[0] // mini_batch_size

# Define optimizer and learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Training loop with mini-batch processing
@tf.function  # Decorate the entire training loop
def train_step(mini_batch_X, mini_batch_y, mini_batch_mask):
    with tf.GradientTape() as tape:
        predictions = model(mini_batch_X, training=True)
        loss = tf.keras.losses.mean_squared_error(mini_batch_y, predictions)
        mini_batch_mask = tf.cast(mini_batch_mask, dtype=tf.float32)
        weighted_loss = loss * tf.reduce_mean(mini_batch_mask)

    gradients = tape.gradient(weighted_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    for batch_idx in range(num_mini_batches):
        start_idx = batch_idx * mini_batch_size
        end_idx = (batch_idx + 1) * mini_batch_size

        # Extract mini-batch data
        mini_batch_X = X_train_reshaped[start_idx:end_idx]
        mini_batch_y = y_train[start_idx:end_idx]
        mini_batch_mask = mask_train[start_idx:end_idx]

        # Train on the mini-batch
        loss = train_step(mini_batch_X, mini_batch_y, mini_batch_mask)

    # Evaluate on validation data after each epoch
    val_loss = model.evaluate(X_val_reshaped, y_val, verbose=0)
    print(f'Validation Loss: {val_loss:.4f}')

# **Load test**

# **Check efficiency of the model**

# **Save submission**

# **Plot RNA structure**