# **Load train data**

Load train data

In [3]:
# NOTE(review): scratch cell; it only binds a throwaway name and echoes it.
a = 2
a

2

In [2]:
import re
import warnings

import numpy as np
import pandas as pd
import keras
from sklearn.preprocessing import OneHotEncoder

2023-10-06 16:54:36.012170: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Constants
# One single-element row per RNA base: the 2-D sample layout the encoder is fitted on.
RNA_BASES = [[base] for base in "AUCG"]
# handle_unknown='ignore': symbols not seen during fitting are encoded as an all-zero row
# instead of raising. sparse_output=False: transform returns a dense array.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder = encoder.fit(RNA_BASES)

In [4]:
# Load the training table; the first line of the CSV is used as the column header (header=0).
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which looks like a
# row index written out when the file was saved - confirm, and consider index_col=0.
train_data = pd.read_csv("./data/SN_filtered_train.csv", header=0)

# **Exploratory Data Analysis (EDA)**

To be completed: add an analysis of the shape of the data, pie charts, boxplots, perhaps PCA, univariate analysis, multivariate analysis...

In [63]:
# Display the first 5 rows of the DataFrame (the bare last expression is rendered as the cell output)
train_data.head()

Unnamed: 0,sequence_id,sequence,experiment_type,dataset_name,reads,signal_to_noise,SN_filter,reactivity_0001,reactivity_0002,reactivity_0003,...,reactivity_error_0197,reactivity_error_0198,reactivity_error_0199,reactivity_error_0200,reactivity_error_0201,reactivity_error_0202,reactivity_error_0203,reactivity_error_0204,reactivity_error_0205,reactivity_error_0206
0,51e61fbde94d,GGGAACGACUCGAGUAGAGUCGAAAAACAUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,5326.0,1.933,1,,,,...,,,,,,,,,,
1,25ce8d5109cd,GGGAACGACUCGAGUAGAGUCGAAAAACCUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,4647.0,2.347,1,,,,...,,,,,,,,,,
2,07dcfb6d1965,GGGAACGACUCGAGUAGAGUCGAAAAACUUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,102843.0,11.824,1,,,,...,,,,,,,,,,
3,e561cc042a4c,GGGAACGACUCGAGUAGAGUCGAAAAACGAUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,7665.0,3.519,1,,,,...,,,,,,,,,,
4,aa948762535f,GGGAACGACUCGAGUAGAGUCGAAAAACGCUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,14018.0,3.219,1,,,,...,,,,,,,,,,


# **Feature engineering**

Remove "sequence_id", "dataset_name", "reads", "SN_filter" and "reactivity_error" columns

In [5]:
def filter_identical_sequences(df):
    """Keep, for every distinct sequence, the row with the highest signal-to-noise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'sequence' and 'signal_to_noise'.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'sequence', ordered by sequence. Every column of
        `df` (including 'sequence') is kept with its original dtype, and each
        selected row keeps its original index label. `df` is not modified.

    Notes
    -----
    When several rows of a group share the maximum, the first one is kept.
    Groups whose 'signal_to_noise' is entirely NaN are not handled specially.
    """
    # Work on positions rather than labels so the selection is correct even when the
    # index of `df` contains repeated labels.
    signal = df['signal_to_noise'].reset_index(drop=True)
    # idxmax gives, per group, the position of the first row holding the maximum. This
    # avoids calling a Python function once per group and avoids rebuilding each row as
    # a mixed-type Series, which would lose the column dtypes.
    best_positions = signal.groupby(df['sequence'].to_numpy()).idxmax()
    filtered_df = df.iloc[best_positions.to_numpy(dtype="int64")]
    return filtered_df

In [6]:
def dict_from_data(data, keys_name=("2A3_MaP", "DMS_MaP"), value_columns=None):
    """Group the reactivity values of a table by sequence and experiment type.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'sequence' and 'experiment_type'.
    keys_name : iterable of str
        Experiment types to extract for every sequence.
    value_columns : list of column labels, optional
        Columns whose values form the reactivity vector. When omitted, the columns
        named 'reactivity_XXXX' (four digits) are used, so identifier, metadata and
        'reactivity_error_XXXX' columns are not mixed into the values. If the table
        has no such column, every column other than 'sequence' and
        'experiment_type' is used.

    Returns
    -------
    dict or None
        ``{sequence: {experiment_type: ndarray of shape (n_values, 1)}}``.
        An experiment type with no row for a sequence yields an array of shape
        (0, 1). Returns None (after emitting a warning) when `data` holds several
        rows for the same (sequence, experiment_type) pair.
    """
    n_duplicate = int(data.duplicated(subset=["sequence", "experiment_type"]).sum())
    if n_duplicate > 0:
        # Returning None is part of the existing contract; the warning makes the reason
        # visible instead of surfacing later as an unrelated error.
        warnings.warn(
            f"dict_from_data: {n_duplicate} duplicated (sequence, experiment_type) "
            "row(s) found; returning None"
        )
        return None

    if value_columns is None:
        value_columns = [
            colname for colname in data.columns
            if re.match("^reactivity_[0-9]{4}$", str(colname))
        ]
        if not value_columns:
            # No recognisable reactivity column: fall back to every remaining column.
            value_columns = [
                colname for colname in data.columns
                if colname not in ("sequence", "experiment_type")
            ]
    else:
        value_columns = list(value_columns)

    seq_reactivity = dict()
    for seq, group in data.groupby("sequence"):
        per_experiment = dict()
        for key in keys_name:
            rows = group.loc[group["experiment_type"] == key, value_columns]
            # Column vector, so the experiments can later be stacked side by side.
            per_experiment[key] = rows.to_numpy().reshape(-1, 1)
        seq_reactivity[seq] = per_experiment

    return seq_reactivity

def XY_from_dict(dict_data, encoder, maxlen=457):
    """Turn the per-sequence reactivity dictionary into stacked input/target arrays.

    Parameters
    ----------
    dict_data : dict
        Maps each sequence string to a dict holding the "2A3_MaP" and "DMS_MaP"
        reactivity column vectors.
    encoder : object
        Forwarded to onehot_from_sequence to encode every sequence.
    maxlen : int
        Length every encoded sequence and every target matrix is padded to.

    Returns
    -------
    tuple of numpy.ndarray
        (inputs, targets), one entry per sequence, in the dictionary's order.
    """
    encoded_sequences = []
    padded_targets = []
    for sequence, reactivities in dict_data.items():
        # Place the two experiments side by side: one column per experiment.
        targets = np.hstack([reactivities["2A3_MaP"], reactivities["DMS_MaP"]])
        encoded_sequences.append(onehot_from_sequence(sequence, encoder, maxlen=maxlen))
        padded_targets.append(padded_matrix(targets, maxlen=maxlen))
    return np.array(encoded_sequences), np.array(padded_targets)

def onehot_from_sequence(sequence, encoder, to_add="0", maxlen=457):
    """Encode a sequence string with `encoder`, padding it up to `maxlen` symbols.

    Parameters
    ----------
    sequence : str
        Sequence to encode; it is upper-cased before encoding.
    encoder : object
        Its ``transform`` method receives a list of single-symbol rows and its
        result is returned unchanged.
    to_add : str
        Filler appended once per missing position.
    maxlen : int or None
        Target length. None disables padding; a sequence longer than `maxlen`
        is encoded at its full length.

    Returns
    -------
    Whatever ``encoder.transform`` returns for the padded list of symbols.
    """
    target_length = 0 if maxlen is None else maxlen
    # A negative count repeats the filler zero times, i.e. no padding is added.
    n_missing = target_length - len(sequence)
    padded_sequence = sequence.upper() + to_add * n_missing
    return encoder.transform([[symbol] for symbol in padded_sequence])

def padded_matrix(matrix_2d, maxlen=457, pad_value=0):
    """Pad a 2-D matrix with extra rows at the bottom until it has `maxlen` rows.

    Parameters
    ----------
    matrix_2d : array-like, 2-D
        Converted to a NumPy array when it is not one already.
    maxlen : int or None
        Target number of rows. None disables padding, and a matrix that already
        has `maxlen` rows or more is returned unpadded; both cases mirror what
        onehot_from_sequence does with sequences.
    pad_value : scalar
        Value written into the added rows (0 by default).

    Returns
    -------
    numpy.ndarray
        A new array; the columns are never padded.
    """
    if not isinstance(matrix_2d, np.ndarray):
        matrix_2d = np.array(matrix_2d)

    # np.pad rejects negative widths, so the number of rows to add is clamped at zero.
    n_toadd = 0 if maxlen is None else max(0, maxlen - matrix_2d.shape[0])
    padding = ((0, n_toadd), (0, 0))  # rows: append at the end; columns: untouched
    matrix_2d_padded = np.pad(
        matrix_2d, pad_width=padding, mode="constant", constant_values=pad_value
    )
    return matrix_2d_padded


In [298]:
# Get columns name for X, and Y
# "sequence_id" is dropped, as announced in the section text: it is an identifier, not a
# feature, and must not end up among the values converted into target arrays.
x_columns = ["sequence"]
# Needed only during preparation: splitting per experiment and ranking identical sequences.
conditional_columns = ["experiment_type", "signal_to_noise"]
# Per-position reactivity columns; the pattern does not match "reactivity_error_XXXX".
y_columns = [colname for colname in train_data.columns if re.match("^reactivity_[0-9]{4}$", colname)]

# Keep the necessary columns from the DataFrame
cleaned_train_data = train_data[x_columns + conditional_columns + y_columns]

For each group of identical sequences, keep only the row with the highest signal-to-noise value (done separately for each experiment type).

In [299]:
# One table per experiment type, taken from the combined frame
df_2A3_MaP, df_DMS_MaP = (
    cleaned_train_data[cleaned_train_data['experiment_type'] == experiment_name]
    for experiment_name in ('2A3_MaP', 'DMS_MaP')
)

# The combined frame is no longer needed: drop the name so its memory can be released
del cleaned_train_data

# Within each experiment, keep a single row per distinct sequence
df_2A3_MaP = filter_identical_sequences(df_2A3_MaP)
df_DMS_MaP = filter_identical_sequences(df_DMS_MaP)

In [300]:
# Keep only the sequences present in both experiment tables, then stack the two tables
mask_2A3 = df_2A3_MaP["sequence"].isin(df_DMS_MaP["sequence"])
mask_DMS = df_DMS_MaP["sequence"].isin(df_2A3_MaP["sequence"])

# signal_to_noise was only needed to rank identical sequences; it is removed from the result
cleared_train_data = pd.concat(
    [df_2A3_MaP.loc[mask_2A3], df_DMS_MaP.loc[mask_DMS]],
    ignore_index=True,
).drop(columns=['signal_to_noise'])

Save the cleared training data to a CSV file

In [59]:
# Column types: store the reactivity columns as 32-bit floats
cleared_train_data[y_columns] = cleared_train_data[y_columns].astype(np.float32)

In [301]:
# Save cleared_train_data as a CSV file
# index=False: the row index is not written, so reloading does not add an extra column.
csv_path = './data/cleared_train_data.csv'
cleared_train_data.to_csv(csv_path, index=False)

In [8]:
# Build the model arrays from the loaded table. Only the sequence, the experiment type and
# the per-position reactivities are passed on, so identifier, metadata and
# "reactivity_error_XXXX" columns cannot end up in the target arrays.
reactivity_columns = [
    colname for colname in train_data.columns if re.match("^reactivity_[0-9]{4}$", colname)
]
cleared_train_data = train_data[["sequence", "experiment_type"] + reactivity_columns]
dict_data = dict_from_data(cleared_train_data)
x, y = XY_from_dict(dict_data, encoder)

In [9]:
# Shape of the stacked one-hot encoded sequences
x.shape

(167979, 457, 4)

In [10]:
# Shape of the stacked, padded reactivity targets (one column per experiment type)
y.shape

(167979, 457, 2)

# **Model**
(To be corrected)

Load cleared train data

In [350]:
# Define the path of the CSV file
csv_path = './data/cleared_train_data.csv'

# Load the CSV file as a DataFrame
cleared_train_data = pd.read_csv(csv_path)
# Rebuild the per-sequence dictionary, then the stacked input / target arrays, from the reloaded table
dict_data = dict_from_data(cleared_train_data)
x, y = XY_from_dict(dict_data, encoder)

Define features and targets

In [390]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking, Reshape, Embedding
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack

In [477]:
# Boolean masks computed over the stacked arrays
# True at the positions whose encoded vector sums to zero along the last axis of x
x_mask = np.sum(x, axis=2) == 0
# True wherever a target value is present, i.e. is not NaN
reactivity_mask = np.logical_not(np.isnan(y))

In [478]:
# Split the data into training and validation sets (20% held out, fixed random_state).
# x_mask is split together with x and y so that its rows stay aligned with theirs.
# NOTE(review): reactivity_mask is not passed to the split, so no train/validation
# version of it exists after this cell.
x_train, x_val, y_train, y_val, mask_train, mask_val = train_test_split(
    x, y, x_mask, test_size=0.2, random_state=42
)

RNN model

In [479]:
y_train = np.nan_to_num(y_train)  # for now, NaN values are replaced by 0

In [480]:
# Recurrent model: one pair of reactivity values is predicted for every sequence position.
model = keras.Sequential([
    keras.layers.Input(shape=(457, 4)),
    # Padded positions are encoded as all-zero vectors. Masking flags every timestep whose
    # features all equal mask_value, so the LSTM skips those positions; the mask is
    # propagated to the layers that follow.
    keras.layers.Masking(mask_value=0.0),
    keras.layers.LSTM(units=8, return_sequences=True),
    keras.layers.Dense(units=2, activation='linear')
])

# Functional wrapper built on the input and output of `model`; both share the same layers.
final_model = keras.Model(inputs=model.input, outputs=model.output)

# Compile and train the model as usual
final_model.compile(loss='mean_squared_error', optimizer='adam')
final_model.fit(x_train, y_train, epochs=3, batch_size=16)


Epoch 1/3


2023-10-05 16:06:44.117141: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-10-05 16:06:44.118478: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-10-05 16:06:44.119406: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f91d1b5ea50>

In [483]:
# Predict on the full encoded array x (the array that was split into train and validation).
# NOTE(review): the bare call echoes the whole prediction array as the cell output;
# consider binding it to a name and displaying only its shape or a small slice.
model.predict(x)



array([[[0.17378888, 0.01762347],
        [0.27257285, 0.01585824],
        [0.3257735 , 0.0079139 ],
        ...,
        [0.00706036, 0.01595702],
        [0.00706036, 0.01595702],
        [0.00706036, 0.01595702]],

       [[0.17378888, 0.01762347],
        [0.27257285, 0.01585824],
        [0.3257735 , 0.0079139 ],
        ...,
        [0.00706036, 0.01595702],
        [0.00706036, 0.01595702],
        [0.00706036, 0.01595702]],

       [[0.17378888, 0.01762347],
        [0.27257285, 0.01585824],
        [0.3257735 , 0.0079139 ],
        ...,
        [0.00706036, 0.01595702],
        [0.00706036, 0.01595702],
        [0.00706036, 0.01595702]],

       ...,

       [[0.17378888, 0.01762347],
        [0.27257285, 0.01585824],
        [0.3257735 , 0.0079139 ],
        ...,
        [0.00706036, 0.01595702],
        [0.00706036, 0.01595702],
        [0.00706036, 0.01595702]],

       [[0.17378888, 0.01762347],
        [0.27257285, 0.01585824],
        [0.3257735 , 0.0079139 ],
        .

In [11]:
# Persist the encoded inputs and the targets as NumPy binary files.
# NOTE: np.save appends ".npy" when the file name does not already end with it, so these
# calls are expected to produce "X.data.npy" and "Y.data.npy".
np.save("./data/X.data", x)
np.save("./data/Y.data", y)

# **Load test**

# **Check efficiency of the model**

# **Save submission**

# **Plot RNA structure**