

# **Load train data**

Connect to google drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the files stored there can be read and written
drive.mount('/content/drive')

Mounted at /content/drive


*Load* train data

In [None]:
import dask.dataframe as dd

# Path of the raw training CSV on the mounted Google Drive
csv_path = '/content/drive/My Drive/M2BI_DRIVE/train_data.csv'

# Load the CSV file as a Dask DataFrame (it is turned into a pandas DataFrame later with .compute())
train_data = dd.read_csv(csv_path)

# **Exploratory Data Analysis (EDA)**

To be completed: add an analysis of the shape of the data, pie charts, boxplots, perhaps PCA, univariate analysis, multivariate analysis...

In [None]:
# Display the first 5 rows of the Dask DataFrame as a quick sanity check of the columns
train_data.head()

Unnamed: 0,sequence_id,sequence,experiment_type,dataset_name,reads,signal_to_noise,SN_filter,reactivity_0001,reactivity_0002,reactivity_0003,...,reactivity_error_0197,reactivity_error_0198,reactivity_error_0199,reactivity_error_0200,reactivity_error_0201,reactivity_error_0202,reactivity_error_0203,reactivity_error_0204,reactivity_error_0205,reactivity_error_0206
0,8cdfeef009ea,GGGAACGACUCGAGUAGAGUCGAAAAACGUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,2343,0.944,0,,,,...,,,,,,,,,,
1,51e61fbde94d,GGGAACGACUCGAGUAGAGUCGAAAAACAUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,5326,1.933,1,,,,...,,,,,,,,,,
2,25ce8d5109cd,GGGAACGACUCGAGUAGAGUCGAAAAACCUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,4647,2.347,1,,,,...,,,,,,,,,,
3,07dcfb6d1965,GGGAACGACUCGAGUAGAGUCGAAAAACUUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,102843,11.824,1,,,,...,,,,,,,,,,
4,e561cc042a4c,GGGAACGACUCGAGUAGAGUCGAAAAACGAUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,7665,3.519,1,,,,...,,,,,,,,,,


# **Feature engineering**

Filter the rows where "SN_filter" is equal to 1



In [None]:
# Keep only the rows that pass the signal-to-noise quality filter (SN_filter == 1),
# then materialise the lazy Dask result as an in-memory pandas DataFrame.
# Dask restarts the row index at 0 in every partition, so the computed frame would
# carry repeated index labels; reset it so that every row has a unique label.
cleaned_train_data = (
    train_data[train_data['SN_filter'] == 1]
    .compute()
    .reset_index(drop=True)
)

Remove "sequence_id", "dataset_name", "reads", "SN_filter" and "reactivity_error" columns

In [None]:
# Per-position error columns: every column whose name contains "reactivity_error"
reactivity_error_columns = [
    name for name in cleaned_train_data.columns if "reactivity_error" in name
]

# Metadata columns that are no longer needed, followed by the error columns
columns_to_remove = [
    "sequence_id", "dataset_name", "reads", "SN_filter",
    *reactivity_error_columns,
]

# Drop all of them from the DataFrame in a single call
cleaned_train_data = cleaned_train_data.drop(columns=columns_to_remove)

For each group of identical sequences, keep only the sequence with the highest signal to noise value

In [None]:
import pandas as pd

# Build one DataFrame per experiment type by masking on the "experiment_type" column
experiment_types = cleaned_train_data['experiment_type']
df_2A3_MaP = cleaned_train_data[experiment_types == '2A3_MaP']
df_DMS_MaP = cleaned_train_data[experiment_types == 'DMS_MaP']

# The combined frame (and the helper column reference) are no longer needed;
# drop the names so the memory can be released
del experiment_types, cleaned_train_data

# Function to keep rows with maximum signal_to_noise within identical sequences
def filter_identical_sequences(df):
    """Keep, for every distinct sequence, the row with the highest signal_to_noise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'sequence' and 'signal_to_noise'. The index
        may contain repeated labels; it is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per distinct sequence, ordered by sequence. The result is
        indexed by 'sequence' and also keeps 'sequence' as a regular column.
        When several rows share the maximum, the first one is kept.
    """
    if df.empty:
        return df.copy()

    # Work on a fresh 0..n-1 index: a label-based lookup on an index with
    # repeated labels would return several rows instead of the single best one.
    rows = df.reset_index(drop=True)

    # Label of the best row of every group, computed without a Python-level
    # function call per group.
    best_labels = rows.groupby('sequence')['signal_to_noise'].idxmax()

    filtered_df = rows.loc[best_labels.to_numpy()]
    return filtered_df.set_index('sequence', drop=False)

# Deduplicate each experiment type on its own
df_2A3_MaP = filter_identical_sequences(df_2A3_MaP)
df_DMS_MaP = filter_identical_sequences(df_DMS_MaP)

# Stack both results into a single frame numbered 0..n-1
cleared_train_data = pd.concat([df_2A3_MaP, df_DMS_MaP], ignore_index=True)

# The per-experiment frames are no longer needed; drop the names to release memory
del df_2A3_MaP, df_DMS_MaP

# Renumber the rows and discard "signal_to_noise", which was only needed for the selection above
cleared_train_data = cleared_train_data.reset_index(drop=True).drop(columns=['signal_to_noise'])

Reduce the memory footprint of cleared_train_data. Code inspired by: https://www.kaggle.com/code/saikiranvarma/reduce-memory-of-training-data-by-50

In [None]:
import numpy as np

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their value range.

    Integer columns are tried against int8, int16, int32 and int64; floating
    point columns against float16 and float32, falling back to float64.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    Only the value *range* is checked. Downcasting to float16/float32 reduces
    the precision of the stored values.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly -128..127 fits in int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Comparisons with NaN/inf are False, so such columns fall back to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting size (e.g. an empty frame).
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Reduce memory usage by downcasting each numeric column to a smaller dtype that still covers its value range
cleared_train_data = reduce_mem_usage(cleared_train_data)

Mem. usage decreased to 325.10 Mb (50.1% reduction)


Save the cleared train data into a csv file

In [None]:
# Save cleared_train_data as a CSV file on Google Drive (index=False leaves the row index out of the file)
csv_path = '/content/drive/My Drive/M2BI_DRIVE/cleared_train_data.csv'
cleared_train_data.to_csv(csv_path, index=False)

Load cleared_train_data

In [None]:
import pandas as pd
import numpy as np

# Path of the cleaned training CSV written by the previous step
csv_path = '/content/drive/My Drive/M2BI_DRIVE/cleared_train_data.csv'

# Load the CSV file as a pandas DataFrame
cleared_train_data = pd.read_csv(csv_path)

One-hot encode the nucleotides and, for each nucleotide position, combine the encoding with the two reactivity values (DMS_MaP and 2A3_MaP) in one row

In [None]:
# Define a function to process the entire DataFrame
def _first_row_reactivities(rows, column_names):
    """Return the values of ``column_names`` from the first row of ``rows`` as floats.

    Positions whose column does not exist in ``rows`` are NaN; if ``rows`` is
    empty, every position is NaN.
    """
    if rows.empty:
        return np.full(len(column_names), np.nan)
    available = [name for name in column_names if name in rows.columns]
    values = rows[available].iloc[0].reindex(column_names)
    return values.to_numpy(dtype=float)


def process_dataframe(df, alphabet=('A', 'C', 'G', 'U')):
    """Turn the wide per-sequence table into one per-position DataFrame per sequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'sequence' and 'experiment_type' plus the
        per-position columns 'reactivity_0001', 'reactivity_0002', ...
    alphabet : sequence of str, default ('A', 'C', 'G', 'U')
        Symbols to one-hot encode. Every returned frame has one
        'Nucleotide_<symbol>' column per entry, in this order, even when a
        symbol does not occur in the sequence. A character that is not in
        ``alphabet`` is encoded as an all-zero row.

    Returns
    -------
    list of pandas.DataFrame
        One frame per distinct sequence, in order of first appearance. Each
        frame has one row per position and the columns
        'Nucleotide_<symbol>'..., 'DMS_MaP_Reactivity', '2A3_MaP_Reactivity'.
        A reactivity is NaN when the sequence has no row for that experiment
        type or when the corresponding reactivity column does not exist.
    """
    result_list = []

    # A single grouping pass replaces one full scan of the frame per sequence.
    # sort=False keeps the sequences in order of first appearance.
    for sequence, sequence_data in df.groupby('sequence', sort=False):
        experiment = sequence_data['experiment_type']
        dms_map_row = sequence_data[experiment == 'DMS_MaP']
        a3_map_row = sequence_data[experiment == '2A3_MaP']

        # Position i (0-based) of the sequence is stored in column reactivity_<i+1>.
        reactivity_columns = [
            f'reactivity_{position:04d}' for position in range(1, len(sequence) + 1)
        ]

        # A categorical with fixed categories makes get_dummies emit the same
        # columns for every sequence, whichever nucleotides it contains.
        nucleotides = pd.Series(pd.Categorical(list(sequence), categories=list(alphabet)))
        nucleotide_encoded = pd.get_dummies(nucleotides, prefix='Nucleotide')

        reactivities = pd.DataFrame({
            'DMS_MaP_Reactivity': _first_row_reactivities(dms_map_row, reactivity_columns),
            '2A3_MaP_Reactivity': _first_row_reactivities(a3_map_row, reactivity_columns),
        })

        # One-hot columns first, reactivities last
        tensor_df = pd.concat([nucleotide_encoded, reactivities], axis=1)
        result_list.append(tensor_df)

    return result_list

# Apply the function to the entire DataFrame.
# NOTE: from here on `cleared_train_data` is a list of per-sequence DataFrames, no longer a single DataFrame.
cleared_train_data = process_dataframe(cleared_train_data)

Save cleared_train_data after one-hot encoding of nucleotides

In [None]:
import os

# Define a function to save DataFrames to CSV files
def save_dataframes_to_csv(cleared_train_data, output_directory):
    """Write every DataFrame of a list to its own CSV file.

    Parameters
    ----------
    cleared_train_data : iterable of pandas.DataFrame
        Frames to save. The i-th frame is written to 'sequence_<i>.csv'.
    output_directory : str
        Target folder. It is created (including parents) if it does not exist.
    """
    # to_csv does not create missing folders, so make sure the target exists first
    os.makedirs(output_directory, exist_ok=True)

    for i, df in enumerate(cleared_train_data):
        filepath = os.path.join(output_directory, f'sequence_{i}.csv')
        df.to_csv(filepath, index=False)

# Specify the output directory on Google Drive where the per-sequence CSV files are saved
output_directory = '/content/drive/My Drive/M2BI_DRIVE/one_hot_cleared_train_data/'

# Call the function to save the DataFrames to CSV files (one 'sequence_<i>.csv' file per DataFrame)
save_dataframes_to_csv(cleared_train_data, output_directory)

Load cleared_train_data after one-hot encoding of nucleotides

In [None]:
# Specify the directory where the per-sequence CSV files are located (the folder they were saved to above)
input_directory = '/content/drive/My Drive/M2BI_DRIVE/one_hot_cleared_train_data/'

# Define a function to load DataFrames from CSV files
def load_dataframes_from_csv(input_directory):
    """Load every '*.csv' file of a directory into a list of DataFrames.

    Parameters
    ----------
    input_directory : str
        Folder to read from. Files that do not end in '.csv' are ignored.

    Returns
    -------
    list of pandas.DataFrame
        One frame per CSV file, in natural filename order
        ('sequence_2.csv' before 'sequence_10.csv').
    """
    import re  # local import: only needed for the natural-sort key below

    def natural_key(filename):
        # re.split with a capturing group alternates text and digit runs;
        # the odd positions are the digit runs and are compared as integers.
        parts = re.split(r'(\d+)', filename)
        return [int(part) if position % 2 else part for position, part in enumerate(parts)]

    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and any seeded split performed on it) reproducible.
    csv_files = sorted(
        (f for f in os.listdir(input_directory) if f.endswith('.csv')),
        key=natural_key,
    )

    return [pd.read_csv(os.path.join(input_directory, filename)) for filename in csv_files]

# Call the function to reload the per-sequence DataFrames from the CSV files as a list
cleared_train_data = load_dataframes_from_csv(input_directory)

# **Model**
(To be corrected)

Load cleared train data

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Input, Dense, Masking, Concatenate
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np

# Split the list of DataFrames into training (80%) and validation (20%) sets; random_state fixes the shuffle.
# NOTE(review): this rebinds `train_data`, which earlier in the notebook referred to the raw Dask DataFrame.
train_data, val_data = train_test_split(cleared_train_data, test_size=0.2, random_state=42)

# Define a custom loss function that accounts for the mask
def custom_loss(y_true, y_pred):
    """Mean squared error computed only over the finite entries of ``y_true``.

    Non-finite targets (NaN/inf) mark positions without a label and are
    ignored. ``y_true`` and ``y_pred`` must have broadcast-compatible shapes.
    Returns a scalar tensor; it is 0 when ``y_true`` has no finite entry.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(y_true, y_pred.dtype)

    # Boolean mask of the positions that carry a usable target
    mask = tf.math.is_finite(y_true)

    # Replace the unusable targets before subtracting: NaN * 0 is still NaN,
    # so masking after the subtraction would poison the sum.
    safe_true = tf.where(mask, y_true, tf.zeros_like(y_true))
    squared_error = tf.square(safe_true - y_pred)

    # The boolean mask has to be cast before it can be multiplied or summed
    weights = tf.cast(mask, y_pred.dtype)

    # divide_no_nan returns 0 instead of NaN when no target is valid
    mse = tf.math.divide_no_nan(
        tf.reduce_sum(squared_error * weights),
        tf.reduce_sum(weights),
    )

    return mse

# The last N_TARGETS columns of every per-sequence DataFrame are the reactivities
# (the targets); the columns before them are the one-hot nucleotide features.
N_TARGETS = 2
n_features = len(cleared_train_data[0].columns) - N_TARGETS

# Define LSTM model architecture
input_layer = Input(shape=(None, n_features))  # Variable sequence length

# Timesteps whose features are all 0.0 are treated as padding and skipped
masked_input = Masking(mask_value=0.0)(input_layer)

# LSTM layer with return_sequences=True for sequence-to-sequence prediction
lstm_layer = LSTM(64, return_sequences=True)(masked_input)

# Output layer: one linear unit per reactivity target at every position
output_layer = Dense(N_TARGETS, activation='linear')(lstm_layer)

# Create the model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model with custom loss
model.compile(optimizer='adam', loss=custom_loss)


def pad_to_array(arrays, pad_value):
    """Stack 2-D arrays of different lengths into one (n, longest, width) float32 array.

    Rows beyond the end of a shorter array are filled with ``pad_value``.
    """
    longest = max(arr.shape[0] for arr in arrays)
    width = arrays[0].shape[1]
    batch = np.full((len(arrays), longest, width), pad_value, dtype=np.float32)
    for row, arr in enumerate(arrays):
        batch[row, :arr.shape[0], :] = arr
    return batch


def frames_to_arrays(frames):
    """Split per-sequence DataFrames into padded feature and target arrays."""
    features = [df.iloc[:, :-N_TARGETS].to_numpy(dtype=np.float32) for df in frames]
    targets = [df.iloc[:, -N_TARGETS:].to_numpy(dtype=np.float32) for df in frames]
    # Features are padded with 0.0 (skipped by the Masking layer); targets are
    # padded with NaN so that a loss which ignores non-finite targets skips them.
    return pad_to_array(features, 0.0), pad_to_array(targets, np.nan)


# Prepare the training and validation data
X_train, y_train = frames_to_arrays(train_data)
X_val, y_val = frames_to_arrays(val_data)

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=10)

# Sequences of different lengths are handled by padding them to a common length;
# the padded positions are masked in the model input and carry NaN targets.

Define features and targets

RNN model

Train the model using a "mini-batch" training loop to avoid RAM issues

# **Load test**

# **Check efficiency of the model**

# **Save submission**

# **Plot RNA structure**