In [2]:
# Setting up an RNA Science Environment
!pip install arnie
!pip install draw_rna

# Install EternaFold
!conda config --set auto_update_conda false
!conda install -c bioconda eternafold --yes
# Manually setup EternaFold for Kaggle notebook
%env ETERNAFOLD_PATH=/opt/conda/bin/eternafold-bin
%env ETERNAFOLD_PARAMETERS=/opt/conda/lib/eternafold-lib/parameters/EternaFoldParams.v1

Collecting argparse>=1.4 (from draw_rna)
  Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Installing collected packages: argparse
Successfully installed argparse-1.4.0
Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.7.3
  latest version: 23.9.0

Please update conda by running

    $ conda update -n base -c conda-forge conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.9.0



# All requested packages already installed.

env: ETERNAFOLD_PATH=/opt/conda/bin/eternafold-bin
env: ETERNAFOLD_PARAMETERS=/opt/conda/lib/eternafold-lib/parameters/EternaFoldParams.v1


In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


from arnie.mfe import mfe # Minimum Free Energy (mfe) function makes structure predictions about a given sequence
from arnie.bpps import bpps # function that predicts the probability of every possible base pairing
from draw_rna.ipynb_draw import draw_struct # plot RNA structures in 2D

# Input data files are available in the read-only "../input/" directory.
# For example, running the commented-out snippet below (by clicking run or pressing Shift+Enter) will list all files under the input directory:

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/), which is preserved as output when you create a version using "Save & Run All".
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session.

# Import Datasets

In [4]:
# Read the two competition CSV files into DataFrames: the training table and
# the table of test sequences.
train = pd.read_csv("/kaggle/input/stanford-ribonanza-rna-folding/train_data.csv")
test = pd.read_csv("/kaggle/input/stanford-ribonanza-rna-folding/test_sequences.csv")

In [27]:
# Report (rows, columns) for both frames. The blank line between the two
# reports comes from the "\n" that ends the first message plus the separator.
print(
    f"Train dataset shape: {train.shape}\n",
    f"Test dataset shape: {test.shape}",
    sep="\n",
)

Train dataset shape: (1643680, 419)

Test dataset shape: (1343823, 5)


In [5]:
# Preview the first five rows of the training table.
train.head(n=5)

Unnamed: 0,sequence_id,sequence,experiment_type,dataset_name,reads,signal_to_noise,SN_filter,reactivity_0001,reactivity_0002,reactivity_0003,...,reactivity_error_0197,reactivity_error_0198,reactivity_error_0199,reactivity_error_0200,reactivity_error_0201,reactivity_error_0202,reactivity_error_0203,reactivity_error_0204,reactivity_error_0205,reactivity_error_0206
0,8cdfeef009ea,GGGAACGACUCGAGUAGAGUCGAAAAACGUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,2343,0.944,0,,,,...,,,,,,,,,,
1,51e61fbde94d,GGGAACGACUCGAGUAGAGUCGAAAAACAUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,5326,1.933,1,,,,...,,,,,,,,,,
2,25ce8d5109cd,GGGAACGACUCGAGUAGAGUCGAAAAACCUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,4647,2.347,1,,,,...,,,,,,,,,,
3,07dcfb6d1965,GGGAACGACUCGAGUAGAGUCGAAAAACUUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,102843,11.824,1,,,,...,,,,,,,,,,
4,e561cc042a4c,GGGAACGACUCGAGUAGAGUCGAAAAACGAUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3,7665,3.519,1,,,,...,,,,,,,,,,


In [28]:
# Preview the first five rows of the test table.
test.head(n=5)

Unnamed: 0,id_min,id_max,sequence_id,sequence,future
0,0,176,eee73c1836bc,GGGAACGACUCGAGUAGAGUCGAAAAUUUCCUUCCAAAUCCUGAGG...,0
1,177,353,d2a929af7a97,GGGAACGACUCGAGUAGAGUCGAAAAUGUAAUCAGAUUGCUUCUCC...,0
2,354,530,d39a4425ff45,GGGAACGACUCGAGUAGAGUCGAAAAAACACAUGAAUUUGAGGGUU...,0
3,531,707,1fc41e92d553,GGGAACGACUCGAGUAGAGUCGAAAAUCAGAGCUGGCAAAUGGAUG...,0
4,708,884,1d0826fb892f,GGGAACGACUCGAGUAGAGUCGAAAAUUUGGUAUUUGAUGCAUUAA...,0


In [7]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# interpolating it into an f-string printed a stray "Test Columns: None"
# after the report. Print the label first, then call info() on its own.
print("Test Columns:")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1343823 entries, 0 to 1343822
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   id_min       1343823 non-null  int64 
 1   id_max       1343823 non-null  int64 
 2   sequence_id  1343823 non-null  object
 3   sequence     1343823 non-null  object
 4   future       1343823 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 51.3+ MB
Test Columns: None


In [6]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# interpolating it into an f-string printed a stray "Train Columns: None"
# after the report. Print the label first, then call info() on its own.
print("Train Columns:")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1643680 entries, 0 to 1643679
Columns: 419 entries, sequence_id to reactivity_error_0206
dtypes: float64(413), int64(2), object(4)
memory usage: 5.1+ GB
Train Columns: None


In [8]:
# Count columns based on their Dtype
dtype_counts = train.dtypes.value_counts()
print(dtype_counts)

float64    413
object       4
int64        2
Name: count, dtype: int64


In [9]:
# Columns with int dtype
numeric_columns = train.select_dtypes(include=['int64'])
numeric_columns.head()

Unnamed: 0,reads,SN_filter
0,2343,0
1,5326,1
2,4647,1
3,102843,1
4,7665,1


In [10]:
# Print the summary of the integer-typed subset: per-column non-null counts,
# dtypes and memory usage.
numeric_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1643680 entries, 0 to 1643679
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   reads      1643680 non-null  int64
 1   SN_filter  1643680 non-null  int64
dtypes: int64(2)
memory usage: 25.1 MB


In [11]:
# Columns with Str dtype
str_columns = train.select_dtypes(include=['object'])
print(str_columns.columns)
str_columns.head()

Index(['sequence_id', 'sequence', 'experiment_type', 'dataset_name'], dtype='object')


Unnamed: 0,sequence_id,sequence,experiment_type,dataset_name
0,8cdfeef009ea,GGGAACGACUCGAGUAGAGUCGAAAAACGUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3
1,51e61fbde94d,GGGAACGACUCGAGUAGAGUCGAAAAACAUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3
2,25ce8d5109cd,GGGAACGACUCGAGUAGAGUCGAAAAACCUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3
3,07dcfb6d1965,GGGAACGACUCGAGUAGAGUCGAAAAACUUUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3
4,e561cc042a4c,GGGAACGACUCGAGUAGAGUCGAAAAACGAUGAUAUGGAUUUACUC...,2A3_MaP,15k_2A3


In [12]:
# Print the summary of the object-typed subset: per-column non-null counts,
# dtypes and memory usage.
str_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1643680 entries, 0 to 1643679
Data columns (total 4 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   sequence_id      1643680 non-null  object
 1   sequence         1643680 non-null  object
 2   experiment_type  1643680 non-null  object
 3   dataset_name     1643680 non-null  object
dtypes: object(4)
memory usage: 50.2+ MB


In [13]:
# Keep only the float64-typed columns and show their names.
float_columns = train.select_dtypes(include="float64")
float_columns.columns

Index(['signal_to_noise', 'reactivity_0001', 'reactivity_0002',
       'reactivity_0003', 'reactivity_0004', 'reactivity_0005',
       'reactivity_0006', 'reactivity_0007', 'reactivity_0008',
       'reactivity_0009',
       ...
       'reactivity_error_0197', 'reactivity_error_0198',
       'reactivity_error_0199', 'reactivity_error_0200',
       'reactivity_error_0201', 'reactivity_error_0202',
       'reactivity_error_0203', 'reactivity_error_0204',
       'reactivity_error_0205', 'reactivity_error_0206'],
      dtype='object', length=413)

In [20]:
# Leave out signal_to_noise, then list every remaining float column with its
# non-null count (verbose output is forced because the frame is wide).
float_columns.drop(columns='signal_to_noise').info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1643680 entries, 0 to 1643679
Data columns (total 412 columns):
 #    Column                 Non-Null Count    Dtype  
---   ------                 --------------    -----  
 0    reactivity_0001        0 non-null        float64
 1    reactivity_0002        0 non-null        float64
 2    reactivity_0003        0 non-null        float64
 3    reactivity_0004        0 non-null        float64
 4    reactivity_0005        0 non-null        float64
 5    reactivity_0006        0 non-null        float64
 6    reactivity_0007        0 non-null        float64
 7    reactivity_0008        0 non-null        float64
 8    reactivity_0009        0 non-null        float64
 9    reactivity_0010        0 non-null        float64
 10   reactivity_0011        0 non-null        float64
 11   reactivity_0012        0 non-null        float64
 12   reactivity_0013        0 non-null        float64
 13   reactivity_0014        0 non-null        float64
 14   

In [24]:
# Number of missing values in the first reactivity column.
float_columns['reactivity_0001'].isna().sum()

1643680

In general, the task is to predict the reactivity of an RNA sequence to two chemical modifiers, DMS and 2A3. In the training dataset, the columns relevant to this prediction are `experiment_type` and the per-position columns `reactivity_0001`, `reactivity_0002`, …, together with `reactivity_error_0001`, `reactivity_error_0002`, ….

In [15]:
# Count how many training rows belong to each experiment type.
experiments_count= train["experiment_type"].value_counts()
print(experiments_count)

experiment_type
2A3_MaP    821840
DMS_MaP    821840
Name: count, dtype: int64
