<a href="https://www.kaggle.com/code/jeffreyesedo/1st-ribo-note?scriptVersionId=151009761" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# Setting up an RNA Science Environment
!pip install arnie
!pip install draw_rna

# Install EternaFold
!conda config --set auto_update_conda false
!conda install -c bioconda eternafold --yes
# Manually setup EternaFold for Kaggle notebook
%env ETERNAFOLD_PATH=/opt/conda/bin/eternafold-bin
%env ETERNAFOLD_PARAMETERS=/opt/conda/lib/eternafold-lib/parameters/EternaFoldParams.v1

In [None]:
import os
import psutil
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')


# Monitor memory usage
def memory_usage_in_gb():
    """Return a human-readable string with this process's resident memory in GB.

    The resident set size (RSS) of the current process is read through psutil,
    converted from bytes to gibibytes (2**30 bytes) and rounded to two decimals.
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    rss_gb = rss_bytes / (2. ** 30)
    return f'Memory usage: {np.round(rss_gb, 2)} GB'

In [None]:
from arnie.mfe import mfe
from arnie.bpps import bpps
from draw_rna.ipynb_draw import draw_struct

# Import Datasets

In [None]:
# Load the test sequences and the quick-start training file.
# Alternative training file from the same directory, currently disabled:
# train = pd.read_csv("/kaggle/input/stanford-ribonanza-rna-folding/train_data.csv")
test = pd.read_csv("/kaggle/input/stanford-ribonanza-rna-folding/test_sequences.csv")

train = pd.read_csv("/kaggle/input/stanford-ribonanza-rna-folding/train_data_QUICK_START.csv")

In [None]:
# Report the (rows, columns) shape of both raw frames, separated by a blank line.
print(
    f"Train dataset shape: {train.shape}\n",
    f"Test dataset shape: {test.shape}",
    sep="\n",
)

# Reduce Memory usage
Using the dataset as is consumes a lot of memory, causes the code to run slowly, and eventually crashes the notebook. The data therefore needs to be imported in a manner that reduces memory usage, improves speed, and retains the overall information of the original dataset.

In [None]:
# Optimize numeric data types
def opt_num(df):
    """Return a copy of ``df`` whose 64-bit numeric columns are downcast.

    Columns of dtype ``int64`` are downcast with ``downcast="integer"`` and
    columns of dtype ``float64`` with ``downcast="float"``. Every other column
    (including object columns) is left unchanged, and the input frame itself
    is not modified.
    """
    # Maps the dtype names that get optimized to the matching `downcast` argument.
    downcast_for = {"int64": "integer", "float64": "float"}

    optimized = df.copy()
    for name in optimized.columns:
        column = optimized[name]
        target = downcast_for.get(column.dtype.name)
        if target is not None:
            optimized[name] = pd.to_numeric(column, downcast=target)
    return optimized

In [None]:
# Build downcast copies of both raw frames.
opt_train, opt_test = (opt_num(frame) for frame in (train, test))

In [None]:
# Compare the dtypes/memory summary of a small slice before and after downcasting.
# DataFrame.info() prints its report itself and returns None, so it must be called
# as a statement; embedding it in an f-string prints "None" after the label.
print("Train Dataset:")
train.iloc[0:5, 0:10].info()
print("\nOptimized Dataset:")
opt_train.iloc[0:5, 0:10].info()

In [None]:
# Drop the raw frames (the downcast copies are used from here on) and run the
# garbage collector.
del train, test
gc.collect()

In [None]:
# Export Dataset as Parquet
# opt_train.to_parquet('train_data.parquet')
# opt_test.to_parquet('test_data.parquet')

# Import Parquet Dataset
# train_df = pd.read_parquet('/kaggle/working/train_data.parquet')
# train_df.head()

In [None]:
# Preview the first rows of the optimized training frame
opt_train.head()

In [None]:
# DataFrame.info() prints its report and returns None, so print the label first
# and call info() as its own statement (the f-string form printed "None").
print("Test Columns:")
opt_test.info()

In [None]:
# Count the columns of the optimized training frame by dtype
dtype_counts = opt_train.dtypes.value_counts()
print(dtype_counts)

In [None]:
# Number of training rows for each value of the "experiment_type" column
experiments_count= opt_train["experiment_type"].value_counts()
print(experiments_count)

In [None]:
# Keep only the float-typed columns of the training frame and show their names
float_columns = opt_train.select_dtypes(include=['float'])
float_columns.columns

In [None]:
# Null values in reactivity and reactivity_error
# float_columns.info(verbose=True, show_counts=True)

# print(float_columns["reactivity_error_0043"].isna().sum())
# print(float_columns["reactivity_0043"].isna().sum())

In [None]:
# Count float columns that contain only NaN versus columns holding at least one value.
# DataFrame.count() returns the number of non-NaN entries per column, so a column is
# "empty" exactly when its count is 0. The previous check (`isna().sum() == 0`)
# instead matched columns with no NaN at all, which contradicted the printed labels.
non_null_per_col = float_columns.count()
total_float_cols = float_columns.shape[1]

num_empty_cols = int((non_null_per_col == 0).sum())
cols_having_values = total_float_cols - num_empty_cols

print(f"Number of Columns with only NaN values: {num_empty_cols} of {total_float_cols} columns\n")
print(f"Number of Columns with values: {cols_having_values} of {total_float_cols} columns")

In [None]:
# Predict the minimum-free-energy structure of the first six training sequences
# with EternaFold. `train` was deleted in an earlier cell, so the optimized frame
# is used here (the raw name raised NameError on Restart & Run All).
for i in range(6):
    structure = mfe(opt_train.sequence.iloc[i], package="eternafold")
    experiment = opt_train.experiment_type.iloc[i]
    print(experiment, structure)

In [None]:
# Viewing the first six sequences
fig, axs = plt.subplots(2, 3, figsize=(15,14))

for i in range(6):
    ax = axs[i//3, i%3]  # Get the current axes
    sequence = opt_train.sequence.iloc[i]
    # Fold each sequence here: reusing the `structure` left over from an earlier
    # loop would pair every sequence with the structure of the last one folded.
    structure = mfe(sequence, package="eternafold")
    draw_struct(sequence, structure, ax=ax)  # Draw the structure on the current axes
    # `train` was deleted earlier in the notebook, so read from `opt_train`.
    ax.set_title(opt_train.experiment_type.iloc[i], loc='left', fontsize='medium')

plt.show()

In [None]:
# Length of every training sequence, then how often each length occurs.
len_seq = pd.Series(list(map(len, opt_train['sequence'])))
len_seq.value_counts()

The lengths of the RNA sequences in the training set are between 115 and 206, while the lengths in the test set are between 207 and 457. Part of the challenge is to find out whether the patterns recognized at lengths 115 to 206 will generalize to longer lengths ([response found here](https://www.kaggle.com/competitions/stanford-ribonanza-rna-folding/discussion/453147#2513582)).

In [None]:
# Select the reactivity and reactivity-error columns by name prefix rather than by
# position, so the selection does not depend on how many metadata columns precede
# them in the loaded file.
is_react_err_col = opt_train.columns.str.startswith("reactivity_error_")
is_react_col = opt_train.columns.str.startswith("reactivity_") & ~is_react_err_col

react = opt_train.columns[is_react_col]
react_err = opt_train.columns[is_react_err_col]

In [None]:
# Per-sequence totals of reactivity, reactivity error, and every column from
# position 4 onwards. The sums are reduced along axis=1 in one pass each; building
# an object Series per row with `.loc[i]` inside a list comprehension is far slower
# on a large frame. NaN entries are skipped (pandas default), so an all-NaN row sums to 0.
sum_dict = {
    "sum_react": opt_train[react].sum(axis=1),
    "sum_react_err": opt_train[react_err].sum(axis=1),
    "sum_seq_reactivity": opt_train.iloc[:, 4:].sum(axis=1),
}

seq_react = pd.DataFrame(sum_dict)

seq_react

In [None]:

# Row-wise descriptive statistics of the reactivity columns.
# Each statistic is a single axis=1 reduction over the selected columns instead of
# a Python loop that rebuilds every row with `.loc[i]`. NaN entries are skipped
# (pandas default).
reactivity_values = opt_train[react]

des_stats_reactivity = {
    "min": reactivity_values.min(axis=1),
    "max": reactivity_values.max(axis=1),
    "mean": reactivity_values.mean(axis=1),
    "median": reactivity_values.median(axis=1),
}
del reactivity_values  # release the temporary column selection

stats_reactivity = pd.DataFrame(des_stats_reactivity)

stats_reactivity

In [None]:
# Row-wise descriptive statistics of the reactivity-error columns.
# Each statistic is a single axis=1 reduction over the selected columns instead of
# a Python loop that rebuilds every row with `.loc[i]`. NaN entries are skipped
# (pandas default).
reactivity_err_values = opt_train[react_err]

des_stats_reactivity_err = {
    "min": reactivity_err_values.min(axis=1),
    "max": reactivity_err_values.max(axis=1),
    "mean": reactivity_err_values.mean(axis=1),
    "median": reactivity_err_values.median(axis=1),
    # Optional quantiles (disabled):
    # "0.25": reactivity_err_values.quantile(0.25, axis=1),
    # "0.5": reactivity_err_values.quantile(0.5, axis=1),
    # "0.75": reactivity_err_values.quantile(0.75, axis=1),
    # "0.90": reactivity_err_values.quantile(0.9, axis=1),
}
del reactivity_err_values  # release the temporary column selection

stats_reactivity_err = pd.DataFrame(des_stats_reactivity_err)

stats_reactivity_err

In [None]:
# Summary statistics of the "sequence" column
opt_train["sequence"].describe()

In [None]:
# Describe the characters of every sequence: each sequence is split into a Series
# of single characters and summarized with describe(), giving one row per sequence.
opt_train["sequence"].apply(lambda seq: pd.Series(list(seq)).describe())

In [None]:
# Split the sequence by experiment
exp_dms_map= opt_train[opt_train.experiment_type == "DMS_MaP"]
exp_2a3_map= opt_train[opt_train.experiment_type == "2A3_MaP"]

In [None]:
# Count the characters of every sequence, then add the per-sequence counts up
# into one total per character for each experiment type.
base_dms = exp_dms_map["sequence"].apply(lambda seq: pd.Series(list(seq)).value_counts()).sum()
base_2a3 = exp_2a3_map["sequence"].apply(lambda seq: pd.Series(list(seq)).value_counts()).sum()

# Side-by-side totals, one column per experiment type
bases = pd.DataFrame({"dms": base_dms, "2a3": base_2a3})

In [None]:
# Display the per-character totals for both experiment types ("dms" and "2a3")
bases

In [None]:
# Bar chart of the per-character totals of the DMS_MaP subset, styled through the
# Axes object returned by pandas.
ax = base_dms.plot.bar()
ax.set_xlabel('Base', fontsize = 12, fontweight = 'bold', color = 'darkblue')
ax.set_ylabel('Count', fontsize = 12, fontweight = 'bold', color = 'darkblue')
ax.set_title('Base Count', fontsize = 14, fontweight = 'bold', color = 'darkgreen')

In [None]:
# Frequency of each sequence length in the training frame
seq_len = opt_train["sequence"].apply(len).value_counts()
seq_len

In [None]:
# Bar chart of how many training sequences have each length.
# Fixes the misspelled labels ("Lenght") and adds the missing y-axis label so the
# figure can be read on its own.
seq_len.plot.bar()
plt.xlabel("Sequence Length")
plt.ylabel("Number of Sequences")
plt.title("Sequence Length Distribution")

In [None]:
# (opt_train.sequence.apply(lambda x: pd.Series(list(x)).value_counts())).plot.hist()

# Enrich Dataset 
Using just the sequence column of the training data won't suffice, so to enrich the dataset I will be using the following:
- Bpp Files
<!-- - 3D Coords -->
<!-- - Sequence lib -->
<!-- - OpenKnot metadata -->

In [None]:

# Collect the full path of every file below the BPP directory.
# The previous loop wrapped os.walk() in `tq`, a name that is never imported or
# defined in this notebook, so the cell raised NameError; the walk is consumed
# directly instead.
bpp_file = '/kaggle/input/stanford-ribonanza-rna-folding/Ribonanza_bpp_files/extra_data'
file_paths = [
    os.path.join(folder, file)
    for folder, _, files in os.walk(bpp_file)
    for file in files
]

In [None]:
# Rows of the optimized training frame for each experiment type
exp_dms = opt_train.loc[opt_train["experiment_type"].eq("DMS_MaP")]
exp_2a3 = opt_train.loc[opt_train["experiment_type"].eq("2A3_MaP")]

In [None]:
# Preview the first rows of the DMS_MaP subset
exp_dms.head()