## Feature Engineering
### Compute Structure / Loop Data for Train/Test

> Note: Only run this notebook if you don't already have the data files (train_newfeat2.csv, test_newfeat2.csv).
> This will take a long time (>8h).
> Not compatible with Windows because the required libraries have no Windows builds.

In [None]:
import os

if os.name == 'nt':
    raise NotImplementedError('No Builds for Windows')

!pip install -q --upgrade arnie forgi
!conda install -y -q -c bioconda viennarna eternafold

In [None]:
# Export the EternaFold binary and parameter-file locations as environment
# variables (presumably picked up by arnie's 'eternafold' backend used further
# down -- verify). The paths assume a conda installation rooted at /opt/conda.
%env ETERNAFOLD_PATH=/opt/conda/bin/eternafold-bin
%env ETERNAFOLD_PARAMETERS=/opt/conda/lib/eternafold-lib/parameters/EternaFoldParams.v1

In [None]:
import pandas as pd
import os

# Load the input table; later cells read its 'sequence' column.
df1 = pd.read_csv('../data/train_data.csv')

In [None]:
# Sneak peek: dimensions, a missing-value check, and the first row
print(df1.shape)
if not df1.isna().to_numpy().any():
    print('No missing values')
df1.head(1)

In [None]:
import gc

# Continue with an independent copy named `df`; the shapes of both frames are
# printed (while `df1` still exists) as a quick sanity check.
df = df1.copy()
print(df.shape, df1.shape)
# Drop the old name and trigger a garbage-collection pass right away.
del df1
gc.collect()

In [None]:
from concurrent.futures import ProcessPoolExecutor
from arnie.mfe import mfe
from tqdm import tqdm
import os
import pandas as pd
import multiprocessing

# Function to apply mfe to a chunk of data
def apply_mfe(chunk):
    """Add a 'structure' column computed with arnie's ``mfe`` to a chunk.

    Args:
        chunk: DataFrame with a 'sequence' column.

    Returns:
        A copy of ``chunk`` carrying an additional 'structure' column that
        holds ``mfe(sequence, package='eternafold')`` for every row. The
        frame passed in is left untouched.
    """
    # Copy first: ``chunk`` is usually a slice of a larger frame, and writing
    # a column into it would modify the caller's data (and can trigger
    # pandas' SettingWithCopyWarning). ``parallel_apply`` does the same.
    annotated = chunk.copy()
    annotated['structure'] = annotated['sequence'].apply(
        lambda seq: mfe(seq, package='eternafold')
    )
    return annotated

# Estimate memory usage of a single row in the DataFrame
sample_row = df.iloc[0:1]
sample_row_mem = sample_row.memory_usage(index=True, deep=True).sum()

# Upper bound for the chunk size: the number of rows (at the sampled row
# size) that fit into 80% of the physical memory reported by the OS.
total_memory = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
print(f'Total memory: {total_memory}')
max_chunk_size = int(0.8 * (total_memory / sample_row_mem))
# Aim for roughly one chunk per CPU core, but never let the size drop to 0:
# with fewer rows than cores the integer division yields 0 and the range()
# call below would raise "range() arg 3 must not be zero".
chunk_size = max(1, min(max_chunk_size, len(df) // multiprocessing.cpu_count()))
print(f'Chunk size: {chunk_size}, Max chunk size: {max_chunk_size}')

# Split DataFrame into consecutive row slices of at most chunk_size rows
df_chunks = [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]
print(f'Length of DataFrame chunks: {len(df_chunks)}')

# Process chunks in parallel; the progress bar advances once per finished chunk
with ProcessPoolExecutor() as executor:
    results = list(tqdm(executor.map(apply_mfe, df_chunks), total=len(df_chunks), desc='Processing'))

# Reassemble the processed chunks in their original order with a fresh index
result_df = pd.concat(results, ignore_index=True)


In [None]:
# Sneak peek: dimensions, a missing-value check, and the first row
print(result_df.shape)
if not result_df.isna().to_numpy().any():
    print('No missing values')
result_df.head(1)

In [None]:
# Checkpoint the frame with the new 'structure' column in the current working
# directory. The row index is written as well, since index=False is not passed.
result_df.to_csv('train_newfeat1.csv')

In [None]:
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
import forgi
import os
import pandas as pd

def label_sequence(seq, bg):
    """Build a per-residue loop-type string for a sequence.

    Each residue receives one letter, depending on the element of ``bg`` it
    is listed under:

        S  elements from ``bg.stem_iterator()``
        I  elements from ``bg.iloop_iterator()``
        M  elements from ``bg.mloop_iterator()``
        H  elements from ``bg.hloop_iterator()``
        E  elements from ``bg.floop_iterator()`` / ``bg.tloop_iterator()``
           (the 5' and 3' unpaired ends in forgi), and every residue that
           no element covers

    The groups are applied in the order above, so a residue reported by
    several groups keeps the letter of the last one.

    Args:
        seq: The nucleotide sequence. Only its length influences the result.
        bg: Graph object providing the ``*_iterator`` methods above and
            ``define_residue_num_iterator(element)``, which yields 1-based
            residue numbers.

    Returns:
        A string of ``len(seq)`` letters from ``S``, ``I``, ``M``, ``H``, ``E``.

    Raises:
        ValueError: If ``bg`` reports a residue number outside
            ``1..len(seq)``, i.e. the structure does not match the sequence.
    """
    length = len(seq)
    # Start from the fallback letter instead of the raw sequence: a residue
    # not covered by any element then ends up as 'E' even when its nucleotide
    # code happens to be one of the label letters (e.g. IUPAC 'S', 'M', 'H').
    labels = ['E'] * length

    element_groups = (
        (bg.stem_iterator, 'S'),
        (bg.iloop_iterator, 'I'),
        (bg.mloop_iterator, 'M'),
        (bg.hloop_iterator, 'H'),
        (bg.floop_iterator, 'E'),
        (bg.tloop_iterator, 'E'),
    )
    for iter_elements, letter in element_groups:
        for element in iter_elements():
            for rn in bg.define_residue_num_iterator(element):
                # Residue numbers are 1-based; reject anything that would
                # otherwise wrap around or fall off the end of the list.
                if not 1 <= rn <= length:
                    raise ValueError(
                        f'residue number {rn} of element {element!r} is outside '
                        f'the sequence (length {length})'
                    )
                labels[rn - 1] = letter

    # A single join replaces rebuilding the whole string once per residue.
    return ''.join(labels)

def get_loop_type(row):
    """Compute the loop-type string for one DataFrame row.

    ``row['structure']`` is handed to ``forgi.load_rna``; exactly one graph
    is expected back (the unpacking fails otherwise). That graph is then
    used to label ``row['sequence']`` through ``label_sequence``.

    Args:
        row: Mapping-like row with 'structure' and 'sequence' entries.

    Returns:
        The value returned by ``label_sequence`` for this row.
    """
    (graph,) = forgi.load_rna(row['structure'])
    return label_sequence(row['sequence'], graph)

def parallel_apply(df_chunk):
    """Annotate a chunk of rows with their predicted loop types.

    Args:
        df_chunk: DataFrame whose rows are accepted by ``get_loop_type``.

    Returns:
        A copy of ``df_chunk`` with a 'predicted_loop_type' column holding the
        row-wise result of ``get_loop_type``; the input is not modified.
    """
    annotated = df_chunk.copy()
    loop_types = annotated.apply(get_loop_type, axis=1)
    annotated['predicted_loop_type'] = loop_types
    return annotated

# Estimate memory usage of a single row in the DataFrame
sample_row = result_df.iloc[0:1]
sample_row_mem = sample_row.memory_usage(index=True, deep=True).sum()

# Upper bound for the chunk size: the number of rows (at the sampled row
# size) that fit into 80% of the physical memory reported by the OS.
total_memory = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
max_chunk_size = int(0.8 * (total_memory / sample_row_mem))
# os.cpu_count() may return None when the core count cannot be determined.
n_workers = os.cpu_count() or 1
# Aim for roughly one chunk per CPU core, but never let the size drop to 0:
# with fewer rows than cores the integer division yields 0 and the range()
# call below would raise "range() arg 3 must not be zero".
chunk_size = max(1, min(max_chunk_size, len(result_df) // n_workers))
print(f'Chunk size: {chunk_size}')

# Split DataFrame into consecutive row slices of at most chunk_size rows
df_chunks = [result_df[i:i + chunk_size] for i in range(0, len(result_df), chunk_size)]

# Parallel computation; the progress bar advances once per finished chunk
with ProcessPoolExecutor() as executor:
    results = list(tqdm(executor.map(parallel_apply, df_chunks), total=len(df_chunks), desc='Processing'))

# Reassemble the processed chunks in their original order with a fresh index
result_df = pd.concat(results, ignore_index=True)


In [None]:
# Sneak peek: dimensions, a missing-value check, and the first row
print(result_df.shape)
if not result_df.isna().to_numpy().any():
    print('No missing values')
result_df.head(1)

In [None]:
# Save the final features under their own file name. Do NOT write back to
# '../data/train_data.csv': that is the raw input read at the top of this
# notebook, and overwriting it destroys the source data and makes a re-run
# start from an already-processed file. 'train_newfeat2.csv' is the output
# name announced in the note at the top of the notebook.
result_df.to_csv('../data/train_newfeat2.csv')