# This file is used for all LLM-based dataset creation

In [2]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from random import randint
from sklearn.model_selection import train_test_split
import os

## constants

In [48]:
# Value passed as the `encoding` argument of DataTypeDescription when the
# datasets are created.
HG38_ENCODING = "HG38"
# Dataset selector names. BEND_CPG_NAME is not referenced by the code visible
# in this file view.
BEND_CPG_NAME = "bend_cpg"
UNMASKED_CPG_NAME = "table_browser_hg38_unmasked_CpG"
MR_DNA_50_NAME = "MR-DNA"
# Selects which dataset the creation cell builds (compared against the *_NAME
# constants above). The identifier's spelling is kept because other cells
# reference it.
DATASET_TO_CTREATE = MR_DNA_50_NAME
# NOTE(review): CPG_PATH is not read by the code visible here — presumably
# the input for the BEND CpG dataset; confirm.
CPG_PATH = "/sci/archive/michall/roeizucker/LLM_datasets/bend_benchmark/cpg_methylation.bed"
# Passed as the `amount` argument of DataTypeDescription, which only stores it.
CPG_AMOUNT = 1
# DATA_CLASSES = [CPG_DATA]
# FASTA file parsed into `chromosome_seq`. NOTE(review): the variable says
# HG19 but the path points at hg38.fa — confirm which assembly is intended.
HG19_PATH = "/sci/archive/michall/roeizucker/reference_genome/hg38.fa"

# NOTE(review): SHUFFLE is not read by the code visible here; the creation
# cell passes a literal False as the shuffle argument.
SHUFFLE = False
# Output CSV files written by the creation cell. The names hard-code
# "MR_DNA_50" regardless of which dataset is selected.
TRAIN_PATH = "MR_DNA_50_train.csv"
TEST_PATH = "MR_DNA_50_test.csv"

In [4]:
class DataTypeDescription:
    """Bundle raw dataset input with the function that splits it into train/test.

    The object only stores its arguments; all work is delegated to
    ``extractor_function`` when :meth:`get_train_test` is called.
    """

    def __init__(self, name, df, amount, extractor_function, encoding, train_amount, shuffle=False):
        """Store the dataset description.

        Args:
            name: Human-readable dataset name (stored only).
            df: Raw input handed to ``extractor_function``; a DataFrame or,
                for pre-split datasets, a list of DataFrames.
            amount: Stored but not used yet.
            extractor_function: Callable ``(df, train_amount, shuffle)`` that
                returns the ``(train, test)`` pair.
            encoding: Encoding/assembly tag (stored only).
            train_amount: Forwarded to ``extractor_function``.
            shuffle: Forwarded to ``extractor_function``. Defaults to False
                so the older six-argument call form keeps working.
        """
        self.name = name
        self.df = df
        # TODO: make amount work
        self.amount = amount
        self.extractor_function = extractor_function
        self.encoding = encoding
        self.train_amount = train_amount
        self.shuffle = shuffle

    def get_train_test(self):
        """Run the extractor and return whatever it returns (train, test)."""
        return self.extractor_function(self.df, self.train_amount, self.shuffle)



## HG38 creation

In [4]:
# Specify your file, chromosome, and positions
# fasta_file = "hg38.fa"

# Map each FASTA record id to its full sequence, held in memory.
chromosome_seq = {}

with open(HG19_PATH, "r") as fasta_handle:
    # record.seq[:] takes a full-length slice of every record's sequence.
    chromosome_seq.update(
        (fasta_record.id, fasta_record.seq[:])
        for fasta_record in SeqIO.parse(fasta_handle, "fasta")
    )


In [1]:
# NOTE(review): prints a fixed marker string; looks like a liveness check — confirm or remove.
print("on")

on


## Classes definitions

### table_browser_hg38_unmasked_CpG class

In [1]:
# Target size used by apply_add_pedding: each region receives
# RESULT_SEQ_SIZE - length bases of padding in total.
RESULT_SEQ_SIZE = 6000
# CSV read with pd.read_csv(sep=",") when the unmasked-CpG dataset is selected.
UNMASKED_CPG_PATH = "/sci/nosnap/michall/roeizucker/jupyter_notebooks_backup/Tom_Hope_Project/table_browser_hg38_unmasked_CpG.csv"
# Passed as `train_amount`; cpg_extractor uses 1 - train_amount as test_size.
UNMASKED_CPG_TRAIN_AMOUNT = 0.75
# sequence_length = 
def apply_add_pedding(row, seq_size=None):
    """Pick randomly placed padding around a region and return the new bounds.

    A total of ``seq_size - row["length"]`` bases of padding is split at
    random between the left and right side of the region. The left share is
    capped at ``row["chromStart"]`` so the new start can never be negative;
    a negative start would later be treated as a wrap-around index when the
    bounds are used to slice the chromosome sequence.

    Args:
        row: Mapping or Series with ``length``, ``chromStart`` and ``chromEnd``.
        seq_size: Target size the region is padded to. Defaults to the
            module-level ``RESULT_SEQ_SIZE``.

    Returns:
        A two-element list ``[new_start, new_end]``.

    Raises:
        ValueError: If ``row["length"]`` exceeds ``seq_size`` (raised by
            ``randint`` for the resulting empty range).
    """
    if seq_size is None:
        seq_size = RESULT_SEQ_SIZE
    to_fill = seq_size - row["length"]
    # Never pad further left than position 0 of the chromosome.
    max_left = min(to_fill, row["chromStart"])
    to_fill_left = randint(0, max_left)
    to_fill_right = to_fill - to_fill_left
    return [row["chromStart"] - to_fill_left, row["chromEnd"] + to_fill_right]

def apply_create_sequence(row):
    """Return the upper-cased sequence of the row's padded window.

    Looks up ``row["chrom"]`` in the module-level ``chromosome_seq`` mapping
    and slices it with ``row["new_start"]:row["new_end"]``.
    """
    whole_chromosome = chromosome_seq[row["chrom"]]
    window = whole_chromosome[row["new_start"]:row["new_end"]]
    return str(window).upper()

def apply_create_labels(row):
    """Build the per-base label string for the row's padded window.

    Every position of the window ``new_start:new_end`` gets ``"1"`` when it
    lies inside ``chromStart:chromEnd`` and ``"0"`` otherwise. The region is
    clipped to the window, so the result always has ``new_end - new_start``
    characters, even when the window was shifted past the region's start and
    the row's ``length`` column no longer matches the visible part.

    Args:
        row: Mapping or Series with ``chromStart``, ``chromEnd``,
            ``new_start`` and ``new_end``.

    Returns:
        A string of ``"0"``/``"1"`` characters, one per window position.
    """
    window_start = row["new_start"]
    window_end = row["new_end"]
    # Clip the labelled region to the window on both sides.
    region_start = min(max(row["chromStart"], window_start), window_end)
    region_end = min(max(row["chromEnd"], region_start), window_end)
    return (
        "0" * (region_start - window_start)
        + "1" * (region_end - region_start)
        + "0" * (window_end - region_end)
    )

# Function to remove overlapping rows
def remove_overlapping_rows(group):
    """Greedily keep rows whose window starts after the last kept window ends.

    Rows are visited in the order given. The first row is always kept; a later
    row is kept only when its ``new_start`` is strictly greater than the
    ``new_end`` of the most recently kept row.

    Returns:
        A new DataFrame built from the kept rows.
    """
    kept = []
    for _, candidate in group.iterrows():
        if kept:
            last_end = kept[-1]['new_end']
            if not candidate['new_start'] > last_end:
                continue
        kept.append(candidate)
    return pd.DataFrame(kept)


def _expand_with_shifted_windows(train_df, n_shifts):
    """Return train_df plus, per row, n_shifts copies with the window moved right by 1..n_shifts."""
    expanded_rows = []
    for _, row in train_df.iterrows():
        # Keep the original row.
        expanded_rows.append(row.copy())
        for offset in range(1, n_shifts + 1):
            shifted = row.copy()
            shifted['new_start'] = row['new_start'] + offset
            shifted['new_end'] = row['new_end'] + offset
            # When the window now starts inside the region, clip the region to
            # the window and keep `length` in sync with the clipped bounds.
            if shifted['new_start'] > row['chromStart']:
                shifted['chromStart'] = shifted['new_start']
                shifted['length'] = max(0, row['chromEnd'] - shifted['chromStart'])
            expanded_rows.append(shifted)
    return pd.DataFrame(expanded_rows)


def cpg_extractor(df, train_amount, shuffle, max_length=4000, n_shifts=5):
    """Turn a table of regions into train/test frames with sequences and labels.

    Steps: drop regions with ``length >= max_length``, pad each remaining
    region to a random window (``apply_add_pedding``), drop windows that
    overlap an earlier window on the same chromosome, split into train/test,
    optionally augment the training rows with shifted windows, then add the
    ``seq`` and ``labels`` columns to both parts.

    Args:
        df: DataFrame with at least ``chrom``, ``chromStart``, ``chromEnd``
            and ``length`` columns.
        train_amount: Fraction of rows for training; ``1 - train_amount`` is
            passed to ``train_test_split`` as ``test_size``.
        shuffle: When true, every training row is additionally copied
            ``n_shifts`` times with its window shifted right by 1..n_shifts.
        max_length: Regions at least this long are discarded.
        n_shifts: Number of shifted copies per training row when ``shuffle``
            is true.

    Returns:
        ``(train_df, test_df)``.
    """
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the caller's DataFrame.
    df = df[df["length"] < max_length].copy()
    df[["new_start", "new_end"]] = df.apply(apply_add_pedding, axis=1, result_type="expand")
    df = df.sort_values(by=['chrom', 'new_start']).reset_index(drop=True)
    # Iterate the groups explicitly so each group keeps its `chrom` column;
    # groupby.apply passing the grouping column is deprecated in newer pandas.
    df = pd.concat([remove_overlapping_rows(group) for _, group in df.groupby('chrom')])

    train_df, test_df = train_test_split(df, test_size=(1 - train_amount))
    train_df = train_df.copy()
    test_df = test_df.copy()

    if shuffle:
        train_df = _expand_with_shifted_windows(train_df, n_shifts)

    for part in (train_df, test_df):
        part["seq"] = part.apply(apply_create_sequence, axis=1)
        part["labels"] = part.apply(apply_create_labels, axis=1)

    return train_df, test_df
# CPG_DATA = DataTypeDescription("table_browser_hg38_unmasked_CpG",pd.read_csv(UNMASKED_CPG_PATH,sep=","),CPG_AMOUNT,cpg_extractor,HG38_ENCODING,UNMASKED_CPG_TRAIN_AMOUNT)
# train,test = CPG_DATA.get_train_test()

### MR-DNA50 class

In [85]:
# Directory holding the MR-DNA-50 files; joined with train_name / test_name
# and read as tab-separated files when this dataset is selected.
MR_DNA_50_PATH = "/sci/nosnap/michall/roeizucker/MR-DNA/database/MR-DNA-50"
train_name = "train.txt"
test_name = "test.txt"
# Passed as `train_amount`; mr_dna_50_dataset_extractor does not read it
# because it receives separate train and test frames.
MR_DNA_50_CPG_TRAIN_AMOUNT = 0.75

def handle_df(df):
    """Add ``labels`` and ``seq`` columns to ``df`` in place and return it.

    ``labels`` is computed per row by ``apply_get_labels``; the ``text``
    column is then moved to a new ``seq`` column (``text`` is removed).
    """
    df["labels"] = df.apply(apply_get_labels, axis=1)
    # pop() removes "text" from the frame and hands back its values.
    df["seq"] = df.pop("text")
    return df

def apply_get_labels(row):
    """Build a ``"0"``/``"1"`` label string marking the listed positions.

    ``row["methy_pos"]`` is a string such as ``"[3, 17]"``; each listed
    integer is used as an index into ``row["text"]`` and marked with ``"1"``.
    An empty list (``"[]"``) and comma separators with or without a following
    space are both accepted.

    Args:
        row: Mapping or Series with ``methy_pos`` (str) and ``text``.

    Returns:
        A string with one character per character of ``row["text"]``.

    Raises:
        ValueError: If a listed position is not an integer.
        IndexError: If a position lies outside ``row["text"]``.
    """
    tokens = row["methy_pos"].strip("[]").split(",")
    # Skip empty tokens so "[]" yields no positions; int() tolerates the
    # surrounding whitespace left by a ", " separator.
    positions = [int(token) for token in tokens if token.strip()]

    labels = ["0"] * len(row["text"])
    for pos in positions:
        labels[pos] = "1"
    return "".join(labels)
    
def mr_dna_50_dataset_extractor(df_values, train_amount, shuffle):
    """Convert the pre-split MR-DNA-50 frames into labelled train/test frames.

    The test frame's ``sequence`` column is moved to ``text`` so both frames
    share one layout, then ``handle_df`` adds ``labels``/``seq`` to each.
    Both input frames are modified in place.

    Args:
        df_values: Sequence ``[df_train, df_test]``.
        train_amount: Unused; the data arrives already split.
        shuffle: Must be false; augmentation is not implemented for this
            dataset.

    Returns:
        ``(train_df, test_df)``.

    Raises:
        NotImplementedError: If ``shuffle`` is true. Previously this path
            returned ``None``, which failed later when the caller unpacked it.
    """
    # Fail before touching the inputs so a rejected call leaves them intact.
    if shuffle:
        raise NotImplementedError("shuffle is not supported for the MR-DNA-50 dataset")

    df_train = df_values[0]
    df_test = df_values[1]

    df_test["text"] = df_test["sequence"]
    df_test.drop(["sequence"], axis=1, inplace=True)
    return handle_df(df_train), handle_df(df_test)


In [87]:
# df_train.head(30)["methy_pos"].apply(lambda x : x[0],axis=1)
# df_test.head(30)

## Create datasets

In [89]:
# Build the dataset description selected by DATASET_TO_CTREATE, extract the
# train/test frames and write them to TRAIN_PATH / TEST_PATH as CSV.
if DATASET_TO_CTREATE == UNMASKED_CPG_NAME:
    dataset_manager = DataTypeDescription("table_browser_hg38_unmasked_CpG",pd.read_csv(UNMASKED_CPG_PATH,sep=","),CPG_AMOUNT,cpg_extractor,HG38_ENCODING,UNMASKED_CPG_TRAIN_AMOUNT,False)
elif DATASET_TO_CTREATE == MR_DNA_50_NAME:
    df_train = pd.read_csv(os.path.join(MR_DNA_50_PATH,train_name),sep="\t")
    df_test = pd.read_csv(os.path.join(MR_DNA_50_PATH,test_name),sep="\t")
    dataset_manager = DataTypeDescription("MR-DNA-50",[df_train,df_test],CPG_AMOUNT,mr_dna_50_dataset_extractor,HG38_ENCODING,MR_DNA_50_CPG_TRAIN_AMOUNT,False)
else:
    # Without this branch an unsupported selection left dataset_manager
    # undefined and failed below with a NameError.
    raise ValueError(f"Unsupported dataset selection: {DATASET_TO_CTREATE!r}")
train_df,test_df = dataset_manager.get_train_test()
train_df.to_csv(TRAIN_PATH)
test_df.to_csv(TEST_PATH)


In [8]:
# TODO: add saving to local files if not existing, like in segment_test