# this will be used for all LLM based dataset creation

In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from random import randint
from sklearn.model_selection import train_test_split

## constants

In [2]:
HG38_ENCODING = "HG38"
BEND_CPG_NAME = "bend_cpg"
UNMASKED_CPG_NAME = "table_browser_hg38_unmasked_CpG"
MR_DNA_50_NAME = "MR-DNA"
DATASET_TO_CTREATE = MR_DNA_50_NAME
CPG_PATH = "/sci/archive/michall/roeizucker/LLM_datasets/bend_benchmark/cpg_methylation.bed"
CPG_AMOUNT = 1
# DATA_CLASSES = [CPG_DATA]
HG19_PATH = "/sci/archive/michall/roeizucker/reference_genome/hg38.fa"

SHUFFLE = False
TRAIN_PATH = "table_browser_hg38_unmasked_CpG_train.csv"
TEST_PATH = "table_browser_hg38_unmasked_CpG_test.csv"

In [3]:
class DataTypeDescription:
    def __init__(self,name,df,amount,extractor_function,encoding,train_amount,shuffle):
        self.name=name
        self.df=df
        # TODO: make amount work
        self.amount = amount
        self.extractor_function = extractor_function
        self.encoding = encoding
        self.train_amount = train_amount
        self.shuffle = shuffle
        pass
    def get_train_test(self):
        return self.extractor_function(self.df,self.train_amount,self.shuffle)



## HG38 creation

In [4]:
# Specify your file, chromosome, and positions
# fasta_file = "hg38.fa"

chromosome_seq = {}
# Parse the FASTA file

with open(HG19_PATH, "r") as file:
    for record in SeqIO.parse(file, "fasta"):
        # print(record.id)
        chromosome_seq[record.id] = record.seq[:]
        # if record.id == chromosome:
        #     # Extract the sequence
        #     sequence = record.seq[start-1:end]  # Biopython uses 0-based indexing
        #     print(f"Sequence from {chromosome}:{start}-{end}:")
        #     print(sequence)
        #     break


## Classes definitions

### table_browser_hg38_unmasked_CpG class

In [1]:
RESULT_SEQ_SIZE = 6000
UNMASKED_CPG_PATH = "/sci/nosnap/michall/roeizucker/jupyter_notebooks_backup/Tom_Hope_Project/table_browser_hg38_unmasked_CpG.csv"
UNMASKED_CPG_TRAIN_AMOUNT = 0.75
# sequence_length = 
def apply_add_pedding(row):
    to_fill =  RESULT_SEQ_SIZE - row["length"]
    to_fill_left = randint(0,to_fill)
    to_fill_right = to_fill - to_fill_left
    return [row["chromStart"] - to_fill_left,row["chromEnd"] + to_fill_right]

def apply_create_sequence(row):
    return str(chromosome_seq[row["chrom"]][row["new_start"]:row["new_end"]]).upper()
    # pass

def apply_create_labels(row):
    # return ( row["chromStart"] - row["new_start"] ,row["new_end"] - row["chromEnd"])
    return "0" * (row["chromStart"] - row["new_start"]) + "1" * row["length"] + "0" * (row["new_end"] - row["chromEnd"] )
    pass

# Function to remove overlapping rows
def remove_overlapping_rows(group):
    result = []
    
    for index, row in group.iterrows():
        if not result or row['new_start'] > result[-1]['new_end']:
            result.append(row)
    return pd.DataFrame(result)


def cpg_extractor(df,train_amount,shuffle):
    df = df[df["length"] < 4000]
    df[["new_start","new_end"]] = df.apply(apply_add_pedding,axis=1, result_type="expand")
    df = df.sort_values(by=['chrom', 'new_start']).reset_index(drop=True)
    df = df.groupby('chrom', group_keys=False).apply(remove_overlapping_rows)
    if not shuffle:
        df["seq"] = df.apply(apply_create_sequence,axis=1)
        df["labels"] = df.apply(apply_create_labels,axis=1)
        return train_test_split(df, test_size=(1-train_amount))
        
    train_df,test_df = train_test_split(df, test_size=(1-train_amount))
    
    expanded_rows = []
    # TODO: change so that this is a parameter
    for idx, row in train_df.iterrows():
    # 1) Keep the original row
        original_row = row.copy()
        expanded_rows.append(original_row)
    
        # 2) Create 5 new “shifted” rows
        for j in range(1, 6):
            row_copy = row.copy()
            row_copy['new_start'] = row['new_start'] + j
            row_copy['new_end']   = row['new_end']   + j
            
            # Update chromStart if needed
            if row_copy['new_start'] > row['chromStart']:
                row_copy['chromStart'] = row_copy['new_start']
            
            expanded_rows.append(row_copy)

    # Build the expanded DataFrame
    expanded_train_df = pd.DataFrame(expanded_rows)

    
    expanded_train_df["seq"] = expanded_train_df.apply(apply_create_sequence,axis=1)
    expanded_train_df["labels"] = expanded_train_df.apply(apply_create_labels,axis=1)
    
    test_df["seq"] = test_df.apply(apply_create_sequence,axis=1)
    test_df["labels"] = test_df.apply(apply_create_labels,axis=1)

    return expanded_train_df,test_df
    # return train_test_split(df, test_size=(1-train_amount))
    # print(train)
# CPG_DATA = DataTypeDescription("table_browser_hg38_unmasked_CpG",pd.read_csv(UNMASKED_CPG_PATH,sep=","),CPG_AMOUNT,cpg_extractor,HG38_ENCODING,UNMASKED_CPG_TRAIN_AMOUNT)
# train,test = CPG_DATA.get_train_test()

### MR-DNA50 class

In [2]:
MR_DNA_50_PATH = "/sci/nosnap/michall/roeizucker/MR-DNA/database/MR-DNA-50"
train_name = "train.txt"
test_name = "test.txt"
def get_mr_dna_50_dataset():
    
    pass

## Create datasets

In [10]:
if DATASET_TO_CTREATE == UNMASKED_CPG_NAME:
    dataset_manager = DataTypeDescription("table_browser_hg38_unmasked_CpG",pd.read_csv(UNMASKED_CPG_PATH,sep=","),CPG_AMOUNT,cpg_extractor,HG38_ENCODING,UNMASKED_CPG_TRAIN_AMOUNT,False)
elif DATASET_TO_CTREATE == MR_DNA_50_NAME:
    dataset_manager = DataTypeDescription("MR-DNA-50",pd.read_csv(UNMASKED_CPG_PATH,sep=","),CPG_AMOUNT,cpg_extractor,HG38_ENCODING,UNMASKED_CPG_TRAIN_AMOUNT,False)
train_df,test_df = dataset_manager.get_train_test()
train_df.to_csv(TRAIN_PATH)
test_df.to_csv(TEST_PATH)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["new_start","new_end"]] = df.apply(apply_add_pedding,axis=1, result_type="expand")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["new_start","new_end"]] = df.apply(apply_add_pedding,axis=1, result_type="expand")
  df = df.groupby('chrom', group_keys=False).apply(remove_overlapping_rows)


In [11]:
train_df

Unnamed: 0,"#""bin""",chrom,chromStart,chromEnd,name,length,cpgNum,gcNum,perCpg,perGc,obsExp,new_start,new_end,seq,labels
37700,1769,chr4,155208016,155209057,CpG: 73,1041,73,678,14.0,65.1,0.66,155207510,155213510,AGGTAGCTCTACTGCCTCCTCTTAAAACCAACAAAGGAAAGAGAGA...,0000000000000000000000000000000000000000000000...
23634,681,chr19,12666516,12666824,CpG: 31,308,31,226,20.1,73.4,0.75,12662598,12668598,CAGCCTGGGCGACAGAGTGAGTCTAAAAAAAATAAAAAAGGAATTC...,0000000000000000000000000000000000000000000000...
2921,1764,chr1,154627287,154628029,CpG: 66,742,66,499,17.8,67.3,0.79,154625829,154631829,CCCTGTCTTACTAAAAATACAAAAATTAGCCAGGCATGGTGGCATG...,0000000000000000000000000000000000000000000000...
52562,614,chrX,3835162,3835761,CpG: 65,599,65,444,21.7,74.1,0.79,3830146,3836146,AAAACAAACAAAAAACAAAAACAGACCACAGTGACACATTTTACAT...,0000000000000000000000000000000000000000000000...
19390,855,chr17,35448275,35449869,CpG: 101,1594,101,963,12.7,60.4,0.70,35444965,35450965,TTGCAAATGCAGAGATGTACTCTGGAATTATATTTTCTACATATTG...,0000000000000000000000000000000000000000000000...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52353,585,chrUn_GL000219v1,16027,16242,CpG: 17,215,17,155,15.8,72.1,0.65,13067,19067,AACATCCCATGCTCATGGATTGGAAGAATTATGATTATTAAAATGA...,0000000000000000000000000000000000000000000000...
50937,1111,chr9,69035730,69036260,CpG: 67,530,67,386,25.3,72.8,0.95,69033680,69039680,TGTGAGGAAGGGCCTCACTGTTGGTGTGGCAGAGTCTGAGACCATG...,0000000000000000000000000000000000000000000000...
2173,1347,chr1,99960717,99960991,CpG: 24,274,24,170,17.5,62.0,0.92,99959417,99965417,AGGACTTAACTTGTGCAAGCTGACTCCCAGCACATCCAAGAATGCA...,0000000000000000000000000000000000000000000000...
27279,1179,chr2,77864797,77865051,CpG: 19,254,19,150,15.0,59.1,0.87,77864180,77870180,TCTGAGCCTCTTCATCTGCCTAAAACAGATCTAGAAAAGATCATAT...,0000000000000000000000000000000000000000000000...


In [8]:
# TODO: add saving to local files if not existing, like in segment_test

In [12]:
train_df

Unnamed: 0,"#""bin""",chrom,chromStart,chromEnd,name,length,cpgNum,gcNum,perCpg,perGc,obsExp,new_start,new_end,seq,labels
37700,1769,chr4,155208016,155209057,CpG: 73,1041,73,678,14.0,65.1,0.66,155207510,155213510,AGGTAGCTCTACTGCCTCCTCTTAAAACCAACAAAGGAAAGAGAGA...,0000000000000000000000000000000000000000000000...
23634,681,chr19,12666516,12666824,CpG: 31,308,31,226,20.1,73.4,0.75,12662598,12668598,CAGCCTGGGCGACAGAGTGAGTCTAAAAAAAATAAAAAAGGAATTC...,0000000000000000000000000000000000000000000000...
2921,1764,chr1,154627287,154628029,CpG: 66,742,66,499,17.8,67.3,0.79,154625829,154631829,CCCTGTCTTACTAAAAATACAAAAATTAGCCAGGCATGGTGGCATG...,0000000000000000000000000000000000000000000000...
52562,614,chrX,3835162,3835761,CpG: 65,599,65,444,21.7,74.1,0.79,3830146,3836146,AAAACAAACAAAAAACAAAAACAGACCACAGTGACACATTTTACAT...,0000000000000000000000000000000000000000000000...
19390,855,chr17,35448275,35449869,CpG: 101,1594,101,963,12.7,60.4,0.70,35444965,35450965,TTGCAAATGCAGAGATGTACTCTGGAATTATATTTTCTACATATTG...,0000000000000000000000000000000000000000000000...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52353,585,chrUn_GL000219v1,16027,16242,CpG: 17,215,17,155,15.8,72.1,0.65,13067,19067,AACATCCCATGCTCATGGATTGGAAGAATTATGATTATTAAAATGA...,0000000000000000000000000000000000000000000000...
50937,1111,chr9,69035730,69036260,CpG: 67,530,67,386,25.3,72.8,0.95,69033680,69039680,TGTGAGGAAGGGCCTCACTGTTGGTGTGGCAGAGTCTGAGACCATG...,0000000000000000000000000000000000000000000000...
2173,1347,chr1,99960717,99960991,CpG: 24,274,24,170,17.5,62.0,0.92,99959417,99965417,AGGACTTAACTTGTGCAAGCTGACTCCCAGCACATCCAAGAATGCA...,0000000000000000000000000000000000000000000000...
27279,1179,chr2,77864797,77865051,CpG: 19,254,19,150,15.0,59.1,0.87,77864180,77870180,TCTGAGCCTCTTCATCTGCCTAAAACAGATCTAGAAAAGATCATAT...,0000000000000000000000000000000000000000000000...


In [13]:
test_df

Unnamed: 0,"#""bin""",chrom,chromStart,chromEnd,name,length,cpgNum,gcNum,perCpg,perGc,obsExp,new_start,new_end,seq,labels
35038,1785,chr3,157305131,157305406,CpG: 24,275,24,171,17.5,62.2,0.91,157302812,157308812,CTCCAGTCTCCACTCTCCCCTTTGTTCACTGGCTCCACCAATGCTG...,0000000000000000000000000000000000000000000000...
217,603,chr1,2370926,2371196,CpG: 18,270,18,161,13.3,59.6,0.76,2369330,2375330,ATGACAGAGCAAAACTCAGTCTCAGAAAAAAAAAAAAAAAAAAAAG...,0000000000000000000000000000000000000000000000...
1016,801,chr1,28369530,28370220,CpG: 61,690,61,477,17.7,69.1,0.74,28365825,28371825,GTGATCCACCTGCCTCGGCCTCCCAAAGTGCTGGGATTACAGGCGT...,0000000000000000000000000000000000000000000000...
1613,992,chr1,53438467,53439685,CpG: 119,1218,119,836,19.5,68.6,0.84,53436570,53442570,TCCTGGCCTCAAGCAATTCTCCCTCCTTGGCCTCCTAAGCTGTTGG...,0000000000000000000000000000000000000000000000...
29780,637,chr20,6926963,6927237,CpG: 21,274,21,167,15.3,60.9,0.84,6925591,6931591,TAAGAATATGATTCTGTTTATAAGGCACTTTTCAAACAAGGTAAGC...,0000000000000000000000000000000000000000000000...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37719,1784,chr4,157212550,157212831,CpG: 23,281,23,173,16.4,61.6,0.88,157211772,157217772,CATTTTTTAGTATTTAAATTTCATTGTGATAAGAACACTTAACATG...,0000000000000000000000000000000000000000000000...
50598,867,chr9,37025495,37025798,CpG: 22,303,22,184,14.5,60.7,0.79,37024025,37030025,GTCAGATGGTCAGGGAGACCCAAGAACCAGCTCCAGCTGTGCCAAT...,0000000000000000000000000000000000000000000000...
31323,839,chr21,33324227,33325445,CpG: 110,1218,110,789,18.1,64.8,0.88,33321332,33327332,ATTCGAACAATGCAATGGAATATATACATATATGTATATATGGAAC...,0000000000000000000000000000000000000000000000...
31121,707,chr21,15994704,15994941,CpG: 17,237,17,139,14.3,58.6,0.87,15991254,15997254,AATTGGTACACAGTAAATGAAGTGGGAAGTGTGAGTTAGCTTCAGA...,0000000000000000000000000000000000000000000000...
