# this will be used for all LLM based dataset creation

In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from random import randint
from sklearn.model_selection import train_test_split

## constants

In [2]:
HG38_ENCODING = "HG38"
BEND_CPG_NAME = "bend_cpg"
UNMASKED_CPG_NAME = "table_browser_hg38_unmasked_CpG"
DATASET_TO_CTREATE = UNMASKED_CPG_NAME
CPG_PATH = "/sci/archive/michall/roeizucker/LLM_datasets/bend_benchmark/cpg_methylation.bed"
CPG_AMOUNT = 1
# DATA_CLASSES = [CPG_DATA]
HG19_PATH = "/sci/archive/michall/roeizucker/reference_genome/hg38.fa"

SHUFFLE = False
TRAIN_PATH = "table_browser_hg38_unmasked_CpG_train.csv"
TEST_PATH = "table_browser_hg38_unmasked_CpG_test.csv"

In [3]:
class DataTypeDescription:
    def __init__(self,name,df,amount,extractor_function,encoding,train_amount,shuffle):
        self.name=name
        self.df=df
        # TODO: make amount work
        self.amount = amount
        self.extractor_function = extractor_function
        self.encoding = encoding
        self.train_amount = train_amount
        self.shuffle = shuffle
        pass
    def get_train_test(self):
        return self.extractor_function(self.df,self.train_amount,self.shuffle)



## HG38 creation

In [4]:
# Specify your file, chromosome, and positions
# fasta_file = "hg38.fa"

chromosome_seq = {}
# Parse the FASTA file

with open(HG19_PATH, "r") as file:
    for record in SeqIO.parse(file, "fasta"):
        # print(record.id)
        chromosome_seq[record.id] = record.seq[:]
        # if record.id == chromosome:
        #     # Extract the sequence
        #     sequence = record.seq[start-1:end]  # Biopython uses 0-based indexing
        #     print(f"Sequence from {chromosome}:{start}-{end}:")
        #     print(sequence)
        #     break


## Classes definitions

### table_browser_hg38_unmasked_CpG class

In [5]:
RESULT_SEQ_SIZE = 6000
UNMASKED_CPG_PATH = "table_browser_hg38_unmasked_CpG.csv"
UNMASKED_CPG_TRAIN_AMOUNT = 0.75
# sequence_length = 
def apply_add_pedding(row):
    to_fill =  RESULT_SEQ_SIZE - row["length"]
    to_fill_left = randint(0,to_fill)
    to_fill_right = to_fill - to_fill_left
    return [row["chromStart"] - to_fill_left,row["chromEnd"] + to_fill_right]

def apply_create_sequence(row):
    return str(chromosome_seq[row["chrom"]][row["new_start"]:row["new_end"]]).upper()
    # pass

def apply_create_labels(row):
    # return ( row["chromStart"] - row["new_start"] ,row["new_end"] - row["chromEnd"])
    return "0" * (row["chromStart"] - row["new_start"]) + "1" * row["length"] + "0" * (row["new_end"] - row["chromEnd"] )
    pass

# Function to remove overlapping rows
def remove_overlapping_rows(group):
    result = []
    
    for index, row in group.iterrows():
        if not result or row['new_start'] > result[-1]['new_end']:
            result.append(row)
    return pd.DataFrame(result)


def cpg_extractor(df,train_amount,shuffle):
    df = df[df["length"] < 4000]
    df[["new_start","new_end"]] = df.apply(apply_add_pedding,axis=1, result_type="expand")
    df = df.sort_values(by=['chrom', 'new_start']).reset_index(drop=True)
    df = df.groupby('chrom', group_keys=False).apply(remove_overlapping_rows)
    if not shuffle:
        df["seq"] = df.apply(apply_create_sequence,axis=1)
        df["labels"] = df.apply(apply_create_labels,axis=1)
        return train_test_split(df, test_size=(1-train_amount))
        
    train_df,test_df = train_test_split(df, test_size=(1-train_amount))
    
    expanded_rows = []
    # TODO: change so that this is a parameter
    for idx, row in train_df.iterrows():
    # 1) Keep the original row
        original_row = row.copy()
        expanded_rows.append(original_row)
    
        # 2) Create 5 new “shifted” rows
        for j in range(1, 6):
            row_copy = row.copy()
            row_copy['new_start'] = row['new_start'] + j
            row_copy['new_end']   = row['new_end']   + j
            
            # Update chromStart if needed
            if row_copy['new_start'] > row['chromStart']:
                row_copy['chromStart'] = row_copy['new_start']
            
            expanded_rows.append(row_copy)

    # Build the expanded DataFrame
    expanded_train_df = pd.DataFrame(expanded_rows)

    
    expanded_train_df["seq"] = expanded_train_df.apply(apply_create_sequence,axis=1)
    expanded_train_df["labels"] = expanded_train_df.apply(apply_create_labels,axis=1)
    
    test_df["seq"] = test_df.apply(apply_create_sequence,axis=1)
    test_df["labels"] = test_df.apply(apply_create_labels,axis=1)

    return expanded_train_df,test_df
    # return train_test_split(df, test_size=(1-train_amount))
    # print(train)
# CPG_DATA = DataTypeDescription("table_browser_hg38_unmasked_CpG",pd.read_csv(UNMASKED_CPG_PATH,sep=","),CPG_AMOUNT,cpg_extractor,HG38_ENCODING,UNMASKED_CPG_TRAIN_AMOUNT)
# train,test = CPG_DATA.get_train_test()

## Create datasets

In [6]:
if DATASET_TO_CTREATE == UNMASKED_CPG_NAME:
    dataset_manager = DataTypeDescription("table_browser_hg38_unmasked_CpG",pd.read_csv(UNMASKED_CPG_PATH,sep=","),CPG_AMOUNT,cpg_extractor,HG38_ENCODING,UNMASKED_CPG_TRAIN_AMOUNT,False)
train_df,test_df = dataset_manager.get_train_test()
train_df.to_csv(TRAIN_PATH)
test_df.to_csv(TEST_PATH)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["new_start","new_end"]] = df.apply(apply_add_pedding,axis=1, result_type="expand")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["new_start","new_end"]] = df.apply(apply_add_pedding,axis=1, result_type="expand")
  df = df.groupby('chrom', group_keys=False).apply(remove_overlapping_rows)


In [7]:
train_df

Unnamed: 0,"#""bin""",chrom,chromStart,chromEnd,name,length,cpgNum,gcNum,perCpg,perGc,obsExp,new_start,new_end,seq,labels
10804,1495,chr12,119292321,119292595,CpG: 20,274,20,167,14.6,60.9,0.80,119291160,119297160,GAGCAAAGCTTCATCTGTATTTACAGCTGCTCTGCATCCCTCTCAT...,0000000000000000000000000000000000000000000000...
24515,884,chr19,39196957,39197324,CpG: 46,367,46,277,25.1,75.5,0.88,39191757,39197757,GCCCCTGGACTTCAGCCTGGGAGACAGAGCCAGACCCTGTCTCAAA...,0000000000000000000000000000000000000000000000...
31654,932,chr21,45498274,45498496,CpG: 26,222,26,171,23.4,77.0,0.92,45497624,45503624,GGCGCCAAAGGAGAAGTGGGCCCCCCCGGACCACCAGGTGAGCAAC...,0000000000000000000000000000000000000000000000...
24217,815,chr19,30222501,30222958,CpG: 51,457,51,352,22.3,77.0,0.75,30217517,30223517,CCCAAGTCTCCATCTGCTAGCTGATCATCCCAAGTACAGGTTGCAA...,0000000000000000000000000000000000000000000000...
9892,999,chr12,54391116,54391454,CpG: 34,338,34,256,20.1,75.7,0.73,54387638,54393638,CATCTGTCAAATGAAAACAACACTACTTGCCTCACCAAGTTGTTGA...,0000000000000000000000000000000000000000000000...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14364,586,chr14_GL000009v2_random,199954,200249,CpG: 29,295,29,219,19.7,74.2,0.71,197091,203091,GGAAAAGTATAAAACAAACAAGTGGATAATCTACCTTTATTAATAT...,0000000000000000000000000000000000000000000000...
9813,991,chr12,53251935,53252287,CpG: 39,352,39,265,22.2,75.3,0.80,53247503,53253503,GCACTTTGGGAGGCCGAGGCAGGGGGATCACAAGGTCGGGAGATCG...,0000000000000000000000000000000000000000000000...
19637,909,chr17,42532015,42532337,CpG: 50,322,50,236,31.1,73.3,1.16,42528408,42534408,CTTGCTTTGTTGCTGCCGAGACCAGCTGGGTCAGAGAGACCCTAAC...,0000000000000000000000000000000000000000000000...
24242,834,chr19,32719547,32720160,CpG: 58,613,58,448,18.9,73.1,0.71,32719519,32725519,GTTACAATTTTGCAAAGGTGGTTTCCAACGGACTCAGCCGCACGCG...,0000000000000000000000000000111111111111111111...


In [8]:
# TODO: add saving to local files if not existing, like in segment_test

In [9]:
train_df

Unnamed: 0,"#""bin""",chrom,chromStart,chromEnd,name,length,cpgNum,gcNum,perCpg,perGc,obsExp,new_start,new_end,seq,labels
10804,1495,chr12,119292321,119292595,CpG: 20,274,20,167,14.6,60.9,0.80,119291160,119297160,GAGCAAAGCTTCATCTGTATTTACAGCTGCTCTGCATCCCTCTCAT...,0000000000000000000000000000000000000000000000...
24515,884,chr19,39196957,39197324,CpG: 46,367,46,277,25.1,75.5,0.88,39191757,39197757,GCCCCTGGACTTCAGCCTGGGAGACAGAGCCAGACCCTGTCTCAAA...,0000000000000000000000000000000000000000000000...
31654,932,chr21,45498274,45498496,CpG: 26,222,26,171,23.4,77.0,0.92,45497624,45503624,GGCGCCAAAGGAGAAGTGGGCCCCCCCGGACCACCAGGTGAGCAAC...,0000000000000000000000000000000000000000000000...
24217,815,chr19,30222501,30222958,CpG: 51,457,51,352,22.3,77.0,0.75,30217517,30223517,CCCAAGTCTCCATCTGCTAGCTGATCATCCCAAGTACAGGTTGCAA...,0000000000000000000000000000000000000000000000...
9892,999,chr12,54391116,54391454,CpG: 34,338,34,256,20.1,75.7,0.73,54387638,54393638,CATCTGTCAAATGAAAACAACACTACTTGCCTCACCAAGTTGTTGA...,0000000000000000000000000000000000000000000000...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14364,586,chr14_GL000009v2_random,199954,200249,CpG: 29,295,29,219,19.7,74.2,0.71,197091,203091,GGAAAAGTATAAAACAAACAAGTGGATAATCTACCTTTATTAATAT...,0000000000000000000000000000000000000000000000...
9813,991,chr12,53251935,53252287,CpG: 39,352,39,265,22.2,75.3,0.80,53247503,53253503,GCACTTTGGGAGGCCGAGGCAGGGGGATCACAAGGTCGGGAGATCG...,0000000000000000000000000000000000000000000000...
19637,909,chr17,42532015,42532337,CpG: 50,322,50,236,31.1,73.3,1.16,42528408,42534408,CTTGCTTTGTTGCTGCCGAGACCAGCTGGGTCAGAGAGACCCTAAC...,0000000000000000000000000000000000000000000000...
24242,834,chr19,32719547,32720160,CpG: 58,613,58,448,18.9,73.1,0.71,32719519,32725519,GTTACAATTTTGCAAAGGTGGTTTCCAACGGACTCAGCCGCACGCG...,0000000000000000000000000000111111111111111111...


In [10]:
test_df

Unnamed: 0,"#""bin""",chrom,chromStart,chromEnd,name,length,cpgNum,gcNum,perCpg,perGc,obsExp,new_start,new_end,seq,labels
44258,596,chr6_GL000253v2_alt,1498598,1498871,CpG: 22,273,22,170,16.1,62.3,0.84,1496391,1502391,AATGGGGAGGTGGCTACCAGTGGATATGGGGTCCCCTGCTCCAGGT...,0000000000000000000000000000000000000000000000...
42130,923,chr6,44312982,44313486,CpG: 46,504,46,329,18.3,65.3,0.86,44312506,44318506,AAGGTAAAAGAAGGAGGAATGTGTTGGGAGCTGGAAATGGGACGCT...,0000000000000000000000000000000000000000000000...
2541,1679,chr1,143497088,143498908,CpG: 218,1820,218,1276,24.0,70.1,1.04,143493553,143499553,TCTTTTCTTTCAATTTTCTCAATTACTAAGAGATGTTTAAGTACCC...,0000000000000000000000000000000000000000000000...
21990,1036,chr18,59113802,59114083,CpG: 24,281,24,174,17.1,61.9,0.90,59112561,59118561,CTTCCTGAAGCCCATCCTCCGCATGTGACTCCTGCTCAGAATGTTC...,0000000000000000000000000000000000000000000000...
15823,1335,chr15,98328026,98328304,CpG: 22,278,22,175,15.8,62.9,0.81,98324167,98330167,GAGTCAAGACTTCAGCAGTGGCCTAGCTGGGACCACAATCAAGTGT...,0000000000000000000000000000000000000000000000...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53070,956,chrX,48732840,48733062,CpG: 16,222,16,133,14.4,59.9,0.81,48729800,48735800,CAGAAATATCAAAGTCGGGACACAGGTTCTCAAACACATTAAAGAC...,0000000000000000000000000000000000000000000000...
6055,1476,chr10,116849088,116849866,CpG: 65,778,65,499,16.7,64.1,0.81,116845478,116851478,TGGAATGTTAACAATCATGACTAGATCCTCTCTCTAATATCATTTC...,0000000000000000000000000000000000000000000000...
22551,591,chr19,857246,857463,CpG: 19,217,19,134,17.5,61.8,0.93,854519,860519,GGCGCGATCGCAGCTCACTACAGCCTCCATCTCCTGGGCTCAAGCC...,0000000000000000000000000000000000000000000000...
41833,838,chr6,33199994,33200938,CpG: 81,944,81,620,17.2,65.7,0.80,33199078,33205078,GCCATGGGGAAGTTCACACAAGGATCTGGGGTTACAAGGAAAACAA...,0000000000000000000000000000000000000000000000...
