### Label CTCF motif positions (19 bp) as 1, all other positions as 0

In [1]:
import pandas as pd

#### Make positive samples

In [2]:
# read ctcf_fa file
def read_fasta(filepath):
    """Parse a FASTA file into a dict of ``header -> uppercase sequence``.

    The header is everything after the leading ``>`` on a header line.
    Sequence lines are stripped of surrounding whitespace and concatenated.
    When the same header occurs more than once, the last record wins.
    Text before the first header, and records whose header is empty, are
    not stored.
    """
    records = {}
    header = None
    parts = []
    with open(filepath, 'r') as handle:
        for raw in handle:
            text = raw.strip()
            if not text.startswith('>'):
                # sequence (or blank) line: collect it for the current record
                parts.append(text)
                continue
            # a new header closes the record collected so far
            if header:
                records[header] = ''.join(parts).upper()
            header = text[1:]
            parts = []
    # the last record has no following header to close it
    if header:
        records[header] = ''.join(parts).upper()
    return records
    
# Load the CTCF FASTA file into a dict mapping each header to its uppercase sequence.
fasta = read_fasta("../data/ctcf_100bp.fa")

In [3]:
# Peek at the first five FASTA records: id followed by its sequence.
for k, preview_seq in list(fasta.items())[:5]:
    print(k, preview_seq)

chr3:39812865-39812965 TTTCCATATCTAGCAAGTAGATCCCTTGTATGGTCCCTGCTACTAAATCCATGGGTAGAACTCTATCCCAGCTCTTTATGAAGATGGCCGTGGGATGGGG
chr2:96508359-96508459 GGTGGTAGCCTCACTGGGACCCCTCTGTTGCAGTCGGCCTCTCTTGCCCAGAGCCCCATCACTACACATGTCTTCACGCAGGTTATCTCTCAAGACTCAA
chr20:577760-577860 CCTGGCCAACATGGTGAAACCCTGTCTCTACTAAAAATACAAAAACTAGCTGCGGCATGGTAGGACATGTGCCTGTAATCCCAGCTACTCAGGAGGCTGA
chr14:23094733-23094833 CTCCTGGAGAGTAGTGCACACCCTCCCATTTGGGCTCGCGCTGCTGCCGTTAAGGCGGGAACCGGCGCGATCTCCATTCCAGCGCGAACCTCATCAACCA
chr3:24684122-24684222 AAACAAAGGTTTATGATTCTTTTTATATACCAGAGTATTTCATTTCTGTTAAATGTTTGTTTCTAAAATTTGTAGTATGTGACATTGTTAAAAACATGAC


In [4]:
# Read the filtered FIMO hits; the file is parsed without a header row,
# so the column names are assigned explicitly afterwards.
fimo_column_names = [
    "motif_id",
    "tf_name",
    "seq_id",
    "start",
    "end",
    "strand",
    "score",
    "p_value",
    "q_value",
    "matched_seq",
]
fimo = pd.read_csv("../data/ctcf_fimo_filtered.tsv", sep="\t", header=None)
fimo.columns = fimo_column_names

In [5]:
fimo.head()

Unnamed: 0,motif_id,tf_name,seq_id,start,end,strand,score,p_value,q_value,matched_seq
0,MA0139.1,CTCF,seq37487,52,70,+,27.1967,4.53e-12,8e-06,TGGCCACCAGGGGGCGCCG
1,MA0139.1,CTCF,seq19160,40,58,+,27.1311,6.58e-12,8e-06,CGGCCACCAGGGGGCGCCA
2,MA0139.1,CTCF,seq34306,60,78,+,26.8689,1.04e-11,1e-05,TGGCCACCAGGGGGCGGCA
3,MA0139.1,CTCF,seq35270,79,97,+,26.8197,1.49e-11,1e-05,CGGCCACCAGGGGGCGCCG
4,MA0139.1,CTCF,seq33311,58,76,+,26.6557,2.35e-11,1e-05,TGGCCACCAGGGGGCGCTC


In [6]:
# Mapping from the short FIMO sequence name ("seq") to the genomic-coordinate id
# ("original_id") used as the FASTA header.
id_map = pd.read_csv("../data/id_mapping.tsv", sep='\t', names=["seq", "original_id"])

# Attach "original_id" to every FIMO hit (inner join: hits without a mapping are dropped).
# Any "seq" / "original_id" columns left over from a previous run of this cell are removed
# first; otherwise a re-run would create suffixed duplicates (seq_x, original_id_y, ...)
# and later lookups of fimo["original_id"] would fail.
fimo = (
    fimo
    .drop(columns=["seq", "original_id"], errors="ignore")
    .merge(id_map, left_on="seq_id", right_on="seq")
)

In [9]:
id_map

Unnamed: 0,seq,original_id
0,seq00001,chr3:39812865-39812965
1,seq00002,chr2:96508359-96508459
2,seq00003,chr20:577760-577860
3,seq00004,chr14:23094733-23094833
4,seq00005,chr3:24684122-24684222
...,...,...
40944,seq40945,chrX:10119622-10119722
40945,seq40946,chr1:114346638-114346738
40946,seq40947,chr14:106309025-106309125
40947,seq40948,chr14:49862176-49862276


In [7]:
fimo.head()

Unnamed: 0,motif_id,tf_name,seq_id,start,end,strand,score,p_value,q_value,matched_seq,seq,original_id
0,MA0139.1,CTCF,seq37487,52,70,+,27.1967,4.53e-12,8e-06,TGGCCACCAGGGGGCGCCG,seq37487,chr2:231612669-231612769
1,MA0139.1,CTCF,seq19160,40,58,+,27.1311,6.58e-12,8e-06,CGGCCACCAGGGGGCGCCA,seq19160,chr4:5821745-5821845
2,MA0139.1,CTCF,seq34306,60,78,+,26.8689,1.04e-11,1e-05,TGGCCACCAGGGGGCGGCA,seq34306,chr17:75905086-75905186
3,MA0139.1,CTCF,seq35270,79,97,+,26.8197,1.49e-11,1e-05,CGGCCACCAGGGGGCGCCG,seq35270,chr7:98021615-98021715
4,MA0139.1,CTCF,seq33311,58,76,+,26.6557,2.35e-11,1e-05,TGGCCACCAGGGGGCGCTC,seq33311,chr7:70857046-70857146


In [8]:
# Label every position of each 100bp sequence: 1 inside a FIMO motif hit, 0 elsewhere.

# Index the motif hits by sequence id once, instead of scanning the whole FIMO
# table (membership test + boolean filter) for every FASTA record.
motif_spans = {}
for original_id, hit_start, hit_end in zip(fimo["original_id"], fimo["start"], fimo["end"]):
    # FIMO coordinates are 1-based and inclusive -> store a 0-based half-open span
    motif_spans.setdefault(original_id, []).append((int(hit_start) - 1, int(hit_end)))

records = []
for seq_id, seq in fasta.items():
    label = [0] * len(seq)
    # sequences without any hit keep an all-zero label vector
    for start, end in motif_spans.get(seq_id, ()):
        # clamp the span to the sequence bounds before marking it
        for i in range(max(0, start), min(len(seq), end)):
            label[i] = 1
    records.append({
        "id": seq_id,
        "sequence": seq,
        "labels": label
    })

df = pd.DataFrame(records)

In [9]:
# Validation: show the sequence stored for one known motif-containing region.
is_target = df["id"] == "chr2:231612669-231612769"
print(df.loc[is_target, "sequence"].values[0])

ACCAGGTTAGCCTGGACTTGTCCCCAGGGCCCGTCCAGGCTCTCCTGCAACTGGCCACCAGGGGGCGCCGCTCCCCCAGCCTTCCTCTTACTGAACCTAC


In [10]:
# Locate the FIMO-reported motif inside the sequence by plain substring search,
# to cross-check the coordinates used for labelling.
motif = "TGGCCACCAGGGGGCGCCG"
sequence = "ACCAGGTTAGCCTGGACTTGTCCCCAGGGCCCGTCCAGGCTCTCCTGCAACTGGCCACCAGGGGGCGCCGCTCCCCCAGCCTTCCTCTTACTGAACCTAC"

start_idx = sequence.find(motif)
if start_idx == -1:
    print("Motif not found")
else:
    # inclusive 0-based index of the last motif base
    end_idx = start_idx + len(motif) - 1
    print(f"Motif found from position {start_idx} to {end_idx} (0-based)")

Motif found from position 51 to 69 (0-based)


In [11]:
# List the 0-based positions labelled 1 for the same region, for comparison
# with the substring-search result.
row_mask = df["id"] == "chr2:231612669-231612769"
labels = df.loc[row_mask, "labels"].values[0]
positions = [idx for idx, flag in enumerate(labels) if flag == 1]
print(positions)

[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69]


In [12]:
!pwd

/gpfs/wolf2/olcf/trn040/scratch/tianchun/ctcf_project/code


In [10]:
df

Unnamed: 0,id,sequence,labels
0,chr3:39812865-39812965,TTTCCATATCTAGCAAGTAGATCCCTTGTATGGTCCCTGCTACTAA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,chr2:96508359-96508459,GGTGGTAGCCTCACTGGGACCCCTCTGTTGCAGTCGGCCTCTCTTG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,chr20:577760-577860,CCTGGCCAACATGGTGAAACCCTGTCTCTACTAAAAATACAAAAAC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,chr14:23094733-23094833,CTCCTGGAGAGTAGTGCACACCCTCCCATTTGGGCTCGCGCTGCTG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,chr3:24684122-24684222,AAACAAAGGTTTATGATTCTTTTTATATACCAGAGTATTTCATTTC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
40941,chr15:56246146-56246246,TTTAATGGGTTCCATAAAAGAGCCAGGTCACCCCAGCAGCCACTAG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
40942,chrX:10119622-10119722,TATGACATTGGTGATGAATTCTTGTATGGCACTGCCATCTGGTGGC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
40943,chr1:114346638-114346738,TGCAAGCGCCCCTAGCGGCCTCGCAGCCATCGCGCCTCAGCAGCGC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
40944,chr14:49862176-49862276,TGTGGGTAAACGAGCAGCCCCTGTTGGCTCCGAAGGTAATAACAGC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [14]:
# Persist the labelled frame as CSV without the index column.
# NOTE(review): "labels" holds Python lists, so presumably it is written in its string
# form and must be parsed (e.g. ast.literal_eval) when the CSV is read back — confirm.
df.to_csv("../data/ctcf_training_data.csv", index=False)

In [12]:
# The string branch below needs `ast`; elsewhere the notebook only imports it later,
# inside a conditional, so import it here to avoid a NameError.
import ast

# Normalise "labels" to real Python lists (string values, e.g. after a CSV round-trip,
# are parsed; lists are passed through unchanged).
df["label_list"] = df["labels"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
# Keep only the sequences that contain at least one motif position.
df_has_1 = df[df["label_list"].apply(lambda x: 1 in x)]

In [13]:
df_has_1

Unnamed: 0,id,sequence,labels,label_list
37,chr11:11667383-11667483,TCCCATCTCCTTCACTTCTCCACCTTTGACACCAGAGGATGTCACA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
70,chr19:42114120-42114220,GCCTGGGCTATACGGCAGGTGGTGGAGGCTCAGCTTGCTGCTGCTG...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ..."
159,chr2:191014255-191014355,AGACGCGGCTGTTCCGTGGGCGCCACCGCCTCCCTCTGCGGGCCGC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
164,chr19:11441213-11441313,GGTGCCTGTGTGTCTCCGCACCGCAGAGAGAAGGGCCGTAAGGAGA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
192,chr18:63844583-63844683,AATGCCCATTGTGTCCATCCACTAGATGAAAACAACACCACTCCCT...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
40940,chr6:53171982-53172082,GCCTGAGCCTGCTGCACTTGCAGTAGCAGGGACTGGCCGGGAGATG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
40941,chr15:56246146-56246246,TTTAATGGGTTCCATAAAAGAGCCAGGTCACCCCAGCAGCCACTAG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
40942,chrX:10119622-10119722,TATGACATTGGTGATGAATTCTTGTATGGCACTGCCATCTGGTGGC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
40943,chr1:114346638-114346738,TGCAAGCGCCCCTAGCGGCCTCGCAGCCATCGCGCCTCAGCAGCGC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [14]:
# distinguish strong and weak sample
def classify_sample(label):
    """Return "strong" when the label vector sums to at least 1, otherwise "weak"."""
    return "strong" if sum(label) >= 1 else "weak"

# Tag every sequence with its class, derived element-wise from the label vector.
df["class"] = df["labels"].map(classify_sample)

In [15]:
df

Unnamed: 0,id,sequence,labels,class
0,chr3:39812865-39812965,TTTCCATATCTAGCAAGTAGATCCCTTGTATGGTCCCTGCTACTAA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",weak
1,chr2:96508359-96508459,GGTGGTAGCCTCACTGGGACCCCTCTGTTGCAGTCGGCCTCTCTTG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",weak
2,chr20:577760-577860,CCTGGCCAACATGGTGAAACCCTGTCTCTACTAAAAATACAAAAAC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",weak
3,chr14:23094733-23094833,CTCCTGGAGAGTAGTGCACACCCTCCCATTTGGGCTCGCGCTGCTG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",weak
4,chr3:24684122-24684222,AAACAAAGGTTTATGATTCTTTTTATATACCAGAGTATTTCATTTC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",weak
...,...,...,...,...
40941,chr15:56246146-56246246,TTTAATGGGTTCCATAAAAGAGCCAGGTCACCCCAGCAGCCACTAG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",strong
40942,chrX:10119622-10119722,TATGACATTGGTGATGAATTCTTGTATGGCACTGCCATCTGGTGGC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",strong
40943,chr1:114346638-114346738,TGCAAGCGCCCCTAGCGGCCTCGCAGCCATCGCGCCTCAGCAGCGC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",strong
40944,chr14:49862176-49862276,TGTGGGTAAACGAGCAGCCCCTGTTGGCTCCGAAGGTAATAACAGC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",strong


#### Make negative samples

In [14]:
neg_fasta_file = "../data/negative.fa"
neg_records = []


def _negative_record(name, bases):
    """Build one all-zero-labelled record for a negative sequence."""
    return {
        "id": name,
        "sequence": bases.upper(),
        "labels": [0] * len(bases),
        "class": "negative"
    }


# Stream the FASTA file; every header closes the record collected so far.
# Records are appended to a list, so repeated ids are all kept.
current_id = None
chunks = []
with open(neg_fasta_file) as handle:
    for raw_line in handle:
        text = raw_line.strip()
        if not text.startswith(">"):
            chunks.append(text)
            continue
        if current_id:
            neg_records.append(_negative_record(current_id, "".join(chunks)))
        current_id = text[1:]
        chunks = []

# the final record has no following header to close it
if current_id:
    neg_records.append(_negative_record(current_id, "".join(chunks)))

neg_df = pd.DataFrame(neg_records)

In [15]:
neg_df

Unnamed: 0,id,sequence,labels,class
0,chr9:31921407-31921507,TAAATGCCACTTTCTAATTAGTATTTACTCCAATAATTCTTCATTG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",negative
1,chr2:170681872-170681972,TTAGTAAAGTGTAGCTGGTTGAAGAAAGCATAATGAATAATCCTAC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",negative
2,chr3:126450519-126450619,TGGCACAGGAGCCACATGCATGTGACTAGAAATCTCTTCCAGAACG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",negative
3,chr6:95650470-95650570,ATTCAGTCTGTGAGGTGGGAACATGTTGGCAGGACAGCCTCTTTTT...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",negative
4,chr4:38701057-38701157,ACTTCATATGCAGTGATCACTAGAGCAAAACTTACAGAGCAATGTA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",negative
...,...,...,...,...
40944,chr4:89877060-89877160,CCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAAGAGAATTGCTTG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",negative
40945,chr10:72344975-72345075,AGTGTAGCCTTTGGTGCTCTCTCCTCCAGCTTCACCGTTGGCCGAG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",negative
40946,chr13:38451312-38451412,TTTTTAATTTTAGTATTTCCAATAGGTGAGTTAGTAGTATCTCACT...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",negative
40947,chr17:71116529-71116629,TCAGTATTGTGCTTGTTTAATATCGAGATTTCAGTCCTACTGAATC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",negative


In [16]:
# Draw a reproducible (random_state=42) random subset of 17345 negative sequences
# and renumber the rows from 0.
# NOTE(review): 17345 presumably matches the number of motif-containing positives so
# that the merged set is balanced — confirm against len(df_has_1).
neg_sampled = neg_df.sample(n=17345, random_state=42).reset_index(drop=True)

#### Combine positive and negative samples

In [17]:
# Stack the labelled CTCF sequences on top of the negative sequences, renumbering the index.
# NOTE(review): this uses the full neg_df, not the down-sampled neg_sampled — confirm
# that including every negative is intended here.
full_df = pd.concat([df, neg_df], ignore_index=True)

In [18]:
full_df

Unnamed: 0,id,sequence,labels,class
0,chr3:39812865-39812965,TTTCCATATCTAGCAAGTAGATCCCTTGTATGGTCCCTGCTACTAA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",weak
1,chr2:96508359-96508459,GGTGGTAGCCTCACTGGGACCCCTCTGTTGCAGTCGGCCTCTCTTG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",weak
2,chr20:577760-577860,CCTGGCCAACATGGTGAAACCCTGTCTCTACTAAAAATACAAAAAC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",weak
3,chr14:23094733-23094833,CTCCTGGAGAGTAGTGCACACCCTCCCATTTGGGCTCGCGCTGCTG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",weak
4,chr3:24684122-24684222,AAACAAAGGTTTATGATTCTTTTTATATACCAGAGTATTTCATTTC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",weak
...,...,...,...,...
81890,chr4:89877060-89877160,CCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAAGAGAATTGCTTG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",negative
81891,chr10:72344975-72345075,AGTGTAGCCTTTGGTGCTCTCTCCTCCAGCTTCACCGTTGGCCGAG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",negative
81892,chr13:38451312-38451412,TTTTTAATTTTAGTATTTCCAATAGGTGAGTTAGTAGTATCTCACT...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",negative
81893,chr17:71116529-71116629,TCAGTATTGTGCTTGTTTAATATCGAGATTTCAGTCCTACTGAATC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",negative


#### Convert to jsonl

In [19]:
import ast

# Make sure the labels are Python lists: parse them if they are still strings.
# The check inspects full_df itself, i.e. the frame that is actually converted
# (the previous version inspected df instead).
if isinstance(full_df.loc[0, "labels"], str):
    full_df["labels"] = full_df["labels"].apply(ast.literal_eval)

In [20]:
# confirm that the labels are Python lists
type(full_df.loc[0, "labels"])

list

In [21]:
# Save the combined dataset as JSON Lines: one object per sequence.
import json

with open("../data/ctcf_dataset.jsonl", "w") as f:
    columns = zip(full_df["id"], full_df["sequence"], full_df["labels"], full_df["class"])
    for sample_id, bases, tags, sample_class in columns:
        record = {
            "id": sample_id,
            "sequence": bases,
            "labels": tags,
            "class": sample_class
        }
        f.write(json.dumps(record))
        f.write("\n")

In [23]:
import json
from sklearn.model_selection import train_test_split


def _write_json_lines(path, items):
    """Write every item of `items` to `path` as one JSON document per line."""
    with open(path, "w") as handle:
        for entry in items:
            handle.write(json.dumps(entry) + "\n")


# Reload the full dataset from its JSON Lines file.
with open("../data/ctcf_dataset.jsonl") as f:
    lines = [json.loads(raw) for raw in f]

# The per-sequence class is used as the stratification key for the split.
labels = [record["class"] for record in lines]

# 80% train / 20% eval, reproducible via random_state.
train_lines, eval_lines = train_test_split(lines, test_size=0.2, random_state=42, stratify=labels)

# NOTE(review): another cell of this notebook writes to these same two paths from a
# different dataset; whichever cell runs last determines the file contents.
_write_json_lines("../data/train_tmp.jsonl", train_lines)
_write_json_lines("../data/eval_tmp.jsonl", eval_lines)

In [17]:
# Drop the helper "label_list" column from the motif-containing positives
# and the "class" column from the sampled negatives (each is ignored if absent).
# NOTE(review): a "class" column on df_has_1, if present, is NOT dropped here — confirm.
df_has_1_clean = df_has_1.drop(columns=["label_list"], errors="ignore")
neg_sampled_clean = neg_sampled.drop(columns=["class"], errors="ignore")

# Stack positives first, then negatives, renumbering the index from 0.
merged_df = pd.concat([df_has_1_clean, neg_sampled_clean], ignore_index=True)

In [18]:
merged_df

Unnamed: 0,id,sequence,labels
0,chr11:11667383-11667483,TCCCATCTCCTTCACTTCTCCACCTTTGACACCAGAGGATGTCACA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,chr19:42114120-42114220,GCCTGGGCTATACGGCAGGTGGTGGAGGCTCAGCTTGCTGCTGCTG...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ..."
2,chr2:191014255-191014355,AGACGCGGCTGTTCCGTGGGCGCCACCGCCTCCCTCTGCGGGCCGC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,chr19:11441213-11441313,GGTGCCTGTGTGTCTCCGCACCGCAGAGAGAAGGGCCGTAAGGAGA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,chr18:63844583-63844683,AATGCCCATTGTGTCCATCCACTAGATGAAAACAACACCACTCCCT...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
34685,chr2:37294281-37294381,ACAGGGGTCTCGCTGTGTGGCCCAGGCTGGTGTTGAACTTCTAGAC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
34686,chr2:155676403-155676503,TGAAAATCCTGCCTCTTAATGATATAATTGTTCTTTTGCTATATTT...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
34687,chr5:135517696-135517796,TTTCTCCACAAACACACCAAGAAAAGGCCATGTGAGGACATGGTGA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
34688,chr15:83459412-83459512,GAGTCATGGAGGGTTTATGAAAGATGGGTCTTGGAAAGGGAGTTTT...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [19]:
!pwd

/gpfs/wolf2/olcf/trn040/scratch/tianchun/ctcf_project/code


In [21]:
import json
import pandas as pd

# Randomly reorder all rows of the merged frame (reproducible) and renumber them.
shuffled_df = merged_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Write the shuffled samples as JSON Lines: id, sequence and labels per line.
with open("../data/merged_ctcf_dataset.jsonl", "w") as f:
    f.writelines(
        json.dumps({
            "id": sample_id,
            "sequence": bases,
            "labels": tags
        }) + "\n"
        for sample_id, bases, tags in zip(
            shuffled_df["id"], shuffled_df["sequence"], shuffled_df["labels"]
        )
    )


In [22]:
from sklearn.model_selection import train_test_split


def _save_frame_as_jsonl(frame, path):
    """Write id / sequence / labels of every row of `frame` to `path`, one JSON object per line."""
    with open(path, "w") as handle:
        for sample_id, bases, tags in zip(frame["id"], frame["sequence"], frame["labels"]):
            handle.write(json.dumps({
                "id": sample_id,
                "sequence": bases,
                "labels": tags
            }) + "\n")


# 80% train / 20% eval, reproducible via random_state (no stratification).
train_df, eval_df = train_test_split(shuffled_df, test_size=0.2, random_state=42)

# Training set first, then the evaluation set.
_save_frame_as_jsonl(train_df, "../data/train_tmp.jsonl")
_save_frame_as_jsonl(eval_df, "../data/eval_tmp.jsonl")
