In [4]:
import pandas as pd
from Bio import SeqIO
import os

In [5]:
def read_chromosome_sequence(fna_file, chromosome_id):
    """Return the sequence of the first FASTA record matching ``chromosome_id``.

    Args:
        fna_file: Path (or handle) passed to ``SeqIO.parse`` with the "fasta" format.
        chromosome_id: Text that must occur as a substring of the record description.

    Returns:
        The matching record's sequence as a ``str``, or ``None`` when no record
        description contains ``chromosome_id``.
    """
    # Lazy generator: records are parsed only until the first match is found.
    candidates = (
        entry
        for entry in SeqIO.parse(fna_file, "fasta")
        if chromosome_id in entry.description
    )
    hit = next(candidates, None)
    if hit is None:
        return None

    print(f"Found chromosome: {hit.description}")
    return str(hit.seq)

def make_windows(w_plus_k, sequence):
    """Count each distinct window of length ``w_plus_k`` in ``sequence``.

    The counts are written to ``windows_{w_plus_k}.csv`` in the current working
    directory (columns ``window`` and ``count``), and the number of distinct
    windows and their mean count are printed.

    Args:
        w_plus_k: Window length (w + k).
        sequence: String to slide the window over, one position at a time.
    """
    tally = {}
    last_start = len(sequence) - w_plus_k
    for start in range(last_start + 1):
        piece = sequence[start:start + w_plus_k]
        tally[piece] = tally.get(piece, 0) + 1

    # Persist the counts; rows follow first-occurrence order of each window.
    counts_df = pd.DataFrame(tally.items(), columns=['window', 'count'])
    counts_df.to_csv(f'windows_{w_plus_k}.csv', index=False)

    # Summary statistics for a quick sanity check.
    print(f"Number of windows of size {w_plus_k}: {len(tally)}")
    print(f"Average count of windows of size {w_plus_k}: {counts_df['count'].mean()}")



def make_naive_windows(w_plus_k, sequence, name):
    """Write every window of length ``w_plus_k`` (duplicates kept) to a CSV.

    The output goes to ``sequences_{name}/naive_windows_{w_plus_k}.csv`` with a
    single ``window`` column, one row per start position. The folder is not
    created here and must already exist.

    Args:
        w_plus_k: Window length (w + k).
        sequence: String to slide the window over, one position at a time.
        name: Dataset label used to build the ``sequences_{name}`` folder name.
    """
    n_windows = len(sequence) - w_plus_k + 1
    pieces = [sequence[pos:pos + w_plus_k] for pos in range(n_windows)]

    out_dir = f"sequences_{name}"
    frame = pd.DataFrame(pieces, columns=['window'])
    frame.to_csv(f'{out_dir}/naive_windows_{w_plus_k}.csv', index=False)



def dna_to_binary_string(dna):
    """Encode a DNA string as a string of '0'/'1' characters, two per nucleotide.

    The mapping is A -> 00, C -> 01, G -> 10, T -> 11. Any other character
    (including lowercase letters) raises ``KeyError``.

    Args:
        dna: Iterable of nucleotide characters.

    Returns:
        The concatenated two-character codes, twice as long as the input.
    """
    two_bit_code = {'A': '00', 'C': '01', 'G': '10', 'T': '11'}
    encoded = []
    for base in dna:
        encoded.append(two_bit_code[base])
    return ''.join(encoded)


def delete_naive_window(size, name):
    """Delete the file ``sequences_{name}/naive_windows_{size}.csv``.

    Args:
        size: Window length that appears in the file name.
        name: Dataset label that appears in the folder name.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    os.unlink(f'sequences_{name}/naive_windows_{size}.csv')




def naive_window_to_odd_even(size, name):
    """Convert the naive windows CSV into a pair of integers per window.

    Reads the ``window`` column of ``sequences_{name}/naive_windows_{size}.csv``,
    encodes each window with ``dna_to_binary_string`` and splits the resulting
    bit string by position:

    * ``OddNumber``  - characters at indices 0, 2, 4, ... (the first bit of each
      nucleotide's two-bit code), parsed as a base-2 integer.
    * ``EvenNumber`` - characters at indices 1, 3, 5, ... (the second bit of each
      code), parsed as a base-2 integer.

    The result is written to ``sequences_{name}/contexts_{size}_odd_even.csv``.

    Args:
        size: Window length that appears in the input and output file names.
        name: Dataset label that appears in the folder name.
    """
    subfolder = f"sequences_{name}"
    all_windows = pd.read_csv(f'{subfolder}/naive_windows_{size}.csv')['window'].to_list()

    # Encode each window once, then slice the bit string with stride 2.
    bit_strings = (dna_to_binary_string(win) for win in all_windows)
    records = [
        {'OddNumber': int(bits[0::2], 2), 'EvenNumber': int(bits[1::2], 2)}
        for bits in bit_strings
    ]

    pd.DataFrame(records).to_csv(f'{subfolder}/contexts_{size}_odd_even.csv', index=False)



def save_as_fasta(sequence, filename, header=''):
    """Write ``sequence`` to ``filename`` as a single-record FASTA file.

    The definition line starts with '>' as the FASTA format requires; the
    previous '<' marker produced a file that FASTA parsers do not recognise.

    Args:
        sequence: Sequence text, written on one line after the definition line.
        filename: Destination path; an existing file is overwritten.
        header: Optional text placed after '>' on the definition line.
            Defaults to an empty string, so existing callers get a bare '>' line.
    """
    with open(filename, 'w') as f:
        f.write(f'>{header}\n{sequence}')

def save_as_seq(sequence, filename):
    """Write ``sequence`` followed by one newline to ``filename``.

    Args:
        sequence: Sequence text to store.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w') as out:
        out.write('{}\n'.format(sequence))



In [6]:
# Path to the input .fna (FASTA) file; change this to where your copy lives.
FILE_PATH = 'GCA_009914755.4_T2T-CHM13v2.0_genomic.fna'

# Text looked up in the FASTA record descriptions; here it selects chromosome X.
chromosome_id = "chromosome X"

# Number of leading characters of the sequence to keep for each dataset.
# Use a smaller value for a quicker run (useful mainly for testing).
sizes = [1_000_000]

# Label for the matching entry of `sizes`; used in output folder and file names.
names = ['1M']


In [7]:
# Load the chromosome sequence. The data may come from any other source or format instead;
# the only requirement is that after this cell `sequence` holds the sequence as an
# uppercase ACGT string.
sequence = read_chromosome_sequence(FILE_PATH, chromosome_id)
if sequence is None:
    # read_chromosome_sequence returns None when no record description contains
    # chromosome_id; fail with a clear message instead of an AttributeError on None.upper().
    raise ValueError(f"No record matching {chromosome_id!r} was found in {FILE_PATH!r}")
sequence = sequence.upper()

Found chromosome: CP068255.2 Homo sapiens isolate CHM13 chromosome X


In [9]:
# `sizes` and `names` are parallel lists. zip() would silently ignore the unmatched
# tail of the longer one, so stop early if they disagree.
if len(sizes) != len(names):
    raise ValueError(
        f"sizes and names must have the same length, got {len(sizes)} and {len(names)}"
    )

for size, name in zip(sizes, names):
    print(f"Generating {name} sequences")
    # Keep only the first `size` characters; use `shorter_sequence = sequence`
    # instead if the whole sequence should be processed.
    shorter_sequence = sequence[:size]

    # Each dataset gets its own output folder.
    subfolder = f"sequences_{name}"
    os.makedirs(subfolder, exist_ok=True)

    # Save the raw (possibly truncated) sequence as plain text.
    with open(f"{subfolder}/sequence_{name}.txt", 'w') as f:
        f.write(shorter_sequence)

    # Build contexts for every w+k from 2 to 39.
    # If only one combination is needed (e.g. k=4 and w=10), it is enough to run
    # this for window_size=14 rather than the entire range.
    for window_size in range(2, 40):
        print(f"Generating {name} sequences with context size {window_size}")
        # The naive windows CSV is only an intermediate: write it, convert it to
        # the odd/even integer representation, then delete it.
        make_naive_windows(window_size, shorter_sequence, name)
        naive_window_to_odd_even(window_size, name)
        delete_naive_window(window_size, name)

    # Extra copies of the sequence, made in order to compare against other methods;
    # not needed for generating low particular density orders.
    save_as_fasta(shorter_sequence, f'{subfolder}/sequence_{name}.fasta')
    save_as_seq(shorter_sequence, f'{subfolder}/sequence_{name}.seq')

Generating 1M sequences
Generating 1M sequences with context size 2
Generating 1M sequences with context size 3
Generating 1M sequences with context size 4
Generating 1M sequences with context size 5
Generating 1M sequences with context size 6
Generating 1M sequences with context size 7
Generating 1M sequences with context size 8
Generating 1M sequences with context size 9
Generating 1M sequences with context size 10
Generating 1M sequences with context size 11
Generating 1M sequences with context size 12
Generating 1M sequences with context size 13
Generating 1M sequences with context size 14
Generating 1M sequences with context size 15
Generating 1M sequences with context size 16
Generating 1M sequences with context size 17
Generating 1M sequences with context size 18
Generating 1M sequences with context size 19
Generating 1M sequences with context size 20
Generating 1M sequences with context size 21
Generating 1M sequences with context size 22
Generating 1M sequences with context si