In [1]:
import json
import os
import numpy as np
import random
from PIL import Image
from tqdm.auto import tqdm

def generate_fcgr_image(sequence, output_path, k):
    """Save the frequency chaos game representation (FCGR) of a sequence.

    Every length-``k`` window of ``sequence`` is mapped to one cell of a
    ``2**k x 2**k`` grid and that cell's counter is incremented. The counts
    are then scaled so the most frequent cell becomes 255, truncated to
    8-bit integers and written to ``output_path`` as a greyscale image.

    Args:
        sequence: Nucleotide string. Only the uppercase characters ``A``,
            ``T`` and ``G`` move the cell position; every other character
            (``C``, but also ``N``, lowercase letters or whitespace) leaves
            it unchanged and is therefore counted exactly like ``C``.
            Callers should normalise the sequence beforehand.
        output_path: Destination file passed to ``PIL.Image.Image.save``.
        k: k-mer length; the image side is ``2**k`` pixels.

    A sequence with fewer than ``k`` characters contains no k-mer and yields
    an all-black image instead of failing at save time.
    """
    side = 2 ** k
    counts = np.zeros((side, side), dtype=np.float32)

    for start in range(len(sequence) - (k - 1)):
        row, col = 0, 0
        step = side // 2
        # The last base of the k-mer picks the coarsest half of the grid,
        # the first base the finest one.
        for base in reversed(sequence[start:start + k]):
            if base in 'AT':
                row += step
            if base in 'TG':
                col += step
            step //= 2
        counts[row, col] += 1

    peak = np.max(counts)
    if peak > 0:
        counts = counts / peak * 255

    # Convert unconditionally: a float32 array becomes a mode "F" image,
    # which formats such as JPEG cannot store. The original only converted
    # when at least one k-mer had been counted.
    pixels = counts.astype(np.uint8)

    Image.fromarray(pixels).save(output_path)


def _write_random_windows(full_sequence, out_dir, count, k, min_length):
    """Save ``count`` FCGR images of random windows of ``full_sequence`` in ``out_dir``."""
    sequence_length = len(full_sequence)
    for i in range(count):
        # Draw the window length first, then a start that keeps it in bounds.
        random_length = random.randint(min_length, sequence_length)
        start_idx = random.randint(0, sequence_length - random_length)
        random_sequence = full_sequence[start_idx:start_idx + random_length]
        output_path = os.path.join(out_dir, f"image_{i + 1}.jpg")
        generate_fcgr_image(random_sequence, output_path, k)


def create_data(variant_names, sequences, train_len=800, test_len=200, k=8,
                min_length=2000, train_dir=None, test_dir=None):
    """Generate train and test FCGR images from random windows of each variant.

    For every variant, ``train_len`` and then ``test_len`` windows of random
    length and position are cut from the variant's sequence and rendered with
    ``generate_fcgr_image`` to ``<dir>/<variant>/image_<n>.jpg`` (``n`` starts
    at 1). The output directories must already exist.

    Note that train and test windows are sampled independently from the same
    sequence, so they can overlap.

    Args:
        variant_names: Iterable of variant names; each must be a key of
            ``sequences``.
        sequences: Mapping from variant name to its full sequence string.
        train_len: Number of training images per variant.
        test_len: Number of test images per variant.
        k: k-mer length forwarded to ``generate_fcgr_image``.
        min_length: Smallest window length to sample. It is raised to ``k``
            and capped at the sequence length, so a sequence shorter than
            ``min_length`` is sampled as a whole instead of making
            ``random.randint`` fail on an empty range.
        train_dir: Root folder for training images. Defaults to the
            notebook-level ``train_folder`` variable.
        test_dir: Root folder for test images. Defaults to the
            notebook-level ``test_folder`` variable.

    Raises:
        ValueError: If a sequence is shorter than ``k`` and therefore contains
            no k-mer.
    """
    # Fall back to the notebook globals so existing calls keep working.
    train_root = train_folder if train_dir is None else train_dir
    test_root = test_folder if test_dir is None else test_dir

    for variant in tqdm(variant_names):
        full_sequence = sequences[variant]
        sequence_length = len(full_sequence)
        if sequence_length < k:
            raise ValueError(
                f"sequence for variant {variant!r} has {sequence_length} "
                f"characters, fewer than k={k}"
            )

        shortest = min(max(min_length, k), sequence_length)
        _write_random_windows(
            full_sequence, os.path.join(train_root, variant), train_len, k, shortest
        )
        _write_random_windows(
            full_sequence, os.path.join(test_root, variant), test_len, k, shortest
        )

In [2]:
# Configuration: everything a reader might want to tune lives here.
SEED = 42          # fixed so the randomly sampled windows are reproducible
TRAIN_IMAGES = 9000
TEST_IMAGES = 1000
KMER_SIZE = 8

base_output_folder = "data/virus"
# create_data reads train_folder / test_folder as notebook-level variables.
train_folder = os.path.join(base_output_folder, "train")
test_folder = os.path.join(base_output_folder, "test")

# create_data draws windows with the global `random` module; seed it so a
# Restart & Run All regenerates the same dataset.
random.seed(SEED)

# The keys of the accession-id mapping are the variant names.
with open('accession_ids.json', 'r') as json_file:
    accession_id_dict = json.load(json_file)
variant_names = list(accession_id_dict.keys())

# One sub-folder per variant under both the train and the test root.
for folder in [train_folder, test_folder]:
    os.makedirs(folder, exist_ok=True)
    for variant in variant_names:
        os.makedirs(os.path.join(folder, variant), exist_ok=True)

sequences = {}
for variant in variant_names:
    with open(f"sequence_data/{variant}.txt", "r") as file:
        # Remove ALL whitespace (not just leading/trailing) and upper-case:
        # generate_fcgr_image only recognises uppercase A/T/G and would count
        # interior newlines or lowercase bases as "C".
        # NOTE(review): assumes the files hold the raw sequence only (no
        # FASTA header line) - confirm.
        sequences[variant] = "".join(file.read().split()).upper()

create_data(variant_names, sequences, train_len=TRAIN_IMAGES, test_len=TEST_IMAGES, k=KMER_SIZE)

  0%|          | 0/12 [00:00<?, ?it/s]