In [None]:
import subprocess

def getSampleMatrix():
    script = "./drosChr2L.sh"
    n_samples = 5000
    n_samples_per_run = 100
    try:
        result = subprocess.run(
            ["bash", script, str(n_samples), str(n_samples_per_run)],
            capture_output=True,
            text=True,
            check=True,
        )

        print("Output:", result.stdout)
        if result.stderr:
            print("Error:", result.stderr)

    except subprocess.CalledProcessError as e:
        print("Script failed with error:", e.stderr)


getSampleMatrix()

Output: None


Processing file 1 of 1: K.chr1.770000.2705000.txt
Running command: ../bin/sBIF -i ../data/folding_input/K.chr1.770000.2705000.txt -c chr1 -l ../data/chromosome_sizes.txt -s 770000 -e 2705000 -ns 5000 -nr 100 -cl K -o ../output -r 5000 -j chr1 -p 50
Parameters: 
Interaction file :../data/folding_input/K.chr1.770000.2705000.txt
Chromosome :chr1
Chrom lengths file :../data/chromosome_sizes.txt
Start position:770000
End position :2705000
Cell line :K
Output folder :../output
Resolution :5000
Fiber density :0.2368
Number of samples :5000
Number of samples per run :100
Number of potential sphere points :50
Knock-in distance :80
Maximum trials :100
Number of iteractions :100
Job prefix :chr1
Number of threads :50
Bead diameter: 34.2923
Generating samples ...
Inserted K: chr1.770000-2705000 (387 samples) successfully.
Inserted K: chr1.770000-2705000 (387 samples) successfully.
Inserted K: chr1.770000-2705000 (387 samples) successfully.
Inserted K: chr1.770000-2705000 (387 samples) successfully

In [5]:
import subprocess
import re

def getSampleMatrix():
    script = "./drosChr2L.sh"
    n_samples = 5000
    n_samples_per_run = 100
    try:
        result = subprocess.Popen(
            ["bash", script, str(n_samples), str(n_samples_per_run)],
            text=True,
            stdout=subprocess.PIPE,
            bufsize=1,
        )

        pattern = re.compile(r'^\[.*DONE\]')
        for line in result.stdout:
            if pattern.match(line):
                print("Output:", line.strip())

    except subprocess.CalledProcessError as e:
        print("Script failed with error:", e.stderr)


getSampleMatrix()

Output: [position data inserted DONE] All samples' position data with 29.7831 seconds.
Output: [distance data inserted DONE] Inserted all distances into the database in 19.0421 seconds.
Output: [average vector computed DONE] Computed average vector in 0.955892 seconds.
Output: [fq vector computed DONE]Computed frequency condensed vector in 0.882497 seconds.
Output: [fq vector converted DONE]Converted frequency condensed vector to full matrix in 0.00128296 seconds.
Output: [best vector computed DONE]Computed best vector in 3.52203 seconds.
Output: [average vector and fq vector inserted DONE] Inserted average and frequency data into the database in 0.039806 seconds.


In [4]:
import psycopg2
import numpy as np
from time import time
from scipy.spatial.distance import squareform

def query_table():
    conn = psycopg2.connect(
        host="localhost",
        port=5432,
        dbname="test",
        user="siyuanzhao",
    )
    cur = conn.cursor()
    vectors = []
    t3 = time()
    cur.execute("""
        SELECT distance_vector
          FROM distance
         WHERE cell_line   = %s
           AND chrid       = %s
           AND start_value = %s
           AND end_value   = %s
         ORDER BY sampleid
    """, ('IMR', 'chr1', 2455000, 2705000))

    rows = cur.fetchall()
    t4 = time()
    print("Time taken to execute query:", t4 - t3, "seconds")
    for (blob,) in rows:
        print("Processing a row with blob size:", len(blob))
        vec = np.frombuffer(blob, dtype=np.float32)
        vectors.append(vec)
    arr2d = np.stack(vectors, axis=0)

    cur.close()
    conn.close()
    return arr2d

def get_avg_distance_data(vectors):
    vectors = np.array(vectors)
    avg_vector = np.mean(vectors, axis=0)
    matrix_list = squareform(avg_vector).tolist()


    return matrix_list

# Get the frequency data of 5000 chain samples
def get_fq_data(vectors):
    first = vectors[0]
    sum_vec = (first <= 80).astype(int)
    for vec in vectors[1:]:
        sum_vec += (vec <= 80).astype(int)
    avg = sum_vec / len(vectors)

    
    return squareform(avg).tolist()
t1 = time()
arr = query_table()
t2 = time()

t5 = time()
avg_distance = get_avg_distance_data(arr)
t6 = time()
print("Time taken to compute average distance data:", t6 - t5, "seconds")

t7 = time()
fq_data = get_fq_data(arr)
t8 = time()
print("Time taken to compute frequency data:", t8 - t7, "seconds")

print("Time taken to query the table:", t2 - t1, "seconds")
print("Array shape:", arr.shape)
print("Array first 10 elements:", arr[:10])

Time taken to execute query: 0.12284708023071289 seconds
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size: 4900
Processing a row with blob size