# General

In [6]:
import json
import numpy as np
import pandas as pd

def save_percentile_vectors_to_csv(json_path, output_csv, data_type='colon', vector_indices=None, average_vectors=False):
    # Load JSON
    with open(json_path, 'r') as f:
        all_results = json.load(f)

    # Create labels_dict
    labels_dict = {}
    for sample_name in all_results:
        if data_type == 'ovarian':
            if sample_name.endswith("_H"):
                labels_dict[sample_name] = 0
            elif sample_name.endswith("_OC"):
                labels_dict[sample_name] = 1
            else:
                raise Exception(f"Invalid ovarian sample: {sample_name}")
        elif data_type == 'colon':
            if sample_name.endswith("_low"):
                labels_dict[sample_name] = 0
            elif sample_name.endswith("_high"):
                labels_dict[sample_name] = 1
            else:
                raise Exception(f"Invalid colon sample: {sample_name}")
        elif data_type == 'kidney':
            if "STA" in sample_name:
                labels_dict[sample_name] = 0
            elif "AR" in sample_name:
                labels_dict[sample_name] = 1
            else:
                raise Exception(f"Invalid kidney sample: {sample_name}")
        elif data_type == 'corona':
            if "TCRB_M" in sample_name:
                labels_dict[sample_name] = 0
            elif "TCRB_F" in sample_name:
                labels_dict[sample_name] = 1
            else:
                raise Exception(f"Invalid corona sample: {sample_name}")

    # Process samples
    rows = []

    if average_vectors:
        for sample_name, percentiles_dict in all_results.items():
            vectors = np.array([np.array(v) for v in percentiles_dict.values()])
            avg_vector = np.mean(vectors, axis=0)
            rows.append([sample_name, labels_dict[sample_name]] + avg_vector.tolist())
    else:
        # Get max length for consistent padding
        if vector_indices is None:
            max_len = max(
                sum(len(v) for v in d.values())
                for d in all_results.values()
            )
        else:
            max_len = max(
                sum(len(v) for i, v in enumerate(d.values()) if i in vector_indices)
                for d in all_results.values()
            )

        for sample_name, percentiles_dict in all_results.items():
            vectors = list(percentiles_dict.values())
            if vector_indices is not None:
                selected_vectors = [vectors[i] for i in vector_indices if i < len(vectors)]
            else:
                selected_vectors = vectors

            flattened = np.concatenate([np.array(v) for v in selected_vectors])
            padded = np.pad(flattened, (0, max_len - len(flattened)), mode='constant')
            rows.append([sample_name, labels_dict[sample_name]] + padded.tolist())

    # Create and save DataFrame
    df = pd.DataFrame(rows)
    df.columns = ["Sample", "Label"] + [f"f{i}" for i in range(len(rows[0]) - 2)]
    df.to_csv(output_csv, index=False)
    print(f"Saved to: {output_csv}")


save_percentile_vectors_to_csv(
    json_path="/home/dsi/orrbavly/GNN_project/embeddings/kidney_percentiles/perc_results_cos_3_21k_clonotype.json",
    output_csv="/home/dsi/orrbavly/GNN_project/embeddings/kidney_percentiles/perc_cos_3_21k_clonotype.csv",
    data_type="kidney",
    vector_indices=None,      
    average_vectors=False     
)

Saved to: /home/dsi/orrbavly/GNN_project/embeddings/kidney_percentiles/perc_cos_3_21k_clonotype.csv


# Adding Sequences column to embedding files

In [6]:
import pandas as pd

df = pd.read_csv("/dsi/scratch/home/dsi/orrbavly/corona_data/embeddings_new/03855000011605_TCRB_M_seqs.csv")

In [12]:
df2 = pd.read_csv("/dsi/scratch/home/dsi/orrbavly/corona_data/embeddings/03845000014380_TCRB_F.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/dsi/scratch/home/dsi/orrbavly/corona_data/embeddings/03845000014380_TCRB_F.csv'

In [5]:
import os
import csv
import re

# Directories
original_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/original_files/"
embedding_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/embeddings/"
output_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/embeddings_new/"

os.makedirs(output_dir, exist_ok=True)

def merge_large_csvs(original_path, embedding_path, output_path):
    try:
        with open(original_path, 'r') as f_seq, open(embedding_path, 'r') as f_emb, open(output_path, 'w', newline='') as f_out:
            reader_seq = csv.reader(f_seq)
            reader_emb = csv.reader(f_emb)
            writer = csv.writer(f_out)

            # Read headers
            seq_header = next(reader_seq)
            emb_header = next(reader_emb)

            if "Sequences" in emb_header:
                print(f"{embedding_path} already has 'Sequences' column — skipping.")
                return

            writer.writerow(["Sequences"] + emb_header)

            row_count = 0
            for seq_row, emb_row in zip(reader_seq, reader_emb):
                sequence = seq_row[0].strip()  # Always take first column
                writer.writerow([sequence] + emb_row)
                row_count += 1

        # Verify output row count
        written_lines = sum(1 for _ in open(output_path)) - 1  # exclude header
        if written_lines == row_count:
            os.remove(embedding_path)
            print(f"✅ Merged {embedding_path} → {output_path}, removed original.")
        else:
            print(f"⚠️ Row mismatch in {output_path}. Expected {row_count}, got {written_lines}. Deleting new file.")
            os.remove(output_path)

    except Exception as e:
        print(f"❌ Error processing {embedding_path}: {e}")
        if os.path.exists(output_path):
            os.remove(output_path)
            print(f"🗑️ Deleted incomplete file: {output_path}")

# Iterate through all embedding files
for emb_file in os.listdir(embedding_dir):
    if emb_file.endswith(".csv") and ("_F" in emb_file or "_M" in emb_file):
        # Match pattern like: 03855000014337_TCRB_F.csv
        match = re.match(r"^(\d+_TCRB)_[FM]\.csv$", emb_file)
        if not match:
            print(f"⚠️ Could not parse embedding file: {emb_file}")
            continue

        base_id = match.group(1)  # e.g., 03855000014337_TCRB
        gender_suffix = "_F" if "_F.csv" in emb_file else "_M"

        embedding_path = os.path.join(embedding_dir, emb_file)
        original_path = os.path.join(original_dir, f"{base_id}.csv")
        output_file = f"{base_id}{gender_suffix}_seqs.csv"
        output_path = os.path.join(output_dir, output_file)

        if not os.path.exists(original_path):
            print(f"⚠️ Original file missing for: {emb_file} → expected {base_id}.csv")
            continue

        if os.path.exists(output_path):
            print(f"🔁 Output already exists: {output_path}")
            continue

        merge_large_csvs(original_path, embedding_path, output_path)


✅ Merged /dsi/scratch/home/dsi/orrbavly/corona_data/embeddings/03855000011605_TCRB_M.csv → /dsi/scratch/home/dsi/orrbavly/corona_data/embeddings_new/03855000011605_TCRB_M_seqs.csv, removed original.


KeyboardInterrupt: 

In [None]:
import os
import pandas as pd

# Paths
original_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/original_files/"
embedding_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/embeddings/"
output_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/embeddings/"
os.makedirs(output_dir, exist_ok=True)

# List all embedding files
for emb_file in os.listdir(embedding_dir):
    if emb_file.endswith("_embedded.csv"):
        sample_prefix = emb_file.replace("_embedded.csv", "")
        original_file = sample_prefix + ".csv"

        original_path = os.path.join(original_dir, original_file)
        embedding_path = os.path.join(embedding_dir, emb_file)
        output_path = os.path.join(output_dir, emb_file)

        # Skip if output already exists
        if os.path.exists(output_path):
            print(f"Skipping {emb_file} (already merged).")
            continue

        # Skip if original sequence file is missing
        if not os.path.exists(original_path):
            print(f"Missing original for {emb_file}")
            continue

        print(f"Merging: {original_file} + {emb_file}")

        try:
            # Load Sequences
            seqs = pd.read_csv(original_path, usecols=["Sequences"]).reset_index(drop=True)

            # Load embeddings (assumed high precision float values)
            emb = pd.read_csv(embedding_path).reset_index(drop=True)

            # Sanity check
            assert len(seqs) == len(emb), "Row mismatch!"

            # Prepend Sequences column to embeddings
            merged = pd.concat([seqs, emb], axis=1)

            # Save preserving float precision (max safe)
            merged.to_csv(output_path, index=False, float_format="%.16g")

        except Exception as e:
            print(f"Error merging {sample_prefix}: {e}")

Skipping P5-S24_TRB_new_embedded.csv (already merged).
Skipping P8-S14_TRB_new_embedded.csv (already merged).
Skipping P5-S11_TRB_new_embedded.csv (already merged).
Skipping P3-S9_TRB_new_embedded.csv (already merged).
Skipping P9-S18_TRB_new_embedded.csv (already merged).
Skipping P8-S24_TRB_new_embedded.csv (already merged).
Skipping P4-S1_TRB_new_embedded.csv (already merged).
Skipping P3-S19_TRB_new_embedded.csv (already merged).
Skipping P8-S20_TRB_new_embedded.csv (already merged).
Skipping P6-S23_TRB_new_embedded.csv (already merged).
Skipping P4-S23_TRB_new_embedded.csv (already merged).
Skipping P7-S19_TRB_new_embedded.csv (already merged).
Skipping P2-S6_TRB_new_embedded.csv (already merged).
Skipping P1-S1_TRB_new_embedded.csv (already merged).
Skipping P1-S5_TRB_new_embedded.csv (already merged).
Skipping P1-S18_TRB_new_embedded.csv (already merged).
Skipping P6-S11_TRB_new_embedded.csv (already merged).
Skipping P8-S3_TRB_new_embedded.csv (already merged).
Skipping P9-S3_T

## check concatanated files

In [5]:
orig = pd.read_csv("/dsi/sbm/OrrBavly/colon_data/new_mixcr/TRB/downsamples_209378/embeddings/P8-S24_TRB_new_embedded.csv")
new = pd.read_csv("/dsi/sbm/OrrBavly/colon_data/new_mixcr/TRB/downsamples_209378/embeddings_new/P8-S24_TRB_new_embedded.csv").drop(columns=["Sequences"])

with open("/dsi/sbm/OrrBavly/colon_data/new_mixcr/TRB/downsamples_209378/embeddings/P8-S24_TRB_new_embedded.csv", "r") as f1, open("/dsi/sbm/OrrBavly/colon_data/new_mixcr/TRB/downsamples_209378/embeddings_new/P8-S24_TRB_new_embedded.csv", "r") as f2:
    print(f1.readline())
    print(f2.readline())

print("Old shape:", orig.shape)
print("New shape:", new.shape)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,27

In [1]:
import os

embedding_dir = "/dsi/sbm/OrrBavly/colon_data/new_mixcr/TRB/downsamples_209378/embeddings/"
merged_dir = "/dsi/sbm/OrrBavly/colon_data/new_mixcr/TRB/downsamples_209378/embeddings_new/"

results = []

for emb_file in os.listdir(embedding_dir):
    if not emb_file.endswith("_embedded.csv"):
        continue

    merged_file = os.path.join(merged_dir, emb_file)
    original_file = os.path.join(embedding_dir, emb_file)

    if os.path.exists(merged_file):
        original_size = os.path.getsize(original_file)
        merged_size = os.path.getsize(merged_file)
        diff = merged_size - original_size
        pct_change = 100 * diff / original_size

        results.append({
            "file": emb_file,
            "original_MB": round(original_size / 1e6, 2),
            "merged_MB": round(merged_size / 1e6, 2),
            "change_MB": round(diff / 1e6, 2),
            "pct_change": round(pct_change, 2)
        })

# Convert to DataFrame for nice viewing
import pandas as pd
df_sizes = pd.DataFrame(results)
print(df_sizes.sort_values(by="pct_change"))


                            file  original_MB  merged_MB  change_MB  \
28   P4-S14_TRB_new_embedded.csv      1153.47    1117.61     -35.86   
8    P8-S20_TRB_new_embedded.csv      1028.35     996.51     -31.84   
172   P9-S2_TRB_new_embedded.csv       849.85     823.47     -26.38   
9    P6-S23_TRB_new_embedded.csv      1498.87    1452.45     -46.41   
165   P5-S3_TRB_new_embedded.csv       854.33     827.85     -26.48   
..                           ...          ...        ...        ...   
92   P5-S12_TRB_new_embedded.csv       200.28     194.15      -6.13   
91   P5-S20_TRB_new_embedded.csv        99.89      96.83      -3.06   
4    P9-S18_TRB_new_embedded.csv       205.24     198.96      -6.28   
40    P1-S6_TRB_new_embedded.csv        88.19      85.49      -2.70   
107  P8-S15_TRB_new_embedded.csv       162.58     157.62      -4.96   

     pct_change  
28        -3.11  
8         -3.10  
172       -3.10  
9         -3.10  
165       -3.10  
..          ...  
92        -3.06  
91 

In [None]:
import pandas as pd

original_dir = "/dsi/sbm/OrrBavly/colon_data/new_mixcr/TRB/downsamples_209378/original_data/"
embedding_dir = "/dsi/sbm/OrrBavly/colon_data/new_mixcr/TRB/downsamples_209378/embeddings/"
merged_dir = "/dsi/sbm/OrrBavly/colon_data/new_mixcr/TRB/downsamples_209378/embeddings_new/"

records = []

for emb_file in os.listdir(embedding_dir):
    if not emb_file.endswith("_embedded.csv"):
        continue

    sample_prefix = emb_file.replace("_embedded.csv", "")
    original_file = sample_prefix + ".csv"
    merged_file = emb_file

    original_path = os.path.join(original_dir, original_file)
    embedding_path = os.path.join(embedding_dir, emb_file)
    merged_path = os.path.join(merged_dir, merged_file)

    if not (os.path.exists(original_path) and os.path.exists(merged_path)):
        continue

    try:
        original_len = pd.read_csv(original_path, usecols=["Sequences"]).shape[0]
        embedding_len = pd.read_csv(embedding_path).shape[0]
        merged_len = pd.read_csv(merged_path).shape[0]

        records.append({
            "file": emb_file,
            "original_rows": original_len,
            "embedding_rows": embedding_len,
            "merged_rows": merged_len,
            "rows_lost": (embedding_len + 1) - merged_len  # +1 for added Sequences
        })

    except Exception as e:
        print(f"Error processing {emb_file}: {e}")

df_check = pd.DataFrame(records)
print(df_check[df_check["rows_lost"] != 0].sort_values("rows_lost", ascending=False))


Error processing P5-S24_TRB_new_embedded.csv: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.


In [3]:
print(df_check.head)

<bound method NDFrame.head of                             file  original_rows  embedding_rows  merged_rows  \
0    P5-S24_TRB_new_embedded.csv          48942           48942        48942   
1    P8-S14_TRB_new_embedded.csv           5291            5291         5291   
2    P5-S11_TRB_new_embedded.csv          37843           37843        37843   
3     P3-S9_TRB_new_embedded.csv          41897           41897        41897   
4    P9-S18_TRB_new_embedded.csv          13355           13355        13355   
..                           ...            ...             ...          ...   
206   P7-S4_TRB_new_embedded.csv          12373           12373        12373   
207  P5-S15_TRB_new_embedded.csv          55345           55345        55345   
208  P2-S18_TRB_new_embedded.csv           9143            9143         9143   
209   P7-S3_TRB_new_embedded.csv         120951          120951       120951   
210  P5-S13_TRB_new_embedded.csv          44274           44274        44274   

     rows

In [2]:
import pandas as pd
import re
df = pd.read_csv("/home/dsi/orrbavly/GNN_project/data/metadata/colon_meta.csv")

def extract_sample_id(filename):
    match = re.search(r'pool(\d+)_S(\d+)', filename)
    if match:
        pool, sample = match.groups()
        return f"P{pool}-S{sample}"
    else:
        return None  # or return filename if you want to keep unmatched

# Apply to create new column
df['sample_id'] = df['filename'].apply(extract_sample_id)

# Preview
print(df[['filename', 'sample_id']].head())
df.to_csv("/home/dsi/orrbavly/GNN_project/data/metadata/colon_meta.csv")

                            filename sample_id
0   pool1_S1_TRB_mig_cdr3_clones_all     P1-S1
1  pool1_S10_TRB_mig_cdr3_clones_all    P1-S10
2  pool1_S11_TRB_mig_cdr3_clones_all    P1-S11
3  pool1_S12_TRB_mig_cdr3_clones_all    P1-S12
4  pool1_S13_TRB_mig_cdr3_clones_all    P1-S13


In [2]:
import os
import pandas as pd

# Directories
tcr_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/original_files/"
embedding_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/embeddings/"
new_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/new_embeddings/"

# Get list of TCR sequence files
tcr_files = [f for f in os.listdir(tcr_dir) if f.endswith(".csv")]

# Counters for statistics
processed_count = 0
skipped_count = 0

for tcr_file in tcr_files:
    # Construct possible embedding file names
    base_name = tcr_file.replace("_TCRB.csv", "")
    embedding_files = [
        f"{base_name}_TCRB_M.csv",
        f"{base_name}_TCRB_F.csv"
    ]

    # Find the corresponding embedding file
    matching_embedding_file = None
    for emb_file in embedding_files:
        if emb_file in os.listdir(embedding_dir):
            matching_embedding_file = emb_file
            break

    if not matching_embedding_file:
        print(f"Embedding file not found for {tcr_file}, skipping...")
        skipped_count += 1
        continue

    # Load data
    tcr_path = os.path.join(tcr_dir, tcr_file)
    emb_path = os.path.join(embedding_dir, matching_embedding_file)

    df_tcr = pd.read_csv(tcr_path, usecols=["Sequences", "templates"])
    df_emb = pd.read_csv(emb_path)

    # Check if row counts match
    if len(df_tcr) != len(df_emb):
        print(f"Row mismatch for {tcr_file} ({len(df_tcr)} vs {len(df_emb)}), skipping...")
        skipped_count += 1
        continue

    # Merge data: Add "Sequences" and "templates" as the first two columns
    df_emb.insert(0, "Sequences", df_tcr["Sequences"])

    new_path = os.path.join(new_dir, matching_embedding_file)
    # Save back to the same file
    df_emb.to_csv(new_path, index=False)
    print(f"Updated {matching_embedding_file} successfully.")
    processed_count += 1
    if processed_count == 5:
        break
# Print summary
print("\nProcessing completed.")
print(f"Successfully updated: {processed_count}")
print(f"Skipped due to issues: {skipped_count}")


Updated 03855000011616_TCRB_F.csv successfully.
Updated 03855000011623_TCRB_M.csv successfully.
Updated 03855000011624_TCRB_M.csv successfully.
Updated 03855000011628_TCRB_F.csv successfully.
Updated 03855000011629_TCRB_M.csv successfully.

Processing completed.
Successfully updated: 5
Skipped due to issues: 0


In [3]:
import os
import csv

# Directories
tcr_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/original_files/"
embedding_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/embeddings/"
new_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/new_embeddings/"

# Ensure new_dir exists
os.makedirs(new_dir, exist_ok=True)

# Get list of TCR sequence files
tcr_files = [f for f in os.listdir(tcr_dir) if f.endswith(".csv")]

# Counters for statistics
processed_count = 0
skipped_count = 0

for tcr_file in tcr_files:
    # Construct possible embedding file names
    base_name = tcr_file.replace("_TCRB.csv", "")
    embedding_files = [
        f"{base_name}_TCRB_M.csv",
        f"{base_name}_TCRB_F.csv"
    ]

    # Find the corresponding embedding file
    matching_embedding_file = None
    for emb_file in embedding_files:
        if emb_file in os.listdir(embedding_dir):
            matching_embedding_file = emb_file
            break

    if not matching_embedding_file:
        print(f"Embedding file not found for {tcr_file}, skipping...")
        skipped_count += 1
        continue

    # File paths
    tcr_path = os.path.join(tcr_dir, tcr_file)
    emb_path = os.path.join(embedding_dir, matching_embedding_file)
    output_path = os.path.join(new_dir, matching_embedding_file)  # Save to new_dir

    # Open both files using csv.reader and csv.writer
    with open(tcr_path, "r") as tcr_f, open(emb_path, "r") as emb_f, open(output_path, "w", newline='') as out_f:
        tcr_reader = csv.reader(tcr_f)
        emb_reader = csv.reader(emb_f)
        writer = csv.writer(out_f)

        # Read headers (if present)
        tcr_header = next(tcr_reader, None)
        emb_header = next(emb_reader, None)

        # Write new header (if present)
        if emb_header:
            new_header = ["Sequences"] + emb_header
            writer.writerow(new_header)

        # Process rows one by one
        row_count_tcr = 0
        row_count_emb = 0

        for tcr_row, emb_row in zip(tcr_reader, emb_reader):
            writer.writerow([tcr_row[0]] + emb_row)  # Add Sequences column
            row_count_tcr += 1
            row_count_emb += 1

        # Check row count
        if row_count_tcr != row_count_emb:
            print(f"Row mismatch for {tcr_file} ({row_count_tcr} vs {row_count_emb}), skipping update...")
            skipped_count += 1
            os.remove(output_path)  # Delete incomplete file
            continue

    print(f"Updated {matching_embedding_file} successfully in {new_dir}/")
    processed_count += 1
    if processed_count == 5:
        break

# Print summary
print("\nProcessing completed.")
print(f"Successfully updated: {processed_count}")
print(f"Skipped due to issues: {skipped_count}")


Updated 03855000011616_TCRB_F.csv successfully in /dsi/scratch/home/dsi/orrbavly/corona_data/new_embeddings//
Updated 03855000011623_TCRB_M.csv successfully in /dsi/scratch/home/dsi/orrbavly/corona_data/new_embeddings//
Updated 03855000011624_TCRB_M.csv successfully in /dsi/scratch/home/dsi/orrbavly/corona_data/new_embeddings//
Updated 03855000011628_TCRB_F.csv successfully in /dsi/scratch/home/dsi/orrbavly/corona_data/new_embeddings//
Updated 03855000011629_TCRB_M.csv successfully in /dsi/scratch/home/dsi/orrbavly/corona_data/new_embeddings//

Processing completed.
Successfully updated: 5
Skipped due to issues: 0


In [1]:
import json
import os

# Define the folder containing JSON files and the output file name
input_folder = "/home/dsi/orrbavly/GNN_project/embeddings/corona_percentiles/checkpoints"  # Change to your actual folder
output_file = "/home/dsi/orrbavly/GNN_project/embeddings/corona_percentiles/subsets/perc_faiss_cos_every5_1-867.json"

# Initialize an empty dictionary to store merged data
merged_data = {}

# Loop through each JSON file in the directory
for file_name in os.listdir(input_folder):
    if file_name.endswith(".json"):  # Ensure it's a JSON file
        file_path = os.path.join(input_folder, file_name)
        with open(file_path, "r") as file:
            data = json.load(file)  # Load JSON content
            merged_data.update(data)  # Merge into the main dictionary

# Save the merged data to a new JSON file
with open(output_file, "w") as out_file:
    json.dump(merged_data, out_file, indent=4)  # Pretty-print JSON

print(f"Merged JSON saved as {output_file}")


Merged JSON saved as /home/dsi/orrbavly/GNN_project/embeddings/corona_percentiles/subsets/perc_faiss_cos_every5_1-867.json


In [1]:
import pandas as pd
import numpy as np
import os

In [11]:
import os

def count_lines(file_path):
    """Returns the number of lines in a file."""
    with open(file_path, 'r') as f:
        return sum(1 for _ in f)

def check_files(original_dir='/dsi/scratch/home/dsi/orrbavly/corona_data/original_files/', embeddings_dir='/dsi/scratch/home/dsi/orrbavly/corona_data/embeddings/'):
    mismatched_files = []
    counter = 0
    for file_name in os.listdir(original_dir):
        original_path = os.path.join(original_dir, file_name)
        base_name, ext = os.path.splitext(file_name)
        embedding_file_name = f"{base_name}_embedded{ext}"
        embedding_path = os.path.join(embeddings_dir, embedding_file_name)
        
        if os.path.isfile(original_path) and os.path.isfile(embedding_path):
            print(f"Working on file: {file_name}")
            counter += 1
            original_lines = count_lines(original_path)
            embedding_lines = count_lines(embedding_path)
            
            if original_lines != embedding_lines:
                mismatched_files.append(file_name)
    print(f"Counter: {counter}")
    for file in mismatched_files:
        print(file)
    print(f"Total mismatched files: {len(mismatched_files)}")

if __name__ == "__main__":
    check_files()


Working on file: 03855000011616_TCRB.csv
Working on file: 03855000011623_TCRB.csv
Working on file: 03855000011624_TCRB.csv
Working on file: 03855000011628_TCRB.csv
Working on file: 03855000011629_TCRB.csv


KeyboardInterrupt: 

In [5]:
import os
import shutil
import sys

def move_files(file_list, source_dir, destination_dir):
      # Ensure source directory exists
    if not os.path.isdir(source_dir):
        print(f"Error: Source directory '{source_dir}' does not exist.")
        sys.exit(1)

    moved_count = 0
    mismatched_files = []

    # Read the file list and move files
    with open(file_list, "r", encoding="utf-8") as f:
        for line in f:
            basename = line.strip()  # Trim spaces and newlines

            if not basename:
                continue  # Skip empty lines

            # Extract file name and extension
            base_name, ext = os.path.splitext(basename)

            # Append "_embedded" before the extension
            embedded_file_name = f"{base_name}_embedded{ext}"
            print (embedded_file_name)
            src_file = os.path.join(source_dir, embedded_file_name)
            dest_file = os.path.join(destination_dir, embedded_file_name)

            if os.path.exists(src_file):
                shutil.move(src_file, dest_file)
                print(f"Moved: {src_file} -> {dest_file}")
                moved_count += 1
            else:
                print(f"Warning: File '{src_file}' not found.")
                mismatched_files.append(basename)

    print(f"\nFile moving process completed. Total files moved: {moved_count}")
    
    if mismatched_files:
        print("\nFiles not found:")
        for file in mismatched_files:
            print(file)

move_files("/home/dsi/orrbavly/GNN_project/testing_scripts/scripts/list_to_move.txt",  "/dsi/scratch/home/dsi/orrbavly/corona_data/embeddings",
                "/dsi/scratch/home/dsi/orrbavly/corona_data/watchlist_embedding")


Copy of ADIRP0002497_TCRB_embedded.csv
Copy of ADIRP0002519_TCRB_embedded.csv
Copy of ADIRP0002887_TCRB_embedded.csv
Copy of INCOV004-AC-5_TCRB_embedded.csv
Copy of INCOV015-AC-5_TCRB_embedded.csv
Copy of INCOV019-BL-5_TCRB_embedded.csv
Copy of INCOV026-AC-3_TCRB_embedded.csv
Copy of INCOV027-CV-3_TCRB_embedded.csv
Copy of INCOV033-BL-3_TCRB_embedded.csv
Copy of INCOV042-AC-3_TCRB_embedded.csv
Copy of INCOV042-CV-3_TCRB_embedded.csv
Copy of INCOV043-BL-3_TCRB_embedded.csv
Copy of INCOV044-AC-3_TCRB_embedded.csv
Copy of INCOV048-BL-3_TCRB_embedded.csv
Copy of INCOV054-BL-3_TCRB_embedded.csv
Copy of INCOV058-BL-3_TCRB_embedded.csv
Copy of INCOV068-BL-3_TCRB_embedded.csv
Copy of INCOV069-AC-3_TCRB_embedded.csv
Copy of INCOV077-BL-3_TCRB_embedded.csv
Copy of ADIRP0002301_TCRB_embedded.csv
Copy of ADIRP0002425_TCRB_embedded.csv
Copy of ADIRP0002512_TCRB_embedded.csv
Copy of BS-EQ-34-T1-replacement_TCRB_embedded.csv
Copy of BS-GIGI_41-replacement_TCRB_embedded.csv
Copy of BS-GIGI_89-replacem

In [1]:
import os

original_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/original_files/"
embeddings_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/embeddings/"

# Get the list of files in original_dir and embeddings_dir
original_files = os.listdir(original_dir)
embedding_files = os.listdir(embeddings_dir)

# Extract base names from embedding_files to check against original_files
embedding_base_names = [os.path.splitext(file_name)[0].replace("_embedded", "") for file_name in embedding_files]

# Check for files in original_files that are not in embedding_files
missing_files = [file for file in original_files if os.path.splitext(file)[0] not in embedding_base_names]

# Count and print the missing files
print(f"Number of files in original_dir but not in embeddings_dir: {len(missing_files)}")
print("Missing files:")
for file in missing_files:
    print(file)


Number of files in original_dir but not in embeddings_dir: 247
Missing files:
Copy of INCOV039-BL-3_TCRB.csv
Copy of INCOV052-AC-3_TCRB.csv
Copy of INCOV066-BL-3_TCRB.csv
Copy of INCOV081-AC-3_TCRB.csv
Copy of KH20-09657_TCRB.csv
Copy of KH20-09672_TCRB.csv
Copy of KH20-09682_TCRB.csv
Copy of KH20-09687_TCRB.csv
Copy of KH20-09701_TCRB.csv
Copy of KH20-09704_TCRB.csv
Copy of KH20-09705_TCRB.csv
Copy of KH20-09706_TCRB.csv
Copy of KH20-09709_TCRB.csv
Copy of KH20-09710_TCRB.csv
Copy of KH20-09712_TCRB.csv
Copy of KH20-09714_TCRB.csv
Copy of KH20-09717_TCRB.csv
Copy of KH20-09718_TCRB.csv
Copy of KH20-09721_TCRB.csv
Copy of KH20-09724_TCRB.csv
Copy of KH20-09729_TCRB.csv
Copy of KH20-09731_TCRB.csv
Copy of KH20-09752_TCRB.csv
Copy of KH20-09951_TCRB.csv
Copy of KH20-09961_TCRB.csv
Copy of KH20-09962_TCRB.csv
Copy of KH20-09964_TCRB.csv
Copy of KH20-09965_TCRB.csv
Copy of KH20-09967_TCRB.csv
Copy of KH20-09973_TCRB.csv
Copy of KH20-09974_TCRB.csv
Copy of KH20-09978_TCRB.csv
Copy of KH20-0

# Analyzing Corona data

In [2]:
# Path to the directory containing the copied files
directory_path = "/dsi/scratch/home/dsi/orrbavly/corona_data/original_files/"

# Column name to rename
old_column_name = "amino_acid"
new_column_name = "Sequences"

# Loop through all files in the directory
for file_name in os.listdir(directory_path):
    if file_name.endswith(".tsv"):  # Process only CSV files
        file_path = os.path.join(directory_path, file_name)
        
        # Load the CSV file
        df = pd.read_csv(file_path, sep='\t')
        
        # Rename the column if it exists
        if old_column_name in df.columns:
            df.rename(columns={old_column_name: new_column_name}, inplace=True)
            print(f"Renaming column in: {file_name}")
        
        # Save the updated CSV back to the same location
        df.to_csv(file_path, index=False)

print("All column names updated.")

Renaming column in: 03845000013199_TCRB.tsv
Renaming column in: 03855000011610_TCRB.tsv
Renaming column in: 03855000011616_TCRB.tsv
Renaming column in: 03855000011623_TCRB.tsv
Renaming column in: 03855000011624_TCRB.tsv
Renaming column in: 03855000011628_TCRB.tsv
Renaming column in: 03855000011629_TCRB.tsv
Renaming column in: 03855000011630_TCRB.tsv
Renaming column in: 03855000011635_TCRB.tsv
Renaming column in: 03855000011646_TCRB.tsv
Renaming column in: 03855000011647_TCRB.tsv
Renaming column in: 03855000011650_TCRB.tsv
Renaming column in: 03855000011651_TCRB.tsv
Renaming column in: 03855000011652_TCRB.tsv
Renaming column in: 03855000011656_TCRB.tsv
Renaming column in: 03855000011657_TCRB.tsv
Renaming column in: 03855000011658_TCRB.tsv
Renaming column in: 03855000011662_TCRB.tsv
Renaming column in: 03855000011664_TCRB.tsv
Renaming column in: 03855000011665_TCRB.tsv
Renaming column in: 03855000011667_TCRB.tsv
Renaming column in: 03855000011671_TCRB.tsv
Renaming column in: 038550000116

In [2]:
# Define the input directory containing TSV files
input_dir = "/dsi/scratch/home/dsi/orrbavly/corona_data/original_files/"
# Loop through all files in the input directory
for file_name in os.listdir(input_dir):
    if file_name.endswith(".tsv"):  # Process only .tsv files
        # Construct the full file path
        file_path = os.path.join(input_dir, file_name)
        
        # Load the TSV file
        df = pd.read_csv(file_path, sep="\t")
        
        # Overwrite the file as a CSV (same name, but .csv extension)
        new_file_path = os.path.join(input_dir, file_name.replace(".tsv", ".csv"))
        df.to_csv(new_file_path, index=False)
        
        # Optionally, remove the old .tsv file
        os.remove(file_path)
        print(f"Converted and replaced {file_name} with {os.path.basename(new_file_path)}")

print("All TSV files have been successfully converted to CSV.")



Converted and replaced 03845000013199_TCRB.tsv with 03845000013199_TCRB.csv
Converted and replaced 03855000011610_TCRB.tsv with 03855000011610_TCRB.csv
Converted and replaced 03855000011616_TCRB.tsv with 03855000011616_TCRB.csv
Converted and replaced 03855000011623_TCRB.tsv with 03855000011623_TCRB.csv
Converted and replaced 03855000011624_TCRB.tsv with 03855000011624_TCRB.csv
Converted and replaced 03855000011628_TCRB.tsv with 03855000011628_TCRB.csv
Converted and replaced 03855000011629_TCRB.tsv with 03855000011629_TCRB.csv
Converted and replaced 03855000011630_TCRB.tsv with 03855000011630_TCRB.csv
Converted and replaced 03855000011635_TCRB.tsv with 03855000011635_TCRB.csv
Converted and replaced 03855000011646_TCRB.tsv with 03855000011646_TCRB.csv
Converted and replaced 03855000011647_TCRB.tsv with 03855000011647_TCRB.csv
Converted and replaced 03855000011650_TCRB.tsv with 03855000011650_TCRB.csv
Converted and replaced 03855000011651_TCRB.tsv with 03855000011651_TCRB.csv
Converted an

In [3]:
import os
import pandas as pd

# Directory containing the CSV files with the combined column and combined values
directory_path = "/dsi/scratch/home/dsi/orrbavly/corona_data/original_files/"

# Loop through all CSV files in the directory
for file_name in os.listdir(directory_path):
    if file_name.endswith(".csv"):  # Process only CSV files
        file_path = os.path.join(directory_path, file_name)
        
        # Load the CSV file
        df = pd.read_csv(file_path)
        
        # Check if the combined column exists
        if "Sequences,templates" in df.columns:
            # Split the combined column into two separate columns
            df[['Sequences', 'templates']] = df['Sequences,templates'].str.split(',', expand=True)
            
            # Drop the original combined column
            df.drop(columns=["Sequences,templates"], inplace=True)
            
            # Save the fixed file back
            df.to_csv(file_path, index=False)
            print(f"Fixed and saved: {file_name}")

print("All files have been fixed.")


Fixed and saved: 03855000011616_TCRB.csv
Fixed and saved: 03855000011623_TCRB.csv
Fixed and saved: 03855000011624_TCRB.csv
Fixed and saved: 03855000011628_TCRB.csv
Fixed and saved: 03855000011629_TCRB.csv
Fixed and saved: 03855000011630_TCRB.csv
Fixed and saved: 03855000011635_TCRB.csv
Fixed and saved: 03855000011647_TCRB.csv
Fixed and saved: 03855000011650_TCRB.csv
Fixed and saved: 03855000011651_TCRB.csv
Fixed and saved: 03855000011652_TCRB.csv
Fixed and saved: 03855000011656_TCRB.csv
Fixed and saved: 03855000011657_TCRB.csv
Fixed and saved: 03855000011658_TCRB.csv
Fixed and saved: 03855000011664_TCRB.csv
Fixed and saved: 03855000011665_TCRB.csv
Fixed and saved: 03855000011667_TCRB.csv
Fixed and saved: 03855000011671_TCRB.csv
Fixed and saved: 03855000011677_TCRB.csv
Fixed and saved: 03855000011688_TCRB.csv
Fixed and saved: 03855000011696_TCRB.csv
Fixed and saved: 03855000011704_TCRB.csv
Fixed and saved: 03855000011709_TCRB.csv
Fixed and saved: 03855000011730_TCRB.csv
Fixed and saved:

In [3]:
import numpy as np
import pandas as pd
import os
import torch
from torch_cluster import radius
import time
from scipy.spatial.distance import pdist, squareform

In [4]:
METRIC = 'cosine'
PERCENTILES = [5, 15, 25, 35, 50, 70, 80, 90, 95]
OUTPUT_FILE = "/home/dsi/orrbavly/GNN_project/embeddings/ovarian_percentiles/perc_ball_cos_3.json"
EMBEDDINGS_FOLDER = "/dsi/sbm/OrrBavly/ovarian_data/embeddings/"

# Radius-Based Methods

In [5]:
def analyze_ball(file_path):
    df = pd.read_csv(file_path)
    df = pd.read_csv(file_path)
    embeddings = df.iloc[:, 1:].values.astype('float32')
    
    # Compute pairwise distances
    # Compute distances
    distances = pdist(embeddings, metric='cosine')
    if np.isnan(distances).any():
        print(f"Skipping file {file_path}: NaN values in distances.")
        return None

    # Calculate statistics
    min_dist = np.min(distances)
    max_dist = np.max(distances)
    mean_dist = np.mean(distances)
    percentiles = np.percentile(distances, [25, 50, 75, 90, 95, 99])

    print(f"min dits: {min_dist}\nmaxdist: {max_dist}\nmean dist: {mean_dist}\npercentiles: {percentiles}")

# Create json ouput file of percentiles
def run_percentiles():
    embeddings_folder = EMBEDDINGS_FOLDER
    output_file = OUTPUT_FILE
    files = os.listdir(embeddings_folder)
    all_results = {}
    i = 1
    all_statistics = []

    print(f"Started working on Metric: {METRIC}\nPercentile: {PERCENTILES}")
    for file in files:
        # Construct full file path
        file_path = os.path.join(embeddings_folder, file)
        # Check if the file is a CSV
        if file.endswith('.csv') and 'fp' not in file:
            print(f"working on file:{file}, number {i}")
            stats = analyze_ball(file_path)
            if stats is not None:  # Only include valid results
                all_statistics.append(stats)
            print(f"finished working on file:{file}, number {i}")
            i+=1
        # Aggregate statistics across all graphs
    
    aggregated_stats = {
        "avg_min_dist": np.mean([s["min_dist"] for s in all_statistics]),
        "avg_max_dist": np.mean([s["max_dist"] for s in all_statistics]),
        "avg_mean_dist": np.mean([s["mean_dist"] for s in all_statistics]),
        "avg_percentiles": np.mean([s["percentiles"] for s in all_statistics], axis=0)
    }

    print("Aggregated Statistics:")
    print(f"Average Min Distance: {aggregated_stats['avg_min_dist']}")
    print(f"Average Max Distance: {aggregated_stats['avg_max_dist']}")
    print(f"Average Mean Distance: {aggregated_stats['avg_mean_dist']}")
    print(f"Average Percentiles: {aggregated_stats['avg_percentiles']}")
    avg_percentiles = aggregated_stats["avg_percentiles"]
    r_values = avg_percentiles[[1, 2, 3, 4]]  # Median, 75th, 90th, and 95th percentiles
    print(f"Suggested Radius Values: {r_values}")


In [None]:
run_percentiles()

Started working on Metric: cosine
Percentile: [5, 15, 25, 35, 50, 70, 80, 90, 95]
working on file:12_nd_A_B_H.csv, number 1


In [7]:
import pandas as pd
import numpy as np
import os 

def analyze_file_for_nan(file_path):
    print(f"Analyzing file: {file_path}")
    
    # Load the file
    df = pd.read_csv(file_path)
    print(f"Data shape: {df.shape}")
    
    # Check for missing or invalid values in the entire dataframe
    if df.isnull().any().any():
        print("Missing values detected in the dataframe.")
        print("Columns with NaN values:")
        print(df.isnull().sum())
    else:
        print("No missing values detected in the dataframe.")

    # Extract embeddings
    embeddings = df.iloc[:, 1:].values.astype('float32')
    
    # Check for all-zero rows
    zero_rows = np.all(embeddings == 0, axis=1)
    print(f"Number of all-zero rows: {np.sum(zero_rows)}")
    
    # Check for rows with NaN or Inf values
    nan_rows = np.isnan(embeddings).any(axis=1)
    inf_rows = np.isinf(embeddings).any(axis=1)
    print(f"Number of rows with NaN values: {np.sum(nan_rows)}")
    print(f"Number of rows with Inf values: {np.sum(inf_rows)}")
    
    # If problematic rows exist, log their indices
    if np.sum(zero_rows) > 0:
        print("Indices of all-zero rows:")
        print(np.where(zero_rows)[0])
    if np.sum(nan_rows) > 0:
        print("Indices of rows with NaN values:")
        print(np.where(nan_rows)[0])
    if np.sum(inf_rows) > 0:
        print("Indices of rows with Inf values:")
        print(np.where(inf_rows)[0])
    
    # Check for duplicate rows
    unique_embeddings = np.unique(embeddings, axis=0)
    num_duplicates = embeddings.shape[0] - unique_embeddings.shape[0]
    print(f"Number of duplicate rows: {num_duplicates}")
    
    # Analyze distances if data is valid
    if not nan_rows.any() and not inf_rows.any() and embeddings.shape[0] > 1:
        from scipy.spatial.distance import pdist
        distances = pdist(embeddings, metric='cosine')
        print(f"Min Distance: {np.min(distances)}")
        print(f"Max Distance: {np.max(distances)}")
        print(f"Mean Distance: {np.mean(distances)}")
        print(f"Percentiles: {np.percentile(distances, [25, 50, 75, 90, 95, 99])}")
    else:
        print("Data contains issues preventing distance analysis.")


folder = "/dsi/sbm/OrrBavly/ovarian_data/embeddings/"
files = os.listdir(folder)
for file in files:
    if 'fp' in file:
        file_path = os.path.join(folder, file)
        # Replace with the problematic file's path
        # file_path = "/dsi/sbm/OrrBavly/ovarian_data/embeddings/34_A_B_OC_fp.csv"
        analyze_file_for_nan(file_path)

Analyzing file: /dsi/sbm/OrrBavly/ovarian_data/embeddings/34_A_B_OC_fp.csv
Data shape: (8769, 769)
Missing values detected in the dataframe.
Columns with NaN values:
Sequences    0
0            0
1            0
2            0
3            0
            ..
763          1
764          1
765          1
766          1
767          1
Length: 769, dtype: int64
Number of all-zero rows: 0
Number of rows with NaN values: 1
Number of rows with Inf values: 0
Indices of rows with NaN values:
[8768]
Number of duplicate rows: 0
Data contains issues preventing distance analysis.
Analyzing file: /dsi/sbm/OrrBavly/ovarian_data/embeddings/31_A_B_OC_fp.csv
Data shape: (15937, 769)
No missing values detected in the dataframe.
Number of all-zero rows: 0
Number of rows with NaN values: 0
Number of rows with Inf values: 0
Number of duplicate rows: 0
Min Distance: 1.052722742311829e-06
Max Distance: 0.925080112185596
Mean Distance: 0.526754336038469
Percentiles: [0.45885421 0.52781019 0.60209841 0.64812525 0.

In [5]:
import pandas as pd
import numpy as np
from torch_cluster import radius
import torch
from scipy.spatial.distance import pdist, squareform


def analyze_with_pdist(file_path, r_values):
    """
    Analyzes graph statistics for different radius values using pdist for cosine distance.
    
    Args:
        embeddings (np.ndarray): Node embeddings (num_nodes x num_features).
        r_values (list): List of radius (r) values to analyze.
    
    Returns:
        dict: Statistics for each radius value.
    """
    df = pd.read_csv(file_path)
    embeddings = df.iloc[:, 1:].values.astype('float32')
    # Compute cosine distances using pdist
    distances = pdist(embeddings, metric='cosine')  # Pairwise cosine distances
    distance_matrix = squareform(distances)  # Convert to square matrix for easy indexing
    num_nodes = embeddings.shape[0]

    # Store results for each radius
    results = {}

    # Analyze each radius value
    for r in r_values:
        print(f"Analyzing radius: {r}")

        # Create adjacency matrix based on radius
        adj_matrix = (distance_matrix <= r).astype(int)

        # Calculate node degrees
        degrees = adj_matrix.sum(axis=1)

        # Graph statistics
        avg_degree = degrees.mean()
        max_degree = degrees.max()
        min_degree = degrees.min()
        sparsity = 1 - (degrees.sum() / (num_nodes * num_nodes))  # Fraction of zero entries in adjacency matrix

        # Save stats for this radius
        results[r] = {
            "num_nodes": num_nodes,
            "avg_degree": avg_degree,
            "max_degree": max_degree,
            "min_degree": min_degree,
            "sparsity": sparsity
        }
        print(f"  Avg Degree: {avg_degree}, Max Degree: {max_degree}, Min Degree: {min_degree}, Sparsity: {sparsity:.4f}")

    return results


def analyze_r_values(file_path, r_values):
    """
    Analyzes the impact of different radius values (r) on graph statistics.
    
    Args:
        file_path (str): Path to the CSV file containing embeddings.
        r_values (list): List of radius values to analyze.

    Returns:
        dict: Statistics for each radius value.
    """
    # Load embeddings from file
    df = pd.read_csv(file_path)
    embeddings = df.iloc[:, 1:].values.astype('float32')
    num_nodes = embeddings.shape[0]

    # Results dictionary
    results = {}

    # Analyze each radius value
    for r in r_values:
        print(f"Analyzing radius: {r}")

        # Compute ball-query-based neighbors
        edge_index = radius(torch.tensor(embeddings), torch.tensor(embeddings), r)
        
        # Calculate node degrees
        degrees = np.bincount(edge_index[0].numpy(), minlength=num_nodes)

        # Graph statistics
        avg_degree = degrees.mean()
        max_degree = degrees.max()
        min_degree = degrees.min()
        sparsity = 1 - (degrees.sum() / (num_nodes * num_nodes))  # Fraction of zero entries in adjacency matrix

        # Save stats for this radius
        results[r] = {
            "num_nodes": num_nodes,
            "avg_degree": avg_degree,
            "max_degree": max_degree,
            "min_degree": min_degree,
            "sparsity": sparsity
        }
        print(f"  Avg Degree: {avg_degree}, Max Degree: {max_degree}, Min Degree: {min_degree}, Sparsity: {sparsity:.4f}")

    return results



In [None]:

# Example usage
file_path = "/dsi/sbm/OrrBavly/ovarian_data/embeddings/1_A_B_H.csv"
r_values = [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0]  # Example radius values
stats = analyze_r_values(file_path, r_values)

# Print summarized results
print("\nSummary of Radius Impact:")
for r, metrics in stats.items():
    print(f"Radius {r}: {metrics}")

In [6]:
# Example usage
file_path = "/dsi/sbm/OrrBavly/ovarian_data/embeddings/1_A_B_H.csv"
r_values = [0.45, 0.5, 0.53, 0.6, 0.65, 0.68, 0.72]  # Example radius values
stats = analyze_with_pdist(file_path, r_values)

# Print summarized results
print("\nSummary of Radius Impact:")
for r, metrics in stats.items():
    print(f"Radius {r}: {metrics}")

Analyzing radius: 0.45
  Avg Degree: 3192.161280780892, Max Degree: 8908, Min Degree: 2, Sparsity: 0.7639
Analyzing radius: 0.5
  Avg Degree: 5966.772535679953, Max Degree: 10613, Min Degree: 3, Sparsity: 0.5588
Analyzing radius: 0.53
  Avg Degree: 7465.379501589884, Max Degree: 11495, Min Degree: 3, Sparsity: 0.4479
Analyzing radius: 0.6
  Avg Degree: 10476.904163277379, Max Degree: 13185, Min Degree: 43, Sparsity: 0.2253
Analyzing radius: 0.65
  Avg Degree: 12407.352288693337, Max Degree: 13494, Min Degree: 300, Sparsity: 0.0825
Analyzing radius: 0.68
  Avg Degree: 13079.618353915552, Max Degree: 13519, Min Degree: 849, Sparsity: 0.0328
Analyzing radius: 0.72
  Avg Degree: 13434.289210973897, Max Degree: 13523, Min Degree: 2933, Sparsity: 0.0066

Summary of Radius Impact:
Radius 0.45: {'num_nodes': 13523, 'avg_degree': np.float64(3192.161280780892), 'max_degree': np.int64(8908), 'min_degree': np.int64(2), 'sparsity': np.float64(0.7639457752879619)}
Radius 0.5: {'num_nodes': 13523, 'a

## pdist

In [7]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform
import torch

def analyze_embedding_file(file_path):
    """
    Analyzes the embeddings in a file to check value ranges, distance distribution,
    and the effect of normalization on cosine distances.

    Args:
        file_path (str): Path to the file containing embeddings.

    Returns:
        None
    """
    # Load embeddings
    print(f"Loading embeddings from {file_path}...")
    df = pd.read_csv(file_path)
    embeddings = df.iloc[:, 1:].values.astype('float32')  # Assuming first column is non-embedding (e.g., ID)
    print(f"Shape of embeddings: {embeddings.shape}")

    # Check embedding value range
    print("\n--- Embedding Value Range ---")
    min_val = np.min(embeddings)
    max_val = np.max(embeddings)
    mean_val = np.mean(embeddings)
    print(f"Min Value: {min_val}, Max Value: {max_val}, Mean Value: {mean_val}")

    # Compute distance distribution with SciPy (cosine distance)
    print("\n--- Distance Distribution (Original Embeddings) ---")
    distances = pdist(embeddings, metric='cosine')
    print("Percentiles (Cosine Distance):", np.percentile(distances, [25, 50, 75, 90, 95]))

    # Normalize embeddings
    print("\n--- Normalizing Embeddings ---")
    normalized_embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    normalized_distances = pdist(normalized_embeddings, metric='cosine')
    print("Percentiles (Normalized Cosine Distance):", np.percentile(normalized_distances, [25, 50, 75, 90, 95]))

    # Analyze normalized embeddings with torch
    embeddings_torch = torch.tensor(embeddings, dtype=torch.float32)
    normalized_torch = torch.nn.functional.normalize(embeddings_torch, p=2, dim=1)

    # Compute cosine similarity and distances in torch
    similarity_matrix = torch.matmul(normalized_torch, normalized_torch.T)
    distance_matrix = 1 - similarity_matrix
    print("\n--- Distance Distribution (Torch, Normalized) ---")
    print("Percentiles (Torch Distance):", np.percentile(distance_matrix.numpy(), [25, 50, 75, 90, 95]))

    # Test ball query with torch_cluster.radius
    from torch_cluster import radius

    print("\n--- Ball Query Results (Torch) ---")
    r_values = [0.1, 0.2, 0.3, 0.4, 0.5]  # Adjust as needed
    num_nodes = embeddings.shape[0]
    for r in r_values:
        edge_index = radius(normalized_torch, normalized_torch, r)
        degrees = np.bincount(edge_index[0].numpy(), minlength=num_nodes)
        avg_degree = degrees.mean()
        print(f"Radius {r}: Avg Degree = {avg_degree}, Max Degree = {degrees.max()}, Min Degree = {degrees.min()}")

if __name__ == "__main__":
    # Replace with your file path
    file_path = "/dsi/sbm/OrrBavly/ovarian_data/embeddings/1_A_B_H.csv"
    analyze_embedding_file(file_path)


Loading embeddings from /dsi/sbm/OrrBavly/ovarian_data/embeddings/1_A_B_H.csv...
Shape of embeddings: (13523, 768)

--- Embedding Value Range ---
Min Value: -4.329875946044922, Max Value: 2.9154725074768066, Mean Value: -0.0006522214389406145

--- Distance Distribution (Original Embeddings) ---
Percentiles (Cosine Distance): [0.45366544 0.51525968 0.59233865 0.64246004 0.66729372]

--- Normalizing Embeddings ---
Percentiles (Normalized Cosine Distance): [0.45366544 0.51525969 0.59233865 0.64246004 0.66729372]

--- Distance Distribution (Torch, Normalized) ---
Percentiles (Torch Distance): [0.45365155 0.51524979 0.59233284 0.64245695 0.66729134]

--- Ball Query Results (Torch) ---
Radius 0.1: Avg Degree = 1.000147896176884, Max Degree = 2, Min Degree = 1
Radius 0.2: Avg Degree = 1.0023663388301411, Max Degree = 2, Min Degree = 1
Radius 0.3: Avg Degree = 1.0735043999112623, Max Degree = 10, Min Degree = 1
Radius 0.4: Avg Degree = 1.6759594764475338, Max Degree = 32, Min Degree = 1
Radius

## torch_cluster radius

In [1]:
import pandas as pd
import numpy as np
import torch
from torch_cluster import radius
import os
import scipy.sparse as sp

def normalize_embeddings(embeddings):
    """
    Normalize embeddings to unit vectors (float64 for precision).
    """
    embeddings = embeddings.astype(np.float64)
    return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

def create_sparse_adjacency_matrix(embeddings, r):
    """
    Create a sparse adjacency matrix using ball queries based on cosine distance.
    
    Args:
        embeddings (np.ndarray): Normalized embeddings (num_nodes x num_features).
        r (float): Radius for ball query.

    Returns:
        scipy.sparse.csr_matrix: Sparse adjacency matrix.
    """
    num_nodes = embeddings.shape[0]

    # Convert to float64 tensor for precision
    embeddings_torch = torch.tensor(embeddings, dtype=torch.float64)

    # Perform ball query (returns edge indices)
    edge_index = radius(embeddings_torch, embeddings_torch, r)

    # Convert edge indices to NumPy
    row_indices = edge_index[0].numpy()
    col_indices = edge_index[1].numpy()
    values = np.ones_like(row_indices, dtype=np.float64)  # All edges = 1

    # Create sparse adjacency matrix
    adj_matrix = sp.csr_matrix((values, (row_indices, col_indices)), shape=(num_nodes, num_nodes))
    
    return adj_matrix

def analyze_files(embedding_folder, r_values, batch_size=5000):
    """
    Process multiple embedding files, generate sparse adjacency matrices, and compute graph statistics.

    Args:
        embedding_folder (str): Folder containing CSV embedding files.
        r_values (list): List of radius values to test.
        output_folder (str): Folder to save adjacency matrices and results.
        batch_size (int): Optional batch size for processing large embeddings.

    Returns:
        None
    """
    files = [f for f in os.listdir(embedding_folder) if f.endswith('.csv')]
    files_to_analyze = ["12_nd_A_B_H.csv", "23_A_B_H.csv", "22_nd_A_B_H.csv", "11_nd_A_B_OC.csv",
                         "27_A_B_H.csv", "6_A_B_H.csv", "19_A_B_OC.csv"]
    results = []
    counter = 0 
    for file in files:
        if "fp" not in file and os.path.basename(file) in files_to_analyze:
            print(f"Processing file: {file}")
            file_path = os.path.join(embedding_folder, file)
            # Load and normalize embeddings
            df = pd.read_csv(file_path)
            embeddings = df.iloc[:, 1:].values.astype(np.float64)
            normalized_embeddings = normalize_embeddings(embeddings)
            num_nodes = normalized_embeddings.shape[0]
            for r in r_values:
                print(f"  Analyzing radius: {r}")

                # Create sparse adjacency matrix
                adj_matrix = create_sparse_adjacency_matrix(normalized_embeddings, r)

                # Calculate graph statistics
                degrees = np.array(adj_matrix.sum(axis=1)).flatten()
                avg_degree = np.mean(degrees)
                max_degree = np.max(degrees)
                min_degree = np.min(degrees)
                sparsity = 1 - (degrees.sum() / (num_nodes * num_nodes))
                results.append({
                    "file": file,
                    "radius": r,
                    "num_nodes": num_nodes,
                    "avg_degree": avg_degree,
                    "max_degree": max_degree,
                    "min_degree": min_degree,
                    "sparsity": sparsity
                })

                print(f"    Avg Degree: {avg_degree}, Max Degree: {max_degree}, Min Degree: {min_degree}, Sparsity: {sparsity:.4f}")


if __name__ == "__main__":
    EMBEDDING_FOLDER = "/dsi/sbm/OrrBavly/ovarian_data/embeddings/" 
    R_VALUES = [0.45, 0.5, 0.53, 0.6, 0.65, 0.68, 0.72]  # Radius values to test

    analyze_files(EMBEDDING_FOLDER, R_VALUES)


Processing file: 12_nd_A_B_H.csv
  Analyzing radius: 0.45
    Avg Degree: 3.126079776422764, Max Degree: 32.0, Min Degree: 1.0, Sparsity: 0.9998
  Analyzing radius: 0.5
    Avg Degree: 4.8993267276422765, Max Degree: 32.0, Min Degree: 1.0, Sparsity: 0.9997
  Analyzing radius: 0.53
    Avg Degree: 6.106961382113822, Max Degree: 32.0, Min Degree: 1.0, Sparsity: 0.9996
  Analyzing radius: 0.6
    Avg Degree: 10.357533028455284, Max Degree: 32.0, Min Degree: 1.0, Sparsity: 0.9993
  Analyzing radius: 0.65
    Avg Degree: 16.788363821138212, Max Degree: 32.0, Min Degree: 1.0, Sparsity: 0.9989
  Analyzing radius: 0.68
    Avg Degree: 21.848259654471544, Max Degree: 32.0, Min Degree: 1.0, Sparsity: 0.9986
  Analyzing radius: 0.72
    Avg Degree: 27.787093495934958, Max Degree: 32.0, Min Degree: 1.0, Sparsity: 0.9982
Processing file: 23_A_B_H.csv
  Analyzing radius: 0.45
    Avg Degree: 6.4153426606592365, Max Degree: 32.0, Min Degree: 1.0, Sparsity: 0.9996
  Analyzing radius: 0.5
    Avg Degre

## Radius with cdist

# Instance Hardness

In [None]:
import pandas as pd
import numpy as np
import json



In [20]:
input_json = "/home/dsi/orrbavly/GNN_project/embeddings/ovarian_percentiles/percentiles_results_cos_3_all.json"
data_type = "ovarian"

In [21]:
def load_results(file_path, data_type='ovarian'):
    # load json file
    with open(file_path, 'r') as f:
        all_results = json.load(f)
    # create labels dictionary
    labels_dict = {}
    for sample_name, percentile_dict in all_results.items():
        if data_type == 'ovarian':
            if sample_name.endswith("_H"):
                labels_dict[sample_name] = 0
            elif sample_name.endswith("_OC"):
                labels_dict[sample_name] = 1
            else:
                raise Exception("Error - invalid sample type")
        elif data_type == 'colon':
            if sample_name.endswith("_low"):
                labels_dict[sample_name] = 0
            elif sample_name.endswith("_high"):
                labels_dict[sample_name] = 1
            else:
                raise Exception("Error - invalid sample type")
        elif data_type == 'kidney':
            if "STA" in sample_name:
                labels_dict[sample_name] = 0
            elif "AR" in sample_name:
                labels_dict[sample_name] = 1
            else:
                raise Exception("Error - invalid sample type")
    return all_results, labels_dict


def prepare_data(percentiles_data, labels_dict, vector_indices=None, average_vectors=False):
    data = []
    labels = []

    if average_vectors:
        # Calculate average vector from all k values
        for sample_name, percentiles_dict in percentiles_data.items():
            vectors = np.array(list(percentiles_dict.values()))
            avg_vector = np.mean(vectors, axis=0)
            data.append(avg_vector)
            labels.append(labels_dict[sample_name])
    else:
        max_length = max(len(np.concatenate(list(percentiles_dict.values()))) for percentiles_dict in percentiles_data.values())
        for sample_name, percentiles_dict in percentiles_data.items():
            vectors = list(percentiles_dict.values())
            if vector_indices is not None:
                selected_vectors = [vectors[i] for i in vector_indices if i < len(vectors)]
                flattened_percentiles = np.concatenate(selected_vectors)
            else:
                flattened_percentiles = np.concatenate(vectors)
            ## TODO: find a way to padd all data types, not only "all_k". code was tab to the left
            padded_percentiles = np.pad(flattened_percentiles, (0, max_length - len(flattened_percentiles)), 'constant') 
            data.append(padded_percentiles)
            labels.append(labels_dict[sample_name])

    data = np.array(data)
    labels = np.array(labels)
    return data, labels

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
import xgboost as xgb


def calculate_instance_hardness(X, y, models=None, cv=5):
    """
    Calculate instance hardness based on multiple classifiers' misclassification rates.
    
    Parameters:
    - X: Feature matrix
    - y: Target labels
    - models: List of classifiers to use for instance hardness calculation
    - cv: Number of cross-validation folds
    
    Returns:
    - hardness_scores: Dictionary with sample indices and their instance hardness scores
    """
    if models is None:
        models = [
            RandomForestClassifier(n_estimators=250, random_state=42),
            xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 
            SVC(probability=True, random_state=42)
            ]
    
    # Store misclassifications for each model
    misclassifications = np.zeros(len(y))
    
    for model in models:
        y_pred = cross_val_predict(model, X, y, cv=cv)
        misclassifications += (y_pred != y)
    
    # Calculate the instance hardness score as the proportion of misclassifications
    hardness_scores = misclassifications / len(models)
    
    return dict(enumerate(hardness_scores))

def undersample_using_ih(X, y, sample_names, ih_scores, threshold=0.5):
    """
    Undersample the majority class using an instance hardness threshold.
    
    Parameters:
    - X: Feature matrix
    - y: Target labels
    - sample_names: Names of the samples (for tracking)
    - ih_scores: Dictionary of instance hardness scores
    - threshold: IH threshold for undersampling
    
    Returns:
    - X_resampled, y_resampled, resampled_sample_names: The undersampled feature matrix, labels, and sample names
    """
    minority_class = 1 if np.sum(y == 1) < np.sum(y == 0) else 0
    majority_class = 1 - minority_class
    
    # Split indices by class
    minority_indices = [i for i, label in enumerate(y) if label == minority_class]
    majority_indices = [i for i, label in enumerate(y) if label == majority_class]
    
    # Apply the IH threshold to filter majority class samples
    filtered_majority_indices = [i for i in majority_indices if ih_scores[i] < threshold]
    
    # Combine all minority samples with selected majority samples
    selected_indices = minority_indices + filtered_majority_indices
    
    X_resampled = X[selected_indices]
    y_resampled = y[selected_indices]
    resampled_sample_names = [sample_names[i] for i in selected_indices]
    
    return X_resampled, y_resampled, resampled_sample_names


In [22]:
all_results, labels_dict = load_results(input_json, data_type=data_type)
X, y = prepare_data(all_results, labels_dict)

In [None]:
# Calculate instance hardness scores
ih_scores = calculate_instance_hardness(X, y)

# Apply IH threshold undersampling
X_resampled, y_resampled, resampled_sample_names = undersample_using_ih(X, y, list(all_results.keys()), ih_scores, threshold=0.5)

# Output the resampled dataset size
print(f"Original dataset size: {len(y)}")
print(f"Resampled dataset size: {len(y_resampled)}")