In [2]:
import pandas as pd
import subprocess
import glob
import os

## Prepare for analysis

In [3]:
input = "LCB3-SARS1"
input_dataframe = pd.read_csv(f"output/{input}/opt_binders/all.csv")
input_dataframe

Unnamed: 0,score,plddt,i_ptm,i_pae,rmsd,model_path,input_pdb
0,0.834627,0.955065,0.838477,5.537342,0.833281,output/LCB3-SARS1/opt_binders/binders/929_359/...,output/LCB3-SARS1/opt_binders/binders/929_359/...
1,0.903687,0.958617,0.836827,5.580572,1.172548,output/LCB3-SARS1/opt_binders/binders/929_359/...,output/LCB3-SARS1/opt_binders/binders/929_359/...
2,0.928871,0.953105,0.835658,5.600200,0.360292,output/LCB3-SARS1/opt_binders/binders/929_359/...,output/LCB3-SARS1/opt_binders/binders/929_359/...
3,0.920041,0.958391,0.831440,5.600936,0.443436,output/LCB3-SARS1/opt_binders/binders/929_359/...,output/LCB3-SARS1/opt_binders/binders/929_359/...
4,0.799984,0.959954,0.829238,5.684133,0.883773,output/LCB3-SARS1/opt_binders/binders/929_359/...,output/LCB3-SARS1/opt_binders/binders/929_359/...
...,...,...,...,...,...,...,...
10995,0.746788,0.790371,0.093978,25.993508,6.687660,output/LCB3-SARS1/opt_binders/binders/959_52/i...,output/LCB3-SARS1/opt_binders/binders/959_52/i...
10996,0.776014,0.795970,0.093002,26.031531,12.678558,output/LCB3-SARS1/opt_binders/binders/959_52/i...,output/LCB3-SARS1/opt_binders/binders/959_52/i...
10997,0.740363,0.793123,0.088887,26.228133,26.015142,output/LCB3-SARS1/opt_binders/binders/959_52/i...,output/LCB3-SARS1/opt_binders/binders/959_52/i...
10998,0.796290,0.767311,0.089381,26.380428,10.285028,output/LCB3-SARS1/opt_binders/binders/959_52/i...,output/LCB3-SARS1/opt_binders/binders/959_52/i...


In [None]:
# Functions
# add scaffold name column
def add_scaffold_name_column(filtered, prefix):
    filtered["scaffold_name"] = ""

    for index, row in filtered.iterrows():
        path = row["model_path"]
        file_name = path.split("/")[-1]
        parts = file_name.split(prefix)[-1].split("_")

        if len(parts) >= 5:
            result = f"{parts[0]}_{parts[1]}_{parts[2]}"
        else:
            result = parts[0].split(".")[0]

        filtered.at[index, "scaffold_name"] = result

    return filtered


def repeat_rows_by_column_value(df, column_name, number):
    unique_values = df[column_name].unique()
    repeated_rows = []

    for value in unique_values:
        subset = df[df[column_name] == value]
        num_repeats = min(number, subset.shape[0])
        repeated_rows.extend([subset.iloc[i, :] for i in range(num_repeats)])

    repeated_df = pd.DataFrame(repeated_rows)
    return repeated_df


# best_binders=add_scaffold_name_column(best_binders, input+"_")

## Filter dataframe

In [4]:
filtered = input_dataframe[
    (input_dataframe["rmsd"] < 3) & (input_dataframe["plddt"] > 0.9)
]
# [(input_dataframe["plddt"]>0.7)&(input_dataframe["i_pae"]<8)&(input_dataframe["rmsd"]<3)]

filtered = filtered.sort_values(by="plddt", ascending=False).drop_duplicates(
    "model_path"
)
filtered

Unnamed: 0,score,plddt,i_ptm,i_pae,rmsd,model_path,input_pdb
10452,0.769684,0.971075,0.905478,3.465091,0.180875,output/LCB3-SARS1/opt_binders/binders/180_264/...,output/LCB3-SARS1/opt_binders/binders/180_264/...
10453,0.765836,0.971019,0.909726,3.468438,0.143909,output/LCB3-SARS1/opt_binders/binders/180_264/...,output/LCB3-SARS1/opt_binders/binders/180_264/...
10450,0.743243,0.970576,0.903926,3.454151,0.282338,output/LCB3-SARS1/opt_binders/binders/180_264/...,output/LCB3-SARS1/opt_binders/binders/180_264/...
10451,0.809074,0.970566,0.905959,3.461623,0.293870,output/LCB3-SARS1/opt_binders/binders/180_264/...,output/LCB3-SARS1/opt_binders/binders/180_264/...
9356,0.752808,0.970408,0.901445,3.525705,0.260874,output/LCB3-SARS1/opt_binders/binders/180_264/...,output/LCB3-SARS1/opt_binders/binders/180_264/...
...,...,...,...,...,...,...,...
151,0.925051,0.901284,0.775172,6.886578,0.504138,output/LCB3-SARS1/opt_binders/binders/959_52/i...,output/LCB3-SARS1/opt_binders/binders/959_52/i...
705,0.938610,0.900714,0.783513,6.690602,0.456518,output/LCB3-SARS1/opt_binders/binders/959_52/i...,output/LCB3-SARS1/opt_binders/binders/959_52/i...
709,0.855447,0.900616,0.768148,7.051422,0.367555,output/LCB3-SARS1/opt_binders/binders/959_52/i...,output/LCB3-SARS1/opt_binders/binders/959_52/i...
155,0.907393,0.900515,0.761435,7.007017,0.886129,output/LCB3-SARS1/opt_binders/binders/959_52/i...,output/LCB3-SARS1/opt_binders/binders/959_52/i...


In [None]:
filtered = add_scaffold_name_column(filtered, input + "_")
filtered

In [None]:
# Calculate statistics on scaffolds
scaffold_counts = filtered["scaffold_name"].value_counts()
total_unique_scaffolds = len(scaffold_counts)
total_scaffold_instances = scaffold_counts.sum()

print("Total unique scaffolds:", total_unique_scaffolds)
print("Total scaffold instances:", total_scaffold_instances)
print("\nScaffold counts:")
print(scaffold_counts)

In [None]:
### Check different scaffolds
# filtered=repeat_rows_by_column_value(filtered, "scaffold_name", 1)
# folder=f"/home/tsatler/RFdif/ClusterProteinDesign/scripts/binder_design/output/{input}/opt_binders/test"
# os.makedirs(folder, exist_ok=True)
# for path in filtered["model_path"]:
#     !cp $path $folder

In [None]:
### Filter by good scaffolds
# good_scaffolds = ["2_","55_","61_","54_","24_","13_","11_"]
# filtered = filtered[~filtered['scaffold_name'].isin(good_scaffolds)]
# filtered

In [None]:
### Redundant scaffolds
designs_per_scaffold = 200

filtered = repeat_rows_by_column_value(filtered, "scaffold_name", designs_per_scaffold)
filtered

## Cluster sequences

In [None]:
filtered["seq_split"] = filtered["seq"].apply(lambda x: x.split("/")[-1])
seqs = filtered["seq_split"].to_list()

num_clusters = 50

seqs = filtered["seq_split"].to_list()
# matrix = np.asarray([np.frombuffer(seq.encode(), dtype=np.uint8) for seq in seqs])
max_length = max(len(seq) for seq in seqs)
padded_seqs = [seq.ljust(max_length, "N") for seq in seqs]
matrix = np.asarray(
    [np.frombuffer(seq.encode(), dtype=np.uint8) for seq in padded_seqs]
)
clusterid, error, nfound = kcluster(matrix, nclusters=num_clusters)

# Apply t-SNE to the matrix to reduce the dimensionality and visualize the sequences.
tsne = TSNE(n_components=2, random_state=42)
embedded_matrix = tsne.fit_transform(matrix)

# Create a scatter plot of the embedded points and label them with cluster IDs.
plt.figure(figsize=(10, 6))
for cluster in range(num_clusters):
    cluster_points = embedded_matrix[clusterid == cluster]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {cluster}")

plt.title(f"t-SNE Visualization of {input} best protein sequences")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.legend()
# plt.savefig(f"output/{input}/filtered_sequences/tsne_binders.png")
plt.show()


# Print the number of sequences in each cluster.
cluster_counts = Counter(clusterid)
sorted_cluster_counts = dict(sorted(cluster_counts.items()))
for cluster, count in sorted_cluster_counts.items():
    print(f"Cluster {cluster}: {count} sequences")

# Add cluster id to dataframe
filtered["clusterid"] = clusterid
# filtered.to_csv(f"output/{input}/filtered_sequences/2_filtered_binders_clus.csv", index=False)

# Calculate average cluster metrics
average_metrics_by_cluster = filtered.groupby("clusterid").mean()
# average_metrics_by_cluster.to_csv(f"output/{input}/filtered_sequences/2_cluster_average.csv", index=False)
average_metrics_by_cluster

## Prepare metrics command

In [5]:
os.makedirs(f"output/{input}/opt_binders", exist_ok=True)

save_path = f"output/{input}/opt_binders/filtered.csv"  # Save filtered
metric_path = f"output/{input}/opt_binders/metrics.csv"  # Save filtered with metrics

# Make filtered dataframe or append new sequences to the old one
if os.path.exists(save_path):
    print("reading existant dataframe...")
    existing_dataframe = pd.read_csv(save_path)
    filtered_new = filtered[
        ~filtered["model_path"].isin(existing_dataframe["model_path"])
    ]
    print(
        f"existing dataframe of len: {len(existing_dataframe)}, new filtered: {len(filtered_new)}"
    )
    existing_dataframe = pd.concat(
        [existing_dataframe, filtered_new], ignore_index=True
    )
    print(f"final length: {len(existing_dataframe)}")
    existing_dataframe = existing_dataframe.sort_values(by="plddt", ascending=False)
    # drop duplicates
    existing_dataframe.to_csv(save_path, index=False)
    existing_dataframe.to_csv(metric_path, index=False)

else:
    filtered.to_csv(save_path, index=False)
    filtered.to_csv(metric_path, index=False)
    existing_dataframe = filtered

reading existant dataframe...
existing dataframe of len: 4436, new filtered: 0
final length: 4436


## Prepare input files for analysis script

In [6]:
save_directory = f"output/{input}/opt_binders/analysis_input"

if not os.path.exists(save_directory):
    os.makedirs(save_directory)

batch_size = 1000

# Split the model_paths into batches
model_paths = existing_dataframe["model_path"]
batches = [
    model_paths[i : i + batch_size] for i in range(0, len(model_paths), batch_size)
]

# Save each batch as a separate TXT file
for i, batch in enumerate(batches):
    save_path = os.path.join(save_directory, "model_paths_" + str(i) + ".txt")
    with open(save_path, "w") as file:
        file.write("\n".join(batch))

## Run analysis script

In [11]:
input_files = glob.glob(f"{save_directory}/*txt")
array_limit = 300 // len(input_files)
target_chain = "A"
binder_chain = "B"
xml_file = "helper_scripts/metrics_calc.xml"

commands = []

for input_file in input_files:
    with open(input_file, "r") as file:
        lines = file.readlines()
    array_number = len(lines) - 1

    bash_arguments = f"--output=/dev/null --array=0-{array_number}%{array_limit}"
    script_arguments = (
        f"{input_file} {target_chain} {binder_chain} {metric_path} {xml_file}"
    )

    command = (
        f"sbatch {bash_arguments} helper_scripts/binder_analysis.sh {script_arguments}"
    )
    print(command)
    commands.append(command)

print(f"This will run {len(commands)} array scripts")

sbatch --output=/dev/null --array=0-999%60 helper_scripts/binder_analysis.sh output/LCB3-SARS1/opt_binders/analysis_input/model_paths_2.txt A B output/LCB3-SARS1/opt_binders/metrics.csv helper_scripts/metrics_calc.xml
sbatch --output=/dev/null --array=0-999%60 helper_scripts/binder_analysis.sh output/LCB3-SARS1/opt_binders/analysis_input/model_paths_0.txt A B output/LCB3-SARS1/opt_binders/metrics.csv helper_scripts/metrics_calc.xml
sbatch --output=/dev/null --array=0-999%60 helper_scripts/binder_analysis.sh output/LCB3-SARS1/opt_binders/analysis_input/model_paths_3.txt A B output/LCB3-SARS1/opt_binders/metrics.csv helper_scripts/metrics_calc.xml
sbatch --output=/dev/null --array=0-435%60 helper_scripts/binder_analysis.sh output/LCB3-SARS1/opt_binders/analysis_input/model_paths_4.txt A B output/LCB3-SARS1/opt_binders/metrics.csv helper_scripts/metrics_calc.xml
sbatch --output=/dev/null --array=0-999%60 helper_scripts/binder_analysis.sh output/LCB3-SARS1/opt_binders/analysis_input/model_

In [9]:
# Run the array bash script
for command in commands:
    subprocess.run(command, shell=True)

Submitted batch job 180149
Submitted batch job 180150
Submitted batch job 180151
Submitted batch job 180152
Submitted batch job 180153


In [10]:
!squeue --me

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
180149_[60-999%60]       amd binder_a  tsatler PD       0:00      1 (JobArrayTaskLimit)
180150_[41-999%60]       amd binder_a  tsatler PD       0:00      1 (None)
 180151_[0-999%60]       amd binder_a  tsatler PD       0:00      1 (None)
 180152_[0-435%60]       amd binder_a  tsatler PD       0:00      1 (None)
 180153_[0-999%60]       amd binder_a  tsatler PD       0:00      1 (None)
          180149_0       amd binder_a  tsatler  R       0:03      1 compute-0-1
          180149_1       amd binder_a  tsatler  R       0:03      1 compute-0-1
          180149_2       amd binder_a  tsatler  R       0:03      1 compute-0-1
          180149_3       amd binder_a  tsatler  R       0:03      1 compute-0-1
          180149_4       amd binder_a  tsatler  R       0:03      1 compute-0-1
          180149_5       amd binder_a  tsatler  R       0:03      1 compute-0-1
          180149_6       amd binder_a  tsatler 