In [42]:
import pandas as pd
from itertools import combinations
from src.foldseek import Foldseek

#### Specify the paths

In [27]:
# Directory of structure files; passed as `input_file` to foldseek.createdb.
structure_dir = "../data/structures/trimmed_4units"
# Foldseek database built from `structure_dir`; used as both query and target.
structure_db = "../data/databases/4units_db/db"
# Foldseek alignment database written by foldseek.search.
alignment_db = "../data/databases/alignment_db/db"
# Tab-separated alignment table written by foldseek.createtsv and read back with pandas.
alignment_df_path = "../data/dataframes/alignment_df.tsv"
# Destination CSV for the final pairwise distance matrix.
alignment_matrix_path = "../data/dataframes/alignment_matrix.csv"
# Scratch directory handed to foldseek.search.
# NOTE(review): hard-coded "/tmp" is POSIX-specific — confirm this is intended.
temp_dir = "/tmp"

#### Initialize Foldseek

In [5]:
# Instantiate the project's Foldseek wrapper (src.foldseek) with no arguments.
# NOTE(review): presumably this relies on a `foldseek` executable being
# discoverable by the wrapper — confirm against src/foldseek.py.
foldseek = Foldseek()

#### Create Foldseek DB from structure directory

In [None]:
# Build the structure database from every file under `structure_dir`.
foldseek.createdb(
    input_file=structure_dir,
    output_file=structure_db,
)

#### Search the database against itself in an exhaustive way

In [None]:
# All-vs-all search: the structure database is both the query and the target.
# NOTE(review): `exhaustive`, `align_type` and `e` are passed as strings and are
# presumably forwarded to the foldseek CLI flags of the same meaning — confirm
# against the wrapper in src/foldseek.py.
foldseek.search(
    query_db=structure_db,
    target_db=structure_db,
    alignment_db=alignment_db,
    temp_dir=temp_dir,
    exhaustive="1",
    align_type="1",
    e="inf",
)

#### Convert the alignment output to a dataframe

In [None]:
# Export the alignment database to a tab-separated file at `alignment_df_path`.
foldseek.createtsv(
    query_db=structure_db,
    target_db=structure_db,
    alignment_db=alignment_db,
    output_path=alignment_df_path,
)

#### Load and parse the dataframe

In [29]:
# The exported table is read without a header row, so pandas labels the
# columns with integers (0, 1, 2, ...).
alignment_df = pd.read_csv(
    alignment_df_path,
    sep="\t",
    header=None,
)
alignment_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1czdA_1001_1228.pdb,1czdA_1001_1228.pdb,0,1.000,1.0000,0,227,228,0,227,228
1,1czdA_1001_1228.pdb,1b77A_1_228.pdb,98,0.789,0.9896,0,227,228,0,227,228
2,1czdA_1001_1228.pdb,1vpkA_1_361.pdb,72,0.063,0.7152,0,219,228,2,234,235
3,1czdA_1001_1228.pdb,6t8hE_2_247.pdb,73,0.073,0.7127,0,222,228,3,245,246
4,1czdA_1001_1228.pdb,3g65A_1_273.pdb,74,0.089,0.7125,0,223,228,2,248,254
...,...,...,...,...,...,...,...,...,...,...,...
3964,6ap4A_0_381.pdb,1ud9A_1_240.pdb,65,0.079,0.6700,3,246,249,0,236,237
3965,6ap4A_0_381.pdb,3aixB_3_244.pdb,65,0.078,0.6651,3,246,249,0,241,242
3966,6ap4A_0_381.pdb,3hslX_4_300.pdb,69,0.059,0.6560,0,248,249,3,285,287
3967,6ap4A_0_381.pdb,2z0lA_1_299.pdb,69,0.066,0.6461,0,248,249,3,295,299


In [30]:
# Keep only positional columns 0, 1 and 4 and give them descriptive names.
# NOTE(review): this rebinds `alignment_df`, so re-running this cell without
# reloading the TSV fails (only three columns remain, position 4 is gone).
alignment_df = alignment_df.iloc[:, [0, 1, 4]].set_axis(
    ["query", "target", "tmscore"],
    axis=1,
)
alignment_df

Unnamed: 0,query,target,tmscore
0,1czdA_1001_1228.pdb,1czdA_1001_1228.pdb,1.0000
1,1czdA_1001_1228.pdb,1b77A_1_228.pdb,0.9896
2,1czdA_1001_1228.pdb,1vpkA_1_361.pdb,0.7152
3,1czdA_1001_1228.pdb,6t8hE_2_247.pdb,0.7127
4,1czdA_1001_1228.pdb,3g65A_1_273.pdb,0.7125
...,...,...,...
3964,6ap4A_0_381.pdb,1ud9A_1_240.pdb,0.6700
3965,6ap4A_0_381.pdb,3aixB_3_244.pdb,0.6651
3966,6ap4A_0_381.pdb,3hslX_4_300.pdb,0.6560
3967,6ap4A_0_381.pdb,2z0lA_1_299.pdb,0.6461


In [31]:
# Enumerate each unordered pair of distinct query structures exactly once,
# in the order the structures first appear in the `query` column.
unique_queries = alignment_df["query"].unique()
combinations_list = list(combinations(unique_queries, 2))
len(combinations_list)

1953

In [32]:
# Keep only rows whose (query, target) pair is one of the unordered pairs in
# `combinations_list`; this drops self-hits and the mirrored duplicate of
# every pair.
#
# A set gives O(1) membership tests (the list version rescans every pair for
# every row), and zipping the two columns avoids a row-wise `apply`.
pair_lookup = set(combinations_list)
is_selected_pair = [
    pair in pair_lookup
    for pair in zip(alignment_df["query"], alignment_df["target"])
]

# `.copy()` makes `filtered_df` an independent frame, so later column
# assignments on it do not raise SettingWithCopyWarning.
filtered_df = alignment_df[is_selected_pair].copy()
filtered_df

Unnamed: 0,query,target,tmscore
1,1czdA_1001_1228.pdb,1b77A_1_228.pdb,0.9896
2,1czdA_1001_1228.pdb,1vpkA_1_361.pdb,0.7152
3,1czdA_1001_1228.pdb,6t8hE_2_247.pdb,0.7127
4,1czdA_1001_1228.pdb,3g65A_1_273.pdb,0.7125
5,1czdA_1001_1228.pdb,6dj8A_0_385.pdb,0.7106
...,...,...,...
3766,1ud9A_1_240.pdb,4tr6A_1_380.pdb,0.6809
3770,1ud9A_1_240.pdb,6ap4A_0_381.pdb,0.6700
3826,3a1jC_17_271.pdb,6ap4A_0_381.pdb,0.7449
3834,3a1jC_17_271.pdb,4tr6A_1_380.pdb,0.7381


In [None]:
# Strip the ".pdb" extension from both identifier columns.
#
# `regex=False` makes the match literal: with regex matching (the default
# before pandas 2.0) the "." matches any character, so an identifier that
# contains "pdb" elsewhere could lose characters.
# `assign` returns a new frame instead of writing into a filtered slice,
# which avoids SettingWithCopyWarning.
filtered_df = filtered_df.assign(
    query=filtered_df["query"].str.replace(".pdb", "", regex=False),
    target=filtered_df["target"].str.replace(".pdb", "", regex=False),
)

#### Create a distance matrix from alignment results

In [34]:
# Pivot the pair list into a matrix: columns are queries, rows are targets,
# cells hold the TM-score of that (query, target) pair.
alignment_matrix = filtered_df.pivot_table(columns="query", index="target", values="tmscore")

# Each pair is present in one orientation only, so some structures appear on
# just one axis. Reindex both axes to the sorted union of labels; this works
# for any number of missing labels (including none) and keeps the frame
# numeric instead of inserting "None" string placeholders.
labels = alignment_matrix.index.union(alignment_matrix.columns).sort_values()
alignment_matrix = alignment_matrix.reindex(index=labels, columns=labels)

# Mirror the known scores: wherever a cell is missing, take the value from
# the transposed position. Cells missing in both orientations stay NaN.
alignment_matrix = alignment_matrix.where(alignment_matrix.notna(), alignment_matrix.T)

# A structure compared with itself has TM-score 1.0.
for label in labels:
    alignment_matrix.at[label, label] = 1.0

# Restore the axis names produced by the pivot (the union of differently
# named axes drops them); the index name becomes the CSV's first header cell.
alignment_matrix = alignment_matrix.rename_axis(index="target", columns="query")

# Convert similarity to distance: distance = 1 - TM-score, rounded to 4 places.
alignment_matrix = alignment_matrix.astype(float)
alignment_matrix = (1 - alignment_matrix).round(4)
alignment_matrix

query,1b77A_1_228,1czdA_1001_1228,1dmlA_29_319,1iz4A_2_244,1mmiB_1_363,1plqA_1_250,1rxmA_1_238,1ud9A_1_240,1vpkA_1_361,2avtA_1_377,...,6dj8A_0_385,6dlkA_0_379,6manA_0_379,6qh1A_1_253,6t8hE_2_247,7bupA_1_255,7ep8A_1_256,7evpA_3_371,7o1eA_2_251,7rzmA_2_361
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1b77A_1_228,0.0000,0.0104,0.3844,0.3211,0.3048,0.3142,0.3119,0.3440,0.2793,0.3298,...,0.2871,0.3038,0.3009,0.2963,0.2907,0.3027,0.3114,0.3011,0.3422,0.2959
1czdA_1001_1228,0.0104,0.0000,0.3844,0.3162,0.3030,0.3117,0.3079,0.3404,0.2848,0.3274,...,0.2894,0.3052,0.3050,0.2898,0.2873,0.2969,0.3038,0.3066,0.3343,0.2997
1dmlA_29_319,0.3844,0.3844,0.0000,0.3696,0.3397,0.3295,0.3376,0.3677,0.3695,0.3399,...,0.3462,0.3496,0.3501,0.3175,0.3272,0.3317,0.3212,0.3420,0.3378,0.3530
1iz4A_2_244,0.3211,0.3162,0.3696,0.0000,0.2747,0.1381,0.1122,0.0808,0.2535,0.3332,...,0.3265,0.3056,0.2738,0.1068,0.0543,0.0999,0.1246,0.2935,0.1153,0.3071
1mmiB_1_363,0.3048,0.3030,0.3397,0.2747,0.0000,0.2397,0.2358,0.2974,0.1104,0.1065,...,0.1392,0.0831,0.0799,0.2442,0.2436,0.2472,0.2618,0.0859,0.2747,0.0634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7bupA_1_255,0.3027,0.2969,0.3317,0.0999,0.2472,0.0769,0.1381,0.1297,0.2228,0.2972,...,0.2954,0.2738,0.2510,0.0463,0.0847,0.0000,0.0556,0.2643,0.0608,0.2931
7ep8A_1_256,0.3114,0.3038,0.3212,0.1246,0.2618,0.0646,0.1485,0.1341,0.2445,0.3046,...,0.3084,0.2846,0.2634,0.0495,0.0930,0.0556,0.0000,0.2774,0.0395,0.3086
7evpA_3_371,0.3011,0.3066,0.3420,0.2935,0.0859,0.2448,0.2494,0.3023,0.1319,0.0681,...,0.1411,0.0966,0.0961,0.2570,0.2631,0.2643,0.2774,0.0000,0.2902,0.0877
7o1eA_2_251,0.3422,0.3343,0.3378,0.1153,0.2747,0.0707,0.1536,0.1253,0.2602,0.3179,...,0.3224,0.2973,0.2734,0.0588,0.1135,0.0608,0.0395,0.2902,0.0000,0.3216


In [40]:
# Persist the distance matrix as CSV at `alignment_matrix_path`.
alignment_matrix.to_csv(alignment_matrix_path)