In [9]:
import pandas as pd
from Bio.Align import PairwiseAligner
from Bio.Align import substitution_matrices
from concurrent.futures import ThreadPoolExecutor, as_completed

In [10]:
# Load the input table from a CSV next to the notebook.
# Later cells read the 'UNIPROT_AC' and 'Sequence' columns from this frame,
# so both must be present in the file.
df = pd.read_csv('./Input_Prn_Info.csv')

In [11]:
# Build the single module-level aligner that calc_similarity() scores with.
aligner = PairwiseAligner()
aligner.mode = 'local'  # Local alignment for Smith-Waterman
aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
# NOTE(review): no gap scores are set here, so the library defaults apply --
# confirm those defaults are the penalties this analysis intends.

In [None]:
# Function to calculate normalized Smith-Waterman score
# Symmetric Normalized Score
def calc_similarity(seq1, seq2):
    """Return the self-normalised alignment score of two sequences.

    The raw score from the module-level ``aligner`` is divided by the
    geometric mean of the two self-alignment scores::

        score(seq1, seq2) / sqrt(score(seq1, seq1) * score(seq2, seq2))

    The formula is symmetric in its arguments.

    Args:
        seq1: First sequence (anything ``aligner.score`` accepts that
            also supports ``len``).
        seq2: Second sequence, same requirements as ``seq1``.

    Returns:
        float: The normalised score, or ``0.0`` when either sequence is
        empty or either self-alignment score is zero (the ratio would be
        undefined).

    Raises:
        TypeError: If a sequence has no length (e.g. a missing value read
            from the table); such inputs are not silently scored.
    """
    # Nothing can be aligned against an empty sequence. Answer directly
    # rather than relying on how the aligner treats zero-length input.
    if len(seq1) == 0 or len(seq2) == 0:
        return 0.0

    # Self scores are the normalisation terms. Check them first so the
    # degenerate case never reaches the division.
    # NOTE(review): these are recomputed on every call; an all-vs-all caller
    # could precompute one self score per sequence.
    self_score1 = aligner.score(seq1, seq1)
    self_score2 = aligner.score(seq2, seq2)
    if self_score1 == 0 or self_score2 == 0:
        return 0.0

    return aligner.score(seq1, seq2) / (self_score1 * self_score2) ** 0.5

In [13]:
# Create an empty square DataFrame whose rows and columns are both labelled
# by the 'UNIPROT_AC' column of df; every cell starts out as NaN.
# NOTE(review): process_rows() builds its own frame and the final cell rebinds
# `similarity_df` to its return value, so this pre-allocation looks redundant --
# confirm nothing reads it before then.
similarity_df = pd.DataFrame(index=df['UNIPROT_AC'], columns=df['UNIPROT_AC'])


In [14]:
# Function to process each row
def process_row(index):
    """Score one sequence of ``df`` against every sequence in ``df``.

    Args:
        index: Label in ``df.index`` identifying the query row.

    Returns:
        pandas.Series: One ``calc_similarity`` value per row of ``df``, in
        row order, indexed by the ``UNIPROT_AC`` column.
    """
    query = df.loc[index, 'Sequence']
    # Iterate the column itself: building a full row Series with df.loc[j]
    # for every pair only to read one field is pure overhead.
    scores = [calc_similarity(query, target) for target in df['Sequence']]
    return pd.Series(scores, index=df['UNIPROT_AC'])

# Function to compute every row of the similarity matrix in a thread pool
def process_rows(max_workers=18):
    """Build the all-vs-all similarity matrix for ``df``.

    One ``process_row`` task is submitted per row of ``df``; each finished
    row is written into the result frame as it completes.

    Args:
        max_workers: Size of the thread pool (default 18, the value that
            was previously hard-coded).

    Returns:
        pandas.DataFrame: Square float frame with the ``UNIPROT_AC`` column
        as both index and columns. Rows whose task raised are left as NaN,
        and the failure is printed rather than re-raised.

    NOTE(review): whether threads give a real speed-up depends on whether the
    scoring call releases the GIL -- confirm before tuning ``max_workers``.
    """
    accessions = df['UNIPROT_AC']
    # dtype=float keeps the matrix numeric; without it the NaN-filled frame is
    # object dtype and stays that way after the rows are filled in.
    similarity_df = pd.DataFrame(index=accessions, columns=accessions, dtype=float)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # process_row() takes a *label* from df.index, but the result frame
        # has to be written by *position*. Track the position per future so
        # a non-default index does not write the wrong row or raise.
        futures = {
            executor.submit(process_row, label): (position, label)
            for position, label in enumerate(df.index)
        }
        for future in as_completed(futures):
            position, index = futures[future]
            try:
                result = future.result()
                # Write raw values: the row already comes back in column
                # order, so label alignment is unnecessary (and would fail on
                # duplicated accessions).
                similarity_df.iloc[position] = result.to_numpy()
            except Exception as e:
                # Best effort: report the failure and keep the other rows.
                print(f"Failed to process row {index}: {e}")
    return similarity_df

In [15]:
# Run the all-vs-all computation; this rebinds the module-level
# `similarity_df` to the frame returned by process_rows().
similarity_df = process_rows()