In [1]:
import os
import sys
module_path = "/data/luojaa/eukgen/"
sys.path.append(module_path)
from core_functions.helper_functions import fasta_to_dict, dict_to_fasta, filter_by_entropy 
import pandas as pd
import numpy as np

In [2]:
cd /data/luojaa/eukgen/

/vf/users/luojaa/eukgen


## build profiles from repclade usalign

In [3]:
# Name of the profile database plus the locations it is built from / written to.
DB_name = "polcore"
output_root = "/data/luojaa/polvir/processing/hhdb/"
fasta_root = "/data/luojaa/polvir/USalignDB/repclade.usalign_clean.entropyfilt.fasta"

# hhmake command line; the profile path is <output_root><DB_name>.hmm.
cmd_hhmake = (
    f"hhmake -i {fasta_root} "
    f"-o {output_root}{DB_name}.hmm "
    "-M 50 -add_cons"
)
# Execution is left disabled; uncomment the next line to run the command.
#os.system(cmd_hhmake)

## hhalign search against all seqs

In [4]:
# Bash command, run from /data/luojaa/polvir/seqs:
#for f in *; do hhalign -i $f -t ../processing/hhdb/polcore.hmm -o ../processing/hhalign_out/$f.ffdata; done
# TODO: ffindex later

## Parse hhalign output 

In [222]:
from core_functions.hhsuite_functions import merge_hhsuite_search_results

In [239]:
# Directory holding the hhalign results and the path stem for the merged table.
search_root = "/data/luojaa/polvir/processing/hhalign_mact/"
output_name = "/data/luojaa/polvir/processing/merged_hhalignmact"

# Thresholds handed to the (currently commented-out) merge call as
# filter_cov / filter_prob.
pairwise_cov = 0.1
probability = 0.1

# Not referenced anywhere else in the visible part of the notebook.
chunk_no = 10

In [244]:
# for file in *; do printf '\0' >> "$file"; done # add null byte
#merge_hhsuite_search_results(search_root, output_name, write_tsv=True, filter_cov=pairwise_cov, filter_prob=probability)

## Define insertion gaps based on alignment confidences

In [245]:
# Merged hhalign hits (tab-separated). Alternative input, currently disabled:
#   /data/luojaa/polvir/processing/merged_hhalign.tsv
hhalign_merged = "/data/luojaa/polvir/processing/merged_hhalignmact.tsv"
hhalign_raw = pd.read_csv(hhalign_merged, sep="\t")

# Drop rows whose Query equals their Target, then renumber the rows from 0
# (reset_index keeps the previous row labels in an "index" column).
hhalign_df = hhalign_raw.loc[
    lambda hits: hits["Query"] != hits["Target"]
].reset_index()

In [246]:
def conf_encode(scores, target_seq):
    """Turn a per-column confidence string into a list of booleans.

    Each column is decided as follows:
      * the target character is "-"                     -> treated as "0"
      * the target is not "-" and the score is a blank  -> treated as "7"
      * anything else                                   -> the score character itself
    The resulting digit is then compared against 3.

    Parameters
    ----------
    scores : str
        One character per alignment column (a digit or a blank).
    target_seq : str
        Aligned target sequence, same length as ``scores``.

    Returns
    -------
    list of bool
        True where the (possibly substituted) digit is greater than 3.
    """
    conf = np.array(list(scores))
    target = np.array(list(target_seq))

    target_gap = target == "-"
    query_gap = (target != "-") & (conf == " ")

    # The two masks never overlap, so one nested selection reproduces both
    # substitutions.
    digits = np.where(target_gap, "0", np.where(query_gap, "7", conf))
    return [int(digit) > 3 for digit in digits]

In [247]:
# One boolean list per alignment row, built from its confidence string and
# aligned target sequence.
hhalign_df["conf_encoded"] = hhalign_df.apply(
    lambda row: conf_encode(row["Confidence"], row["Target_sequence"]),
    axis=1,
)

In [248]:
def gap_bounds(bin_array, gap_threshold=50):
    """Locate long runs of False in a boolean sequence.

    Parameters
    ----------
    bin_array : sequence of bool
        Values to scan; converted to a boolean NumPy array.
    gap_threshold : int, optional
        A run of False is reported only when it is strictly longer than
        this many elements. Defaults to 50.

    Returns
    -------
    list of (int, int)
        Inclusive (start, end) index pairs, one per reported run, in order
        of appearance. Empty when the input is empty or has no such run.
    """
    arr = np.asarray(bin_array, dtype=bool)
    if arr.size == 0:
        # Nothing to scan; indexing the first run below would fail.
        return []

    # A new run begins wherever consecutive values differ; add 0 and len(arr)
    # so that every run is delimited by two neighbouring entries.
    run_edges = np.concatenate(([0], np.flatnonzero(np.diff(arr)) + 1, [len(arr)]))
    run_lengths = np.diff(run_edges)

    # A run's value is the value of its first element; keep the False ones.
    is_false_run = ~arr[run_edges[:-1]]
    long_false_runs = np.flatnonzero((run_lengths > gap_threshold) & is_false_run)

    # Plain ints keep the result free of NumPy scalar types.
    return [
        (int(run_edges[k]), int(run_edges[k + 1]) - 1)
        for k in long_false_runs
    ]

In [250]:
# Per row: the long False runs found in conf_encoded, and how many there are.
hhalign_df["gap_bounds_raw"] = hhalign_df["conf_encoded"].apply(gap_bounds)
hhalign_df["num_gaps"] = hhalign_df["gap_bounds_raw"].apply(len)

In [251]:
def map_gaps(query_seq, query_offset, gaps):
    """Convert alignment-column bounds into query residue numbers.

    Walking along ``query_seq``, a counter that starts at
    ``query_offset - 1`` is advanced by one for every character other than
    "-"; each column is mapped to the counter value reached at that column.

    Parameters
    ----------
    query_seq : str
        Aligned query sequence, with "-" marking gap columns.
    query_offset : int
        Number assigned to the first non-gap character.
    gaps : iterable of pairs
        (start, end) column indices into ``query_seq``.

    Returns
    -------
    list of tuple
        The same pairs with both ends replaced by their mapped numbers.

    Raises
    ------
    KeyError
        If a bound is not a column index of ``query_seq``.
    """
    residue_no = query_offset - 1
    column_to_residue = {}
    for column, symbol in enumerate(query_seq):
        if symbol != "-":
            residue_no += 1
        column_to_residue[column] = residue_no

    return [
        (column_to_residue[bounds[0]], column_to_residue[bounds[1]])
        for bounds in gaps
    ]

In [252]:
# Re-express each row's gap bounds (alignment columns) as query residue numbers.
hhalign_df["gap_pdb"] = hhalign_df.apply(
    lambda row: map_gaps(
        row["Query_sequence"], row["Query-HMM-start"], row["gap_bounds_raw"]
    ),
    axis=1,
)

## splice out insertions from pdb files

In [253]:
from Bio import PDB

In [254]:
# Sequence headers that the structure-splicing loop skips.
exclude = [
    "A3_NZ_CP040846",
    "AA1_JAEOSG010000207",
    "AA1_JAEOTO010000014",
    "AA1_JAEOUO010000001",
    "EZ_29176.XP_003881007",
    "EZ_5664.LmjF.23",
    "EZ_5679.XP_010699305",
    "EZ_5811",
    "EZ_99158.XP_008887255",
    "EZ_KAG5504462",
    "Mega_GCA_000911955",
    "Mega_Marseillevirus_LCMAC202",
]

In [255]:
def _keep_residue(residue_num, omit_regions, first_kept, last_kept):
    """True if the residue is inside [first_kept, last_kept] and outside every omitted region."""
    # Clip the unaligned ends first.
    if residue_num < first_kept or residue_num > last_kept:
        return False
    return not any(start <= residue_num <= end for start, end in omit_regions)


def _splice_structure(structure, omit_regions, first_kept, last_kept):
    """Build a new structure holding only the residues accepted by _keep_residue.

    Models and chains are recreated with their original ids; accepted
    residues are added to the new chains.
    """
    spliced = PDB.Structure.Structure("spliced_protein")
    for model in structure:
        new_model = PDB.Model.Model(model.id)
        spliced.add(new_model)
        for chain in model:
            new_chain = PDB.Chain.Chain(chain.id)
            new_model.add(new_chain)
            for residue in chain:
                # residue.id[1] is the residue sequence number.
                if _keep_residue(residue.id[1], omit_regions, first_kept, last_kept):
                    new_chain.add(residue)
    return spliced


# The parser and writer hold no per-file settings here, so build them once.
pdb_parser = PDB.PDBParser()
pdb_writer = PDB.PDBIO()
excluded_headers = set(exclude)

# Columns are read by name, so the loop does not depend on "gap_pdb" being the
# last column or on the frame's row labels matching row positions.
for seq, first_kept, last_kept, omit_regions in zip(
    hhalign_df["Query"],
    hhalign_df["Query-HMM-start"],
    hhalign_df["Query-HMM-end"],
    hhalign_df["gap_pdb"],
):
    # Drop the text after the last "." (if any) to get the structure file stem.
    seq_header = seq.rsplit(".", 1)[0]
    if seq_header in excluded_headers:
        continue

    input_file = f"/data/luojaa/polvir/af2_structures/{seq_header}.pdb"
    output_file = f"/data/luojaa/polvir/structures/refined_50gap_01mact/{seq_header}.refined.pdb"

    structure = pdb_parser.get_structure("protein", input_file)
    spliced = _splice_structure(structure, omit_regions, first_kept, last_kept)

    # Save the spliced structure.
    pdb_writer.set_structure(spliced)
    pdb_writer.save(output_file)