In [1]:
# We select the columns in the MSA that didn't have a gap in 90% of the sequences

# For each MSA, calculate the gap percentage of each column of the MSA

In [1]:
import sys
import os
import glob
import pickle
import pandas as pd
sys.path.append("./scripts/")
from fasta2dict import fasta2dict

In [2]:
script_dir = os.path.basename(os.path.realpath(__file__))
base_dir = os.path.join(script_dir, "..")

In [29]:
# Kept only for checking the format
with open(f"{base_dir}/data/processed/parsed_pssm.pkl", 'rb') as ifile:
    pssm_data = pickle.load(ifile)

In [65]:
def calc_gap_frac_in_msa(msa):
    """Takes an MSA in fasta format, and calculates the gap fraction in each column. It assumes that the first column is the consensus"""
    """ sequence, and finds the gap fraction for the positions of the consensus sequence."""
    cons_seq = list(msa.values())[0]
    non_gap_in_cons = [i for i,x in enumerate(cons_seq) if x != "-"]
    msa_size_wo_cons = len(msa) - 1 # The first sequence is the consensus sequence. We want to see how many other instances are in each family
    nogap_frac_dict = {}
    no_gap_ord_num = 0 # It stores the ordinal number of non-gap column on the consensus sequence
    for i in non_gap_in_cons:
        non_gap_count = 0
        for j in range(1, msa_size_wo_cons + 1):
            if list(msa.values())[j][i] != "-":
                non_gap_count += 1
        nogap_frac_dict[no_gap_ord_num] = non_gap_count / msa_size_wo_cons
        no_gap_ord_num += 1
    return nogap_frac_dict

In [73]:
def avg(iterable):
    return sum(iterable)/len(iterable)

In [79]:
avg_nogap_frac = {x: sum(y)/len(y) for x,y in nogap_frac_top20_all_families.items()}

In [67]:
msa_paths = glob.glob(f"{base_dir}/data/raw/dbs/pfam_split_target/grp_by_family/*.fasta")

In [80]:
nogap_frac_all_families = {os.path.basename(path).replace(".fasta", "") :calc_gap_frac_in_msa(fasta2dict(path)) for path in msa_paths if len(fasta2dict(path)) > 2}

In [81]:
nogap_frac_top20_all_families = {}
for family, nogap_frac_list in nogap_frac_all_families.items():
    nogap_frac_top20_all_families[family] = {x:y for x,y in nogap_frac_all_families[family].items() if x in pssm_data["top20_pssm_profiles"][family].index}

In [83]:
avg(avg_nogap_frac.values())

80.06999076031398

In [39]:
pssm_data["top20_pssm_profiles"]["PF20060"].index

Index([ 17,  20,  30,  34,  36,  37,  40,  43,  44,  47,  70,  73,  75,  77,
        79,  83,  93,  95, 100, 102, 107, 108, 110, 111],
      dtype='int64')

In [37]:
nogap_frac_all_families["PF20060"]

{0: 1.0,
 1: 0.525,
 2: 0.525,
 3: 0.525,
 4: 0.5,
 5: 0.5,
 6: 0.5,
 7: 0.5,
 8: 0.525,
 9: 0.525,
 11: 0.675,
 12: 0.675,
 13: 0.65,
 14: 0.6,
 15: 0.6,
 16: 0.6,
 318: 0.85,
 319: 0.85,
 320: 0.85,
 321: 0.85,
 355: 0.975,
 356: 1.0,
 357: 1.0,
 358: 1.0,
 359: 1.0,
 360: 1.0,
 361: 1.0,
 362: 1.0,
 363: 1.0,
 364: 1.0,
 365: 1.0,
 366: 1.0,
 367: 1.0,
 368: 1.0,
 369: 1.0,
 370: 1.0,
 371: 1.0,
 372: 1.0,
 373: 1.0,
 374: 1.0,
 375: 1.0,
 376: 1.0,
 377: 1.0,
 378: 1.0,
 379: 1.0,
 381: 1.0,
 382: 1.0,
 383: 1.0,
 384: 1.0,
 385: 1.0,
 386: 1.0,
 387: 1.0,
 392: 0.975,
 393: 0.975,
 394: 0.55,
 395: 0.875,
 398: 1.0,
 399: 1.0,
 400: 1.0,
 401: 1.0,
 450: 0.525,
 451: 0.55,
 452: 0.6,
 453: 0.7,
 454: 0.7,
 455: 0.75,
 456: 0.775,
 457: 0.625,
 458: 0.95,
 459: 0.95,
 460: 0.95,
 463: 0.975,
 466: 0.975,
 467: 0.975,
 469: 0.95,
 506: 0.95,
 507: 0.95,
 508: 0.95,
 509: 0.95,
 517: 0.975,
 519: 1.0,
 520: 1.0,
 523: 1.0,
 524: 1.0,
 525: 1.0,
 555: 1.0,
 556: 1.0,
 557: 1.0,
 558: 

In [36]:
nogap_frac_top20_all_families["PF20060"]

{}

In [16]:
# What to do:
    1- Parse each PSSM and find the nogap fraction in each column
    2- Find the average gap fraction in the top20 profiles
    3- For those with PSSM calculate the similarity score using either BLOSUM or PSSM.
    4- Find the average gap fraction in conserved ones for those with multiple sequences.
    5- For those with multi sequence profiles, select the random ones with gap fraction above a speicific percentage. 
For thise with a single sequence, select random columns

In [15]:
gap_frac_dict

{0: 0.9583333333333334,
 1: 1.0,
 2: 1.0,
 3: 1.0,
 4: 1.0,
 5: 1.0,
 6: 1.0,
 7: 1.0,
 8: 1.0,
 9: 1.0,
 10: 1.0,
 12: 1.0,
 13: 1.0,
 14: 1.0,
 17: 0.9583333333333334,
 18: 1.0,
 19: 1.0,
 21: 1.0,
 22: 1.0,
 25: 1.0,
 26: 1.0,
 27: 1.0,
 28: 1.0,
 29: 1.0,
 30: 1.0,
 31: 1.0,
 32: 1.0,
 33: 1.0,
 34: 1.0,
 35: 1.0,
 36: 1.0,
 37: 1.0,
 38: 1.0,
 39: 1.0,
 40: 0.5416666666666666,
 46: 1.0,
 47: 1.0,
 48: 1.0,
 49: 1.0,
 50: 1.0,
 51: 1.0,
 52: 1.0,
 54: 1.0,
 55: 1.0,
 56: 1.0,
 57: 1.0,
 59: 0.6666666666666666,
 60: 0.5833333333333334,
 61: 1.0,
 62: 1.0,
 63: 1.0,
 64: 1.0,
 65: 1.0,
 69: 0.9583333333333334,
 70: 0.9583333333333334,
 71: 0.9583333333333334,
 72: 0.9166666666666666,
 75: 0.9583333333333334,
 76: 0.9583333333333334,
 77: 0.9583333333333334,
 78: 0.9583333333333334,
 79: 0.9583333333333334,
 80: 0.9583333333333334,
 81: 0.9583333333333334,
 82: 1.0,
 83: 1.0,
 84: 1.0,
 85: 1.0,
 86: 1.0,
 87: 1.0,
 89: 1.0,
 90: 1.0,
 91: 1.0,
 92: 1.0,
 93: 1.0,
 94: 1.0,
 95: 1.0,
