In [2]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import os
import sys
import time
import pickle
import altair as alt
from Bio import SeqIO
import multiprocessing

#my HHsuite module in ~/scripts
import parseHHsuite as HH

#----- NOTEBOOK CONFIG ------

#disable altair max rows
alt.data_transformers.disable_max_rows()
#get default altair style
%run ~/scripts/altair_style_config_default.py

pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', 300)

#enable output scrolling rather than wrapping
from IPython.display import display, HTML
display(HTML("<style>div.output_area pre {white-space: pre;}</style>"))



root = '/home/tobiassonva/data/eukgen/'
%cd {root}


/vf/users/tobiassonva/data/eukgen


In [None]:
#print memory usage of python objects in the notebook namespace
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a list of (name, size) tuples sorted by size, largest first.
# Sizes are sys.getsizeof() values divided by 1024**3, i.e. expressed in GiB.
# Skipped: names starting with '_', names that are also keys of sys.modules,
# and the ipython bookkeeping names listed above.
a = sorted([(x, sys.getsizeof(globals().get(x))/(1024**3)) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)
# one (name, size) tuple per output line
print(*[size for size in a], sep='\n')
# total over all listed objects; displayed as the cell result
sum([size[1] for size in a])


In [None]:
#small helper for pkl parsing
def load_pkl(pkl_file):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    pkl_file : str or path-like
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever object was pickled into ``pkl_file``.

    Notes
    -----
    The previous function-local ``import parseHHsuite as HH`` only bound an
    unused local name; pickle imports the module that defines a pickled class
    on its own, so the import was dropped.
    """
    # SECURITY: pickle.load can execute arbitrary code while unpickling;
    # only use this helper on files produced by this project.
    with open(pkl_file, 'rb') as infile:
        return pickle.load(infile)

def dump_pkl(item, pkl_file):
    """Pickle ``item`` into ``pkl_file`` (overwriting it) and print the path written."""
    with open(pkl_file, mode='wb') as sink:
        pickle.Pickler(sink).dump(item)
    print(f'Pickled item as {pkl_file}')

#pandas helper: sort a dataframe by one column and make that column the index, in place
def reindex(df, column):
    """Sort ``df`` by ``column`` and move that column into the index.

    Both steps mutate ``df`` in place; nothing is returned.
    """
    df.sort_values(column, inplace=True)
    df.set_index([column], drop=True, inplace=True)
    

    
#count whitespace-separated words over a list of label strings
#words matching the regex blacklist expression are dropped before counting
def calculate_label_counts(labels, blacklist='(protein)'):
    """Return word frequencies over ``labels``.

    Each label is split on whitespace; every word in which the regular
    expression ``blacklist`` matches anywhere is discarded. The remaining
    words are counted with ``Series.value_counts`` (most frequent first).
    """
    tokens = pd.Series([token for label in labels for token in label.split()])
    is_blacklisted = tokens.str.contains(blacklist, regex=True)
    return tokens[~is_blacklisted].value_counts()



#basic attempt to amend the header cropping employed by hhsuite (capped at 138 or 142 chars??)
#requires searchDF from parse_HHsuite or an equivalent dataframe with Query and Target columns
#requires a global reference mapping indexed by header with an 'acc' column, covering queries and targets
def create_hhsuite_header_mapping(searchDF, global_header_mapping):
    """Map every header seen in ``searchDF`` to an accession.

    Each unique value of ``searchDF.Query`` and ``searchDF.Target`` is first
    looked up directly in the index of ``global_header_mapping``. When that
    raises ``KeyError``, the first whitespace-separated token of the header is
    used as the accession instead.

    Returns a dataframe indexed by 'header' (sorted) with a single 'acc' column.
    Raises ``IndexError`` when a fallback accession has no row in
    ``global_header_mapping``.
    """
    entries = pd.concat([searchDF.Query, searchDF.Target]).unique()

    def _resolve_acc(hit):
        #direct match on the full header first
        try:
            return global_header_mapping.loc[hit].acc
        except KeyError:
            #header not found as-is: fall back to its first space separated element
            print('cannot find hit for', hit)
            print('trying via acc')
            fallback_acc = hit.split()[0]
            #the full header is looked up but not used further; indexing [0]
            #still raises IndexError when the accession is unknown
            _full_header = global_header_mapping[global_header_mapping.acc == fallback_acc].index[0]
            return fallback_acc

    accs = [_resolve_acc(hit) for hit in entries]

    #format and return DF
    return (
        pd.DataFrame({'acc': accs, 'header': entries})
        .sort_values(by='header')
        .set_index(keys=['header'], drop=True)
    )


#alignment "viewer style" plot for HHSuite alignments
#takes input as pandas Series containing the 6 series of an HHSuite alignment
#returns the processed dataframe and the altair chart object
#
#Query_sequence     RRRILGPMSSMMMAMAFLSTYPPEFIKRGLEGLRPDGRRPNELRPI...
#Query_consensus    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~R~DGR~~delRpI...
#Matches            |+..++.-.||||.+.--   +++-+  ..+++|+|||.+||+|||...
#Target_consensus   ~~~~~~~~~~~~~~~~~~---~~~~~--~~~~~R~dGR~~deLRpv...
#Target_sequence    LSHWLGASGSMMMMMTMQ---MPKLI--DENMMRPDGRAPDELRPV...
#Confidence         455667677777765432   22333  249999999999999999...

def plot_alignment(alignmentDF, query_name, target_name):
    """Draw an HHSuite pairwise alignment as a coloured token grid.

    Parameters
    ----------
    alignmentDF : pandas.Series
        One string per alignment series, keyed by series name. The code reads
        a 'Confidence' entry; the other names used for row ordering are
        'Query_sequence', 'Query_consensus', 'Matches', 'Target_consensus'
        and 'Target_sequence'.
    query_name, target_name : str
        Names shown in the two-line chart title.

    Returns
    -------
    (pandas.DataFrame, altair chart)
        The wide per-position dataframe (one column per series plus 'seqn'
        and 'opacity') and the layered box + text chart.
    """

    #one column per series, one row per alignment position
    alignmentDF = pd.DataFrame({i:list(alignmentDF[i]) for i in alignmentDF.index})
    alignmentDF['seqn'] = alignmentDF.index
    #opacity grows with the confidence digit via tanh(digit/4); blank confidence gives 0
    alignmentDF['opacity'] = [np.tanh(int(i)/4) if i != ' ' else 0 for i in alignmentDF.Confidence]
    #long format: one row per (position, series) with the character in 'token'
    #NOTE(review): columns[:-1] drops only 'opacity', so 'seqn' is passed both as
    #id_var and value_var - confirm melt yields just the alignment series
    alignmentDF_melt = alignmentDF.melt(id_vars=['seqn', 'opacity'], value_vars=alignmentDF.columns[:-1], var_name='series', value_name='token')



    #style configuration
    #symbols and digits are drawn on white; upper-case residues get a colour and
    #lower-case residues a lighter colour
    token_color_dict = {'-': 'white','~': 'white','|': 'white', '+': 'white','.': 'white',' ': 'white',
                    '1': 'white', '2': 'white', '3': 'white', '4': 'white', '5': 'white', '6': 'white', '7': 'white', '8': 'white', '9': 'white',
                    'R': '#6276ba', 'K': '#7297c1', 'H': '#7297c1', 'D': '#b25652', 'E': '#b25652', 'S': '#b5b65e', 'T': '#94ae57', 'N': '#72a551', 'Q': '#72a551', 'C': '#cca389', 'G': '#c4ced4', 'P': '#95b5c7', 'A': '#bfa764', 'V': '#b5b65e', 'I': '#94ae57', 'L': '#72a551', 'M': '#cca389', 'F': '#d8c7be', 'Y': '#c4ced4', 'W': '#6276ba',
                    'r': '#c7cee6', 'k': '#b5c9df', 'h': '#b5c9df', 'd': '#e3c2c0', 'e': '#e3c2c0', 's': '#e6e6c6', 't': '#c8d6a8', 'n': '#bed7ae', 'q': '#bed7ae', 'c': '#ead6c9', 'g': '#eef1f3', 'p': '#d1e0e8', 'a': '#e0d6b5', 'v': '#e6e6c6', 'i': '#c8d6a8', 'l': '#bed7ae', 'm': '#dfc7b6', 'f': '#eee7e3', 'y': '#e0e6e8', 'w': '#b5bfde'}

    #largest position index; used for the x tick values and the chart width
    seq_len = alignmentDF_melt.seqn.max()
    #counts every column of the wide frame, i.e. including 'seqn' and 'opacity'; only used for the chart height
    series_number = len(alignmentDF.columns)
    #font size in points; width and height are multiples of it
    scale = 12

    title = ['Query:    \t'+query_name, 'Target:   \t'+target_name]

    token_names = list(token_color_dict.keys())
    token_colors = list(token_color_dict.values())
    #top-to-bottom order of the rows
    sort_order = ['Confidence', 'Query_sequence', 'Query_consensus', 'Matches', 'Target_consensus', 'Target_sequence']

    #base chart 
    base = alt.Chart(alignmentDF_melt, title=title).encode(
        alt.X('seqn:O', axis=alt.Axis(title=None, values=list(range(0,seq_len,5)), grid=False)),
        alt.Y('series:O', sort=sort_order, axis=alt.Axis(grid=False, title=None, labelFontSize=scale)),
        alt.Opacity('opacity', legend=None),
    ).properties(width=seq_len*scale*1.4, height=series_number*scale*1.4)

    #residue labels
    text = base.mark_text(color ='black', align='center', fontSize=scale).encode(
        alt.Text('token')
    )

    #colored boxes
    #the legend is restricted to the 20 upper-case residue letters
    box = base.mark_rect().encode(
        alt.Color('token', scale=alt.Scale(domain=token_names, range=token_colors),
                 legend=alt.Legend(direction='horizontal', columns=4, orient='left', title=None, labelFontSize=scale,
                                   values=['R', 'K', 'H', 'D', 'E', 'S', 'T', 'N', 'Q', 'C', 'G', 'P', 'A', 'V', 'I', 'L', 'M', 'F', 'Y', 'W']))
    )

    #boxes below, text on top
    chart = alt.layer(box, text).configure_title(fontSize=scale*1.5)

    return alignmentDF, chart


#value distribution plus cumulative fraction for a pd.Series
#takes pd.Series as input and returns a parsed DF and altair chart object
def plot_cumsum_counts(series, title='Chart', x_label='value', y_label='count', 
                       x_min=0, y_min=0, x_max=None, y_max=None,
                       x_scale_type='log', y_scale_type='log', decimals=2):
    """Tabulate and plot how often each value of ``series`` occurs.

    Zero values are removed and the rest rounded to ``decimals`` before
    counting. The returned dataframe has the columns ``x_label`` (value),
    ``y_label`` (occurrences), 'cumsum' and 'frac_cumsum', sorted by value.

    The returned chart layers a step-area of the counts with a line of the
    cumulative fraction; the two layers use independent y scales and the
    chart is interactive. ``x_max`` / ``y_max`` override the data-derived
    upper domain bounds when truthy.
    """
    #drop zeros, round to reduce float data display jaggedness
    nonzero = series[series!=0].round(decimals)

    #occurrences per value, ordered by value, with running totals
    countDF = (
        nonzero.value_counts()
        .rename('amount')
        .to_frame()
        .sort_index()
        .assign(cumsum=lambda frame: frame['amount'].cumsum())
        .assign(frac_cumsum=lambda frame: frame['cumsum']/frame['cumsum'].max())
        .reset_index()
    )

    #rename columns for plotting
    countDF.columns = [x_label,y_label,'cumsum','frac_cumsum']

    #axis domains: data-derived upper bounds unless an override is given
    data_x_max = nonzero.max()
    data_y_max = countDF[y_label].max()
    x_range = [x_min, x_max if x_max else data_x_max]
    y_range = [y_min, y_max if y_max else data_y_max]

    tooltip_fields = [x_label, y_label, 'frac_cumsum']

    #cumulative distribution as a line
    cumulative_line = alt.Chart(countDF, title=title).mark_line(
        color=colorlib['twilight_shifted_r_perm'][2],
        strokeWidth=3,
    ).encode(
        x=alt.X(x_label, title=x_label, scale=alt.Scale(type=x_scale_type)),
        y=alt.Y('frac_cumsum', title='Cumulative Fraction', scale=alt.Scale(domain=[0,1]), axis=alt.Axis(labelAlign='left')),
        tooltip=alt.Tooltip(tooltip_fields)
    )

    #value distribution as a stepped area
    count_area = alt.Chart(countDF).mark_area(
        interpolate='step-after',
        fillOpacity=0.2,
        line=True,
    ).encode(
        x=alt.X(x_label+':Q', scale=alt.Scale(domain=x_range, type=x_scale_type)),
        y=alt.Y(y_label, scale=alt.Scale(domain=y_range, type=y_scale_type)),
        tooltip=alt.Tooltip(tooltip_fields)
    )

    #merge and configure
    layered = alt.layer(count_area, cumulative_line).resolve_scale(y='independent').interactive()

    return countDF, layered




#parse HHSuite output from an ffdata file into a pkl file next to it
def parse_and_write(file):
    """Print ``file``, load it with ``HH.load_HHBlitsData`` and write ``file + '.pkl'``."""
    print(file)
    parsed = HH.load_HHBlitsData(file)
    pkl_path = file+'.pkl'
    parsed.write_pkl(pkl_path)
    #tsv export left disabled: parsed.write_data_tsv(file+'.tsv')


def parse_filter_write(file):
    """Parse one ffdata file, save it raw, then filter every query and save again.

    Writes ``file + '.pkl'`` before filtering and ``file + '.pkl_filtered'``
    afterwards. For each query a self hit is added, then hits are filtered on
    'Pairwise_cov' (min 20) and 'Prob' (min 50), keeping the self hit.
    Progress lines are prefixed with the pid of the current process.
    """
    worker_pid = multiprocessing.current_process().pid

    print(f'{worker_pid} reading {file}\n')
    parsed = HH.load_HHBlitsData(file)
    parsed.write_pkl(file+'.pkl')

    print(f'{worker_pid} parsing\n')
    thresholds = (('Pairwise_cov', 20), ('Prob', 50))
    for _, query in parsed.data.items():
        query.add_self_hit()
        for field, minimum in thresholds:
            query.filter_numeric(field=field, min=minimum, replace=True, keep_self=True)

    print(f'{worker_pid} writing\n')
    parsed.write_pkl(file+'.pkl_filtered')

In [None]:
#parallel read and filter all data in files, save to pkl
#every file name ending in 'ffdata' under search_root is handed to
#parse_filter_write in a pool of 16 worker processes

search_root='search/euk-prok/search/'
files = [search_root+file for file in os.listdir(search_root) if file.endswith('ffdata')]

with multiprocessing.Pool(processes=16) as pool:
    pool.map(parse_filter_write, files)
    

    
#open all parsed pkl files in folder and merge into one object
#NOTE(review): the workers above write '<file>.pkl' and '<file>.pkl_filtered'
#next to the ffdata files in search/euk-prok/search/, whereas this step reads
#search/euk-prok/pkl/ and keeps names ending in 'pkl' (which '.pkl_filtered'
#does not) - confirm which files are expected to be in that folder

all_data = HH.HHblitsData()
#search_root is re-pointed here; the value used for parsing above is lost
search_root='search/euk-prok/pkl/'
pkl_filtered_files = [file for file in os.listdir(search_root) if file.endswith('pkl')]

for i, file in enumerate(pkl_filtered_files):
    print(i,file)
    new_data = HH.HHblitsData()
    new_data.load_from_pkl(search_root+file)
    all_data.add_entries(new_data.data)

    
#writing the merged object is currently disabled
#all_data.write_pkl(search_root+'merged_filtered_self-match.pkl')
#all_data.write_data_tsv(search_root+'merged_filtered_self-match.csv')

In [None]:
## load data from /analysis/core_data
## NOTE: some paths below are prefixed with `root`, others are relative and
## therefore depend on the current working directory

#clustering data
#the 'members' entry of each pickled object is kept and indexed by 'cluster_acc'

print('Loading clustering data')
euk_clust = load_pkl(root+'analysis/core_data/euk72_filtered-prof-search-clust.pkl')['members']
prok_clust = load_pkl(root+'analysis/core_data/prok2111_filtered-prof-search-clust.pkl')['members']

reindex(euk_clust, 'cluster_acc')
reindex(prok_clust, 'cluster_acc')

# header mapping
# indexed by 'header' after loading

print('Loading header mapping')
euk_header = load_pkl(root+'analysis/core_data/euk72_header_mapping.pkl')
prok_header = load_pkl(root+'analysis/core_data/prok2111_header_mapping.pkl')

reindex(euk_header, 'header')
reindex(prok_header, 'header')

#hhsuite profile header mapping as hhsuite crops header info
hhsuite_header = load_pkl(root+'analysis/core_data/hhsuite_header_mapping.pkl')

#cluster taxonomic filter info
print('Loading taxonomy info')
euk_tax = load_pkl('euk72/euk72_protein_taxonomy.pkl')

#full prok tax
#prok_tax = load_pkl('prok2111/prok2111_protein_taxonomy.pkl')

#lighter parsed version
prok_tax = load_pkl('analysis/core_data/prok2111_protein_taxonomy_trimmed.pkl')

#search data
print('Loading search data')
#full searchDF
#searchDF = load_pkl(root+'analysis/core_data/merged_filtered_cov20_self-match_tsv.pkl')

#parsed acc version without alignment, self hits and indexed in query
searchDF = load_pkl('analysis/core_data/merged_filtered_cov20_self-match_tsv_edited_no_aln.pkl')

search_queries = load_pkl(root+'analysis/core_data/merged_filtered_cov20_self-match_tsv.query.pkl')

In [None]:
#example queries for testing
euk_queries_test1 = ['OLP83888.1', 'SPQ97222.1', 'XP_009012109.1', 'XP_001461706.1',
       'OLP92683.1', 'XP_002965802.1', 'XP_024530808.1', 'OLP86390.1',
       'KAA0160735.1', 'XP_002681799.1', 'XP_002113075.1', 'OLQ13277.1',
       'XP_001469768.1', 'XP_008902708.1', 'XP_002682078.1', 'XP_024309865.1',
       'OLP87304.1', 'XP_004365904.1', 'XP_012899378.1', 'XP_032224311.1',
       'XP_032223284.1', 'XP_001707828.1', 'XP_005536084.1', 'XP_018187362.1',
       'XP_645838.1', 'PXF40497.1', 'XP_005847475.1', 'KAA0160637.1',
       'XP_013754706.1', 'XP_008905306.1']

euk_queries_test2 = ['CBN77353.1', 'CEL94470.1', 'CEL98020.1', 'CEM00912.1',
       'CEM13793.1', 'CEO94447.1', 'CEP02189.1', 'CEP02404.1',
       'EPZ31333.1', 'GBG32138.1', 'GBG34166.1', 'GBG34636.1',
       'GBG88810.1', 'KAA0151157.1', 'KAA0167757.1', 'KAA6364588.1',
       'KAA6383781.1', 'NP_001022034.1', 'NP_001105121.2',
       'NP_001170744.1', 'NP_001189295.1', 'NP_001242666.1',
       'NP_001259573.1', 'NP_001261837.1', 'NP_001294564.1',
       'NP_001307934.1', 'NP_001328712.1', 'NP_012528.1', 'NP_050092.1',
       'NP_051148.1', 'NP_189541.1', 'NP_197350.1', 'NP_498455.2',
       'NP_505960.3', 'NP_588329.1', 'NP_595422.1', 'NP_609709.1',
       'NP_611238.2', 'NP_649295.1', 'OAD00700.1', 'OAD03858.1',
       'OAD05886.1', 'OAE33051.1', 'OLP78629.1', 'OLQ06972.1',
       'OLQ08228.1', 'OLQ08510.1', 'OLQ11720.1', 'OLQ12045.1',
       'OLQ14344.1']

euk_queries_test3 = ['XP_008911403.1', 'XP_011408184.1', 'XP_002681038.1', 'XP_002673113.1',
                     'OAD09041.1', 'XP_001634466.1', 'XP_005765180.1', 'XP_011407364.1', 
                     'XP_005789988.1', 'KAA6344160.1', 'KAA6409619.1', 'XP_002287408.1',
                     'OAE21175.1', 'RKP17192.1', 'XP_013760427.1', 'KAA0163767.1',
                     'XP_002119908.1', 'XP_009692086.1']

#queries in superkingdoms with between 20 and 50 members hitting fewer than 20 prokaryotic profiles
#and the representative sequence is from a fish, insect or mammal
euk_queries_test4 = ['NP_001002332.1', 'NP_001240313.1',
       'NP_001259573.1', 'NP_001260847.1', 
       'NP_001278869.1', 'NP_001307724.1', 'NP_001307934.1', 'NP_001334755.1',
       'NP_001356620.1', 'NP_115888.1', 
       'NP_610753.1', 'NP_611238.2', 'NP_956312.1',
       'NP_998197.1', 'NP_998403.1', 'XP_005256905.1',  'XP_017206845.1', 'XP_021326060.1', 'XP_021336265.1']

#queries in superkingdoms with between 20 and 50 members hitting fewer than 20 prokaryotic profiles
#and there are between 200 and 2000 prokaryotic hits
euk_queries_test5 = ['AGK83073.1', 'CBN73833.1', 'CBN79086.1', 'CEM35385.1', 'CEO94447.1', 'CEP00213.1', 'CEP03651.1', 'EPZ30938.1', 'EPZ31301.1', 'GBG60132.1', 
'GBG70565.1', 'GBG80562.1', 'GBG83744.1', 'KAA0165271.1', 'KAA0172078.1', 'KAA6408708.1', 'NP_001002332.1', 'NP_001259573.1', 'NP_001294564.1',
 'NP_001307724.1', 'NP_001328712.1', 'NP_001334755.1', 'NP_001356620.1', 'NP_011081.1', 'NP_050092.1', 'NP_594946.1', 'NP_610753.1', 'NP_848958.1',
  'NP_849074.1', 'NP_956312.1', 'NP_998197.1', 'OAD04802.1', 'OAD06369.1', 'OAE33370.1', 'OLP84660.1', 'OLQ06972.1', 'OLQ08228.1', 'OLQ14344.1',
   'OSX69435.1', 'OSX71470.1', 'OSX72678.1', 'OSX75094.1', 'OSX77054.1', 'PTQ50428.1', 'PXF41822.1', 'PXF45288.1', 'RKP17849.1', 'RKP18091.1', 
   'RKP20265.1', 'RWR93989.1', 'RWR97906.1', 'RWR98344.1', 'SLM34047.1', 'SLM40311.1', 'SLM40671.1', 'SPQ96285.1', 'SPQ98172.1']


print('Done')

#calculation of profiles to be included in MSA evaluation
#keep queries whose cluster LCA is 'superkingdom', with hits above the
#coverage and probability thresholds
euk_lca = load_pkl('analysis/core_data/euk72_filtered-prof-search-clust.lca.pkl')
euk_lca_superkingdom = euk_lca[euk_lca.lca.isin(['superkingdom'])].index.unique().values

searchDF_filtered = searchDF[(searchDF.index.isin(euk_lca_superkingdom)) &
                            (searchDF.Pairwise_cov > 0.5) &
                            (searchDF.Prob > 50)]


queries_filtered = searchDF_filtered.index.unique().values


#extended set for full microcosm evaluation testing
#the two masks are indexed by accession (cluster sizes / hits per query);
#selecting with them relies on pandas aligning the mask to the frame's index
wider_set = searchDF_filtered[(euk_clust.index.value_counts().between(20,1000)) & 
                  (searchDF.index.value_counts().between(2,500))]

#fixed seed so the sampled query set is the same on every run; the sample
#size is capped at the number of available rows so small sets do not raise
WIDER_SET_SEED = 0
wider_set_queries = (
    wider_set.sample(min(1000, len(wider_set)), random_state=WIDER_SET_SEED)
    .index.unique()[0:200].values
)

In [None]:
def structure_microcosm(queries, query_hits, query_clusters, target_clusters, root):    
    """Write the member and target listings for one or more query clusters.

    For every query a directory ``root + query + '/'`` is created containing

    * ``<query>.acc``     - the 'acc' values of the query cluster, one per line
    * ``<query>.targets`` - the rows of ``target_clusters`` for every hit of
      the query, tab separated, index first, no header

    Parameters
    ----------
    queries : str or iterable of str
        A single query accession or several of them. (The body previously
        read a global named ``query`` instead of this argument.)
    query_hits : pandas.Series
        Hit labels indexed by query accession.
    query_clusters : pandas.DataFrame
        Indexed by query cluster accession, with an 'acc' column.
    target_clusters : pandas.DataFrame
        Indexed by the labels found in ``query_hits``.
    root : str
        Output directory prefix, including the trailing separator.
    """
    #a bare string is one accession, not an iterable of characters
    if isinstance(queries, str):
        queries = [queries]

    for query in queries:
        root_query = root+query+'/'
        #exist_ok so that re-running for the same query does not crash
        os.makedirs(root_query, exist_ok=True)

        print(f'Q:{query}')
        #list-style .loc keeps the result a Series even for a single row
        query_members = query_clusters.loc[[query], 'acc']
        hits = query_hits.loc[[query]]
        members = target_clusters.loc[hits]

        #write query accessions to .acc
        query_members.to_frame().to_csv(root_query+f'{query}.acc', sep='\t', header=False, index=False)

        #write target cluster members to .targets (index kept as first column)
        pd.DataFrame(members).to_csv(root_query+f'{query}.targets', sep='\t', header=False)
            

In [None]:
#write one microcosm directory per sampled query under microcosm2/
#NOTE(review): the disabled cleanup below targets microcosm4/, not microcosm2/ - confirm which folder is current
#os.system('rm -r microcosm4/*')
for query in wider_set_queries:
    structure_microcosm(query, searchDF_filtered.Target, euk_clust, prok_clust, 'microcosm2/')

In [None]:
query = 'CBJ26283.1'
structure_microcosm(query, searchDF_filtered.Target, euk_clust, prok_clust, 'microcosm2/')

In [None]:
#small test selection: cluster index counts between 20 and 100 and
#search index counts between 2 and 30
test = searchDF_filtered[(euk_clust.index.value_counts().between(20,100)) & 
                  (searchDF.index.value_counts().between(2,30))]

#wider selection with bounds 20-1000 and 2-500; this overwrites any
#existing `wider_set` with the same expression used when it was first built
wider_set = searchDF_filtered[(euk_clust.index.value_counts().between(20,1000)) & 
                  (searchDF.index.value_counts().between(2,500))]


In [None]:
wider_set.sample(1000).index.unique()[0:200].values

In [None]:
#running total of prok_clust rows reached through the hits of each query
counts = 0

for n, i in enumerate(searchDF_filtered.index.unique()):
    #targets of query i ([[i]] keeps the result array-like even for a single hit)
    hits = searchDF_filtered.loc[[i],'Target'].values
    counts += prok_clust.loc[hits].shape[0]
    #columns: query number, running mean rows per query, running total, query
    print(n, round(counts/(n+1),1), counts, i, sep='\t')
    
        


In [None]:
#manual replay of the lookups for a single query, using notebook globals
#under the same names as the parameters of structure_microcosm
query = 'OAE21175.1'
query_clusters = euk_clust
query_hits = searchDF_filtered.Target
target_clusters = prok_clust

#'acc' values of the query cluster
query_members = query_clusters.loc[query, 'acc']
#target labels hit by the query
hits = query_hits.loc[query]
#rows of the target clustering for those hits
members = target_clusters.loc[hits]

In [None]:
query_clusters.loc[query, 'acc'].drop_duplicates()

In [None]:
hits

In [None]:
#read a two-column, tab separated file into {cluster_acc: [acc, ...]}
cluster_file = 'microcosm2/KAA0163767.1/KAA0163767.1.acc'
with open(cluster_file, 'r') as infile:
    clusters = {}

    for l in infile:
        cluster_acc, acc = l.strip().split('\t')
        #first sighting of a cluster starts its list, later ones extend it
        clusters.setdefault(cluster_acc, []).append(acc)
clusters

In [None]:
#given a series of query accessions, retrieve all proteins from prokaryotic hits and return their phylogenetic distribution count
#designed for multiprocess Pool and checkpointing to .pkl files
#not very portable 

def get_hit_tax_dist(queries):
    """Count the taxonomic 'class' of all proteins behind each query's hits.

    The worker loads its own copies of the search table, the prokaryotic
    clustering and the trimmed prokaryotic taxonomy from disk, keeps the
    non-self hits of ``queries`` with ``Pairwise_cov > 0.5`` and, per query,
    follows Target -> cluster members ('acc') -> taxonomy 'class' and stores
    the ``value_counts`` of the classes.

    Side effects
    ------------
    * After every 1000 queries the collected results are pickled to
      ``analysis/core_data/tax/<pid>_checkpoint_<k>_tax.pkl`` and the
      in-memory dict is emptied.
    * The remainder is pickled to ``analysis/core_data/tax/<pid>_tax.pkl``.

    NOTE(review): file names depend only on the process id. If one process
    is ever given more than one split, its later files would replace the
    earlier ones - confirm each worker handles exactly one split.

    Parameters
    ----------
    queries : sequence of str
        Query accessions; each must have at least one hit left after
        filtering, otherwise the ``.loc`` lookup raises ``KeyError``.

    Returns
    -------
    dict
        ``{query: Series of class counts}`` for the queries processed since
        the last checkpoint only.
    """
    stime = time.time()
    querynr = len(queries)
    
    thread = multiprocessing.current_process().pid
    print(f'{thread}: started\n')
    
    print(f'{thread}: processing {querynr} queries\n')
    
    #load data from files 
    print(f'{thread}: loading search\n')
    searchDF = load_pkl(root+'analysis/core_data/merged_filtered_cov20_self-match_tsv_edited_no_aln.pkl')
    
    print(f'{thread}: loading clust\n')
    prok_clust = load_pkl(root+'analysis/core_data/prok2111_filtered-prof-search-clust.pkl')['members']
    reindex(prok_clust, 'cluster_acc')

    print(f'{thread}: loading tax\n')
    prok_tax = load_pkl('analysis/core_data/prok2111_protein_taxonomy_trimmed.pkl')

    #process only the given slice; index by query for faster per-query lookups
    #(built as a new frame instead of sorting/indexing a filtered slice in place)
    keep = ((searchDF.Query.isin(queries)) &
            (searchDF.Query != searchDF.Target) &
            (searchDF.Pairwise_cov > 0.5))
    searchDF = (
        searchDF[keep]
        .sort_values(by='Query')
        .set_index(keys=['Query'], drop=True)
    )

    #iterate and pool taxa distributions
    taxa = {}
    printn = 50      #progress line every printn queries
    checkn = 1000    #checkpoint every checkn queries

    for n, query in enumerate(queries):
        
        if n%printn == 0:
            print(f'{thread}: calculating {query} \t{n}|{querynr} \tT+{round(time.time()-stime)} seconds\n')
        
        #find all target profile hits
        profiles = pd.Series(searchDF.loc[query, 'Target'])

        #find all proteins in target profile hits
        proteins = pd.Series(prok_clust.loc[profiles,'acc'])

        #find all taxonomic information from proteins in target profile hits
        query_taxa = prok_tax.loc[proteins, 'class'].value_counts()

        #add to dict
        taxa[query] = query_taxa
        
        #intermediate save; the dict is emptied afterwards to bound memory
        if n != 0 and n%checkn == 0:
            checkpoint = n//checkn
            print(f'{thread}: saved checkpoint {checkpoint}')
            dump_pkl(taxa, f'analysis/core_data/tax/{thread}_checkpoint_{checkpoint}_tax.pkl')
            taxa = {}
        
    dump_pkl(taxa, f'analysis/core_data/tax/{thread}_tax.pkl')
    return taxa





#launch parallel execution
#queries with at least one non-self hit above the coverage threshold,
#split into 16 chunks, one per pool task
queries = searchDF[(searchDF.Query != searchDF.Target)
                    & (searchDF.Pairwise_cov > 0.5)].Query.unique()

splits = np.array_split(queries, 16)
with multiprocessing.Pool(processes=16) as pool:
    pool.map(get_hit_tax_dist, splits)
    
    
#load data from savepoints into one dictionary
tax_dir = 'analysis/core_data/tax/'
tax_data = {}
for file in os.listdir(tax_dir):
    print(tax_dir+file)
    tax_data.update(load_pkl(tax_dir+file))
    
#merge dict series into one dataframe: one row per query, one column per class
#all rows are concatenated in a single call; growing the frame inside a loop
#copies it on every iteration
query_tax = (
    pd.concat([counts.rename(query).to_frame().transpose()
               for query, counts in tax_data.items()])
    if tax_data else pd.DataFrame()
)

#save processed dataframe
dump_pkl(query_tax, 'analysis/core_data/hit_distribution_cov50.pkl')




#calculate query statistics tables
#load raw hit counts
query_tax = load_pkl('analysis/core_data/hit_distribution_cov50.pkl')

#normalize to relative total hits
query_tax_rel = query_tax.div(query_tax.sum(axis=1), axis=0)
#calculate percentile ranks for relative observations skipping 0 observations
query_tax_rel_percentile = query_tax_rel.apply(lambda df: df[df!=0].rank(method='max', pct=True), axis=0).fillna(0)

#multiply by individual hits to get weights
query_tax_weight = pd.DataFrame(query_tax.values*query_tax_rel.values, columns=query_tax.columns, index=query_tax.index)
#calculate percentile rank of observation of weight
query_tax_weight_percentile = query_tax_weight.apply(lambda df: df.rank(method='max', pct=True), axis=0)


#write headers of proteins from query clusters whose relative hits in the
#respective taxon fall in the 0.85-0.90 percentile band to file
#NOTE(review): this used to be described as "top 10%", which would be a
#percentile rank of 0.90 and above - confirm the intended band

#the Mammalia subset does not depend on the taxon, so build it once
homo_tax = euk_tax[euk_tax['class'] == 'Mammalia']

#never filled below; kept because other cells may refer to the name
prominent_taxon_proteins = {}
for taxon in query_tax.columns:
    prominent_queries = query_tax_rel_percentile[(query_tax_rel_percentile[taxon].between(0.85, 0.90))].index
    prominent_proteins = euk_clust.loc[prominent_queries, 'acc'].values

    prominent_homo_proteins = homo_tax[homo_tax.index.isin(prominent_proteins)].index
    #reset_index returns a new frame, so the filtered slice is not modified in place
    outdata = euk_header[euk_header.acc.isin(prominent_homo_proteins)].reset_index()
    print(taxon, outdata.shape[0])
    
    outdata['header'].to_csv(f'analysis/core_data/significant/{taxon.replace(" ", "_")}.tsv', sep='\t', index=False, header=False)
    


In [None]:

#describe search results

print('Load data')
#load parse_HHSuite data from pkl 
search_data = HH.HHblitsData()
search_data.load_from_pkl(root+'search/euk-prok/merged_filtered_cov20_self-match.pkl')

#search block csv
search_csv = pd.read_csv(root+'search/euk-prok/merged_filtered_cov20_self-match.csv', sep='\t')
#query block csv
search_queries_csv = pd.read_csv(root+'search/euk-prok/merged_filtered_cov20_self-match.queries.csv', sep='\t', index_col='Query')

#remove self hits
search_nonself_csv = search_csv[(search_csv.Query != search_csv.Target)]

#total hits
print(f'Total hits = {search_nonself_csv.shape[0]}\n')

#probabilities
probs = search_nonself_csv.Prob.describe()
print('Probability breakdown\n', probs,'\n')

#identities
ident = search_nonself_csv.Identities.describe()
print('Identity breakdown\n', ident,'\n')


#calculate number of hits per query and number of queries per hit
#only the first 100000 queries are counted
print('calculating hit statistics\n')
hits = []
for query in search_data.query_names[0:100000]:
    query_data = pd.DataFrame(search_data.data[query].hit_dict)
    hits.append(query_data[query_data.Target != query].shape[0])
    
inverse_hits = search_nonself_csv.Target.value_counts()

print('Most frequent inverse hits\n')
print(inverse_hits[0:20], '\n')

print('most common words among inverse hits\n')
print(calculate_label_counts(inverse_hits[inverse_hits>100].index)[0:20], '\n')

#plot hit distributions
#plot_cumsum_counts returns (dataframe, chart); only the charts can be
#stacked with `&`, so unpack before combining
inverse_hits_counts, inverse_hits_chart = plot_cumsum_counts(pd.Series(inverse_hits.values))
query_hits_counts, query_hits_chart = plot_cumsum_counts(pd.Series(hits))
inverse_hits_chart & query_hits_chart

In [None]:
#plot distribution of probabilities against identities

#NOTE(review): the cell that loads data assigns an HH.HHblitsData object to
#`search_data`; the boolean selection and .sample() here look like dataframe
#operations - confirm which object is meant
subset = search_data[search_data.Query != search_data.Target].sample(100000)

#binned heatmap (1-unit bins on both axes), coloured by row count
#note: `chart` is only assigned here; the cell's displayed result is the
#second chart at the bottom
chart = alt.Chart(subset).mark_rect().encode(
    x=alt.X('Prob:Q', bin=alt.Bin(extent=[0,100], step=1), axis=alt.Axis(grid=False)),
    y=alt.Y('Identities:Q', bin=alt.Bin(extent=[0,100], step=1), axis=alt.Axis(grid=False)),
    color=alt.Color('count()', scale=alt.Scale(scheme='bluepurple', domain=[-1,200]))

).properties(width=800, height=450)


#precomputed histogram for faster render
#doesn't work properly

#this variant bins 'Prob' against 'Similarity', not 'Identities'
xvalue='Prob'
yvalue='Similarity'

xbins = 100
ybins = 100
xrange = [subset[xvalue].min(), subset[xvalue].max()]
yrange = [subset[yvalue].min(), subset[yvalue].max()]

#bin edges: 100 evenly spaced, rounded values per axis (i.e. 99 bins each)
hist_data = np.histogram2d(subset[xvalue], subset[yvalue], 
                           bins=(np.linspace(xrange[0], xrange[1], xbins).round(2), 
                                 np.linspace(yrange[0], yrange[1], ybins).round(2)), density=True)
#flatten the 2D histogram into long format: one row per (x edge, y edge) bin
x = []
y = []
v = []
for i, m in enumerate(hist_data[1][:-1]):
    for j, n in enumerate(hist_data[2][:-1]):
        x.append(m)
        y.append(n)
        v.append(hist_data[0][i][j])
        
hist_data_plot = pd.DataFrame({'xbins': x, 'ybins': y, 'density': v})


#bin edges are encoded as ordinal categories; the colour domain is clipped
#to [5*min density, max density/10]
alt.Chart(hist_data_plot).mark_rect().encode(
    x=alt.X('xbins:O', axis=alt.Axis(grid=False)),
    y=alt.Y('ybins:O', axis=alt.Axis(grid=False)),
    color = alt.Color('density',  scale=alt.Scale(scheme='bluepurple', domain=[min(v)*5, max(v)/10]))
).properties(width=800, height=450)

------ REFORMATTING FOR CLUSTERING --------

In [None]:
#dirty check for label consistency of queries and blasttab formatting for mmseqs

#NOTE: this re-binds the notebook-wide `root` to a relative folder; cells that
#build paths as root+'analysis/...' will resolve differently after this runs
root = 'clust/euk72-profiles/profile-profile-hhsearch/'
#column names of the table, kept for reference
#header = ['Query','Target','Prob','E-value','P-value','Score','SS','Cols','Identities','Similarity','Sum_probs','Query-HMM-start','Query-HMM-end','Template-HMM-start','Template-HMM-end','Template_columns','Template_Neff']
data = pd.read_csv(root+'merged_data_filtered_c80.tsv', sep = '\t')


In [None]:
#`lookup` first holds the file path and is then replaced by the parsed table
lookup = '/data/tobiassonva/data/eukgen/euk72/euk72.lookup'
#three unnamed columns are read as 'name', 'index', 'none'; the second one
#('index') becomes the dataframe index
lookup = pd.read_csv(lookup, sep='\t', header=None, index_col=1, names=['name', 'index', 'none'])

In [None]:
#split the labels to get accessions
#(first whitespace-separated token of every Query / Target label)
parse_query = [entry.split()[0] for entry in data.Query]
parse_target = [entry.split()[0] for entry in data.Target]


#find label discrepancies
#note: re-binds the notebook-level names `queries` and `entries`
queries = parse_query
entries = lookup.index

#`clusters` first holds the file path, then the 'clust' column of the parsed file
clusters = '/data/tobiassonva/data/eukgen/clust/euk72-profiles/euk72_filtered-casc-clust-6-merged.tsv'
clusters = pd.read_csv(clusters, sep ='\t', header=None, names=['clust', 'mem'])['clust']

#a: parsed query accessions that are not cluster labels
#b: cluster labels that never occur as a parsed query accession
a = set(queries).difference(set(clusters))
b = set(clusters).difference(set(queries))

print(a)
print(b)

In [None]:
#inconsistent labels from mmseqs internal parsing
#exact-match replacements applied to both the query and the target accessions
label_fixes = {
    'pir||A44923': 'A44923',
    'prf||1111187A': '1111187A',
    'prf||1111187C': '1111187C',
}

query = pd.Series(parse_query).replace(label_fixes)

target = pd.Series(parse_target).replace(label_fixes)

In [None]:
#translate each accession to the 'name' column of the lookup table, one .loc call per entry
query_index = [lookup.loc[entry, 'name'] for entry in query]
target_index = [lookup.loc[entry, 'name'] for entry in target]

In [None]:
#same translation as above, but `query` and `target` are rebuilt from the raw
#parse_query / parse_target lists first
#NOTE(review): rebuilding discards any replacements previously applied to
#`query` / `target` - confirm this is intended
query = pd.Series(parse_query)
query_index = [lookup.loc[entry, 'name'] for entry in query]
print('target')
target = pd.Series(parse_target)
target_index = [lookup.loc[entry, 'name'] for entry in target]

In [None]:
#overwrite the labels in `data` with the translated values (in place, so
#re-running the label-parsing cells afterwards sees the translated values)
data.Query = query_index
data.Target = target_index
#keep only hits whose target also occurs as a query
data_queries = data.Query.unique()
data_filtered = data[data.Target.isin(data_queries)]

In [None]:
#format blasttab format mode 8 for mmseqs tsv2result
#columns copied from the search table
add_columns = ['Identities', 'Template_columns',  'Query-HMM-start','Query-HMM-end','Template-HMM-start','Template-HMM-end','E-value', 'Score']
#final column order: query, target, identity, length, mismatches, gap opens,
#query start/end, target start/end, e-value, score
order = ['Query', 'Target', 'Identities', 'Template_columns', 'Mismatch', 'Gap_open', 'Query-HMM-start','Query-HMM-end','Template-HMM-start','Template-HMM-end','E-value', 'Score']

#Mismatch and Gap_open are not available from the search table and are set
#to 0. `order` was previously defined but never applied, which left those two
#columns at the end of the frame instead of in positions 5 and 6.
blasttab_m8 = (
    data_filtered[['Query', 'Target'] + add_columns]
    .assign(Mismatch=0, Gap_open=0)
    [order]
)

blasttab_m8

In [None]:
blasttab_m8.to_csv(root+'merged_data_filtered_c80.tsv.blasttab', sep = '\t', index=None, header=None)

In [17]:
#format from self_search
#two-column, tab separated, headerless file read as (cluster_acc, acc) pairs;
#exact duplicate rows are dropped

import pandas as pd

search_file = '/home/tobiassonva/data/eukgen/processing/euk72_ep3/self_search/cluster/self_search.acc'
hh_search = pd.read_csv(search_file, sep='\t', header=None, names=['cluster_acc','acc'])
hh_search = hh_search.drop_duplicates()

In [13]:
#lookup tables of the representative sequences; only the eukaryotic one is loaded
lookup_file_euk = '/home/tobiassonva/data/eukgen/processing/euk72_ep3/euk72_ep.repseq.lookup'
lookup_file_prok = '/home/tobiassonva/data/eukgen/processing/prok2111_as/prok2111_as.repseq.lookup'
#no column names are given, so the second column becomes the index and the
#remaining columns are labelled 0 and 2
lookup = pd.read_csv(lookup_file_euk, sep='\t', header=None, index_col=1)
#prokaryotic alternative, currently disabled
# lookup = pd.read_csv(lookup_file_prok, sep='\t', header=None, index_col=1)

In [4]:
#labels occurring more than once in the lookup index
#count once and reuse the result instead of running value_counts() twice on the full index
lookup_label_counts = lookup.index.value_counts()
lookup_label_counts[lookup_label_counts > 1]

KeyboardInterrupt: 

In [123]:
#NOTE(review): this cell reads hh_search.Target, while the cell that loads
#self_search.acc names the columns 'cluster_acc' and 'acc' - it looks like it
#was written for an earlier Query/Target table; confirm before re-running

#strip header to match accession
# hh_search.Query = [n.split(' ')[0] for n in hh_search.Query]
# hh_search.Target = [n.split(' ')[0] for n in hh_search.Target]

#replace poorly parsed accessions 
#hh_search.replace({'pir||A44923': 'A44923', 'prf||1111187C': '1111187C'}, inplace=True)

#drop all extra rows
#hh_search = hh_search.iloc[:, :2]

#write the full table and the unique Target values, tab separated, without header or index
hh_search.to_csv('/home/tobiassonva/data/eukgen/processing/euk72_ep2/self_search/euk72_ep.hits.tsv', sep='\t', header=None, index=None)
pd.Series(hh_search.Target.unique()).to_csv('/home/tobiassonva/data/eukgen/processing/euk72_ep2/self_search/euk72_ep.unique_accs.tsv', sep='\t', header=None, index=None)

In [11]:
lookup

Unnamed: 0_level_0,0,2
1,Unnamed: 1_level_1,Unnamed: 2_level_1
NP_904080.1,0,0
NP_178089.1,1,0
NP_001322245.1,2,0
XP_006677078.1,3,0
NP_506559.1,4,0
...,...,...
EP01083P185297,30375741,0
EP01083P218065,30375742,0
EP01083P250833,30375743,0
EP01083P283601,30375744,0


In [18]:
%%time
#translate to mmseqs index

hh_search_i = pd.DataFrame()

hh_search_i['cluster_acc_i'] = lookup.loc[hh_search['cluster_acc']][0].values
hh_search_i['acc_i'] = lookup.loc[hh_search['acc']][0].values


CPU times: user 4.98 s, sys: 7.13 ms, total: 4.99 s
Wall time: 5 s


In [19]:
# Number of distinct translated cluster indices.
hh_search_i['cluster_acc_i'].unique().shape

(1542971,)

In [20]:
# Number of distinct translated member indices.
hh_search_i['acc_i'].unique().shape

(1542971,)

In [21]:
# One distinct cluster index per line, written alongside the search file.
unique_cluster_ids = pd.Series(hh_search_i.cluster_acc_i.unique())
unique_cluster_ids.to_csv(search_file+'.i.unique', sep='\t', header=None, index=None)

In [22]:
# Translated pair table, headerless and without the index column.
hh_search_i.to_csv(
    search_file+'.i',
    sep='\t',
    header=None,
    index=None,
)

In [44]:
# Re-check of the number of distinct translated cluster indices.
hh_search_i['cluster_acc_i'].unique().shape

(1541367,)

In [43]:
# Distinct cluster indices, one per line (default separator, no header/index).
query_unique = pd.Series(hh_search_i.cluster_acc_i.unique())
query_unique.to_csv(search_file+'.query_unique', header=None, index=None)

In [124]:
# Translate Query/Target accessions to the values in column 0 of `lookup`,
# keeping hh_search's row order.
hh_search_index = pd.DataFrame({
    'Query': lookup.loc[hh_search.Query, 0].values,
    'Target': lookup.loc[hh_search.Target, 0].values,
})

# Translated hit table, headerless and without the index column.
hh_search_index.to_csv(
    '/home/tobiassonva/data/eukgen/processing/euk72_ep2/self_search/euk72_ep.hits_i.tsv',
    sep='\t',
    header=None,
    index=None,
)

# One distinct translated Target per line.
unique_target_ids = pd.Series(hh_search_index.Target.unique())
unique_target_ids.to_csv(
    '/home/tobiassonva/data/eukgen/processing/euk72_ep2/self_search/euk72_ep.unique_accs_i.tsv',
    sep='\t',
    header=None,
    index=None,
)

In [125]:
# Occurrence count of every label in the lookup index.
lookup.index.value_counts()

1
NP_904080.1       1
XP_024309639.1    1
XP_001448410.1    1
XP_026411480.1    1
XP_026458534.1    1
                 ..
XP_008648114.1    1
XP_019076432.1    1
XP_002274411.1    1
XP_001322998.1    1
XP_005770914.1    1
Name: count, Length: 1814550, dtype: int64

In [120]:
# Rows whose translated Query index equals 11041657.
# The Query column holds integers (int64), so the probe must be an int as well:
# comparing against the string '11041657' is False for every row and always
# produces an empty frame.
hh_search_index[hh_search_index.Query == 11041657]

Unnamed: 0,Query,Target


In [49]:
# Load a tab-separated file into `a`; with read_csv defaults the first row is
# used as the column header.
a = pd.read_csv('/data/tobiassonva/data/eukgen/processing/euk72_ep3/self_search/tmp_head', sep='\t')

In [67]:
# Summary statistics of E-value for rows with Pairwise_cov > 0.5 and Prob > 80.
confident_hits = (a.Pairwise_cov > 0.5) & (a.Prob > 80)
a.loc[confident_hits, 'E-value'].describe()


count     8.950320e+05
mean      4.641312e-03
std       4.574427e-02
min       0.000000e+00
25%      4.000000e-169
50%       2.000000e-85
75%       2.600000e-25
max       1.100000e+00
Name: E-value, dtype: float64

In [65]:
a[


(553445,)

In [None]:
# Select the column labelled 9 from hh_search.
# NOTE(review): every hh_search frame shown in this notebook has only two named
# columns, so this presumably targets an earlier, wider version of the table —
# confirm the cell is still needed.
hh_search[9]

In [114]:
# Inspect the single row labelled 181943.
hh_search_index.loc[181943]

Query     2938585
Target    2938585
Name: 181943, dtype: int64

In [112]:
# Translated row(s) belonging to the query accession '1111187C'.
# hh_search_index is built from bare value arrays, so its rows correspond to
# hh_search by position and it carries a fresh 0..n-1 index. hh_search itself may
# have gaps in its index (e.g. after drop_duplicates), so a label-aligned boolean
# Series could select the wrong rows or fail to align. Converting the mask to an
# array makes the selection purely positional.
query_mask = (hh_search.Query == '1111187C').to_numpy()
hh_search_index[query_mask]

Unnamed: 0,Query,Target
181934,18933872,18933872


In [60]:
%%time

hh_search_index = pd.DataFrame(columns=['Query', 'Target'])


hh_search_index['Query'] = lookup.loc[hh_search.Query.values[0:10000]][0].values
hh_search_index['Target'] = lookup.loc[hh_search.Target.values[0:10000]][0].values

hh_search_index

ValueError: Length of values (10039) does not match length of index (10030)

In [53]:
# Display the hit table.
hh_search

Unnamed: 0,Query,Target
0,XP_005535545.1,XP_005535545.1
1,CBJ26689.1,CBJ26689.1
2,CBJ26982.1,CBJ26982.1
3,CBJ26982.1,CBJ26982.1
4,CBJ26982.1,CBJ26982.1
...,...,...
244959,CEM09677.1,CEM09677.1
244960,CEM07901.1,CEM07901.1
244961,CEM11326.1,CEM11326.1
244962,CEM14633.1,CEM14633.1


In [33]:
# For every accession listed in hh_clust.acc, fetch the matching cluster_acc
# from casc_clust (looked up by its 'acc' column).
casc_by_acc = casc_clust.set_index('acc')
casc_clusters = casc_by_acc.loc[hh_clust.acc].cluster_acc
casc_clusters

acc
GBG58979.1            GBG58979.1
NP_509080.2          NP_509080.2
XP_001311970.1    XP_001311970.1
XP_002838044.1    XP_002838044.1
CEL96693.1            CEL96693.1
                       ...      
XP_009053570.1    XP_009053570.1
OLP74334.1            OLP74334.1
OAE22863.1            OAE22863.1
XP_004346567.1    XP_004346567.1
PTQ45874.1            PTQ45874.1
Name: cluster_acc, Length: 192002, dtype: object

In [35]:
# All casc_clust rows for the selected cluster accessions, looked up by 'cluster_acc'.
casc_by_cluster = casc_clust.set_index('cluster_acc')
casc_by_cluster.loc[casc_clusters]

Unnamed: 0_level_0,acc
cluster_acc,Unnamed: 1_level_1
GBG58979.1,GBG58979.1
NP_509080.2,NP_509080.2
XP_001311970.1,XP_001311970.1
XP_002838044.1,XP_002838044.1
CEL96693.1,CEL96693.1
...,...
OLP74334.1,OLP74334.1
OAE22863.1,OAE22863.1
XP_004346567.1,XP_004346567.1
PTQ45874.1,PTQ45874.1


In [25]:
# Display the selected accessions.
casc_clusters

Index(['GBG58979.1', 'NP_509080.2', 'XP_001311970.1', 'XP_002838044.1',
       'CEL96693.1', 'EP00615P012161', 'EP00615P004769', 'EP00615P003329',
       'EP00615P017473', 'EP00615P007201',
       ...
       'XP_008897273.1', 'XP_020406395.1', 'XP_001745180.1', 'NP_012105.1',
       'GBG59653.1', 'XP_009053570.1', 'OLP74334.1', 'OAE22863.1',
       'XP_004346567.1', 'PTQ45874.1'],
      dtype='object', name='acc', length=192002)

In [27]:
# Label-based lookup of the selected accessions directly on casc_clust.
# This raises KeyError because none of the accessions are labels of casc_clust's
# current index.
# NOTE(review): the frame presumably needs set_index('acc') or
# set_index('cluster_acc') before .loc — confirm which column is intended.
casc_clust.loc[casc_clusters.values]

KeyError: "None of [Index(['GBG58979.1', 'NP_509080.2', 'XP_001311970.1', 'XP_002838044.1',\n       'CEL96693.1', 'EP00615P012161', 'EP00615P004769', 'EP00615P003329',\n       'EP00615P017473', 'EP00615P007201',\n       ...\n       'XP_008897273.1', 'XP_020406395.1', 'XP_001745180.1', 'NP_012105.1',\n       'GBG59653.1', 'XP_009053570.1', 'OLP74334.1', 'OAE22863.1',\n       'XP_004346567.1', 'PTQ45874.1'],\n      dtype='object', length=192002)] are in the [index]"

In [None]:
# NOTE: this rebinds the notebook-wide `root` to the euk72_ep processing directory.
root = '/home/tobiassonva/data/eukgen/processing/euk72_ep/'

# Raw cluster file split on newlines; any NUL bytes stay inside the strings.
with open(root+'euk72_ep.test_clust', 'r') as infile:
    clusters = infile.read().split('\n')

# Both lookup files are headerless TSVs read with the same column names.
lookup_columns = ['a', 'b', 'c']
cluster_seqs_lookup = pd.read_csv(
    root+'euk72_ep.test_seqs.lookup', sep='\t', header=None, names=lookup_columns
).set_index('a')
repseq_seqs_lookup = pd.read_csv(
    root+'euk72_ep.repseq.lookup', sep='\t', header=None, names=lookup_columns
)

# Two views of the repseq lookup: one keyed by column 'a', one keyed by column 'b'.
repseq_seqs_lookup_reverse = repseq_seqs_lookup.set_index('a')
repseq_seqs_lookup = repseq_seqs_lookup.set_index('b')

cluster_seqs_lookup

In [None]:
# Display the repseq lookup keyed by column 'b'.
repseq_seqs_lookup

In [None]:
# Display the repseq lookup keyed by column 'a'.
repseq_seqs_lookup_reverse

In [None]:
# Integer ids from the cluster file: leading/trailing NUL bytes are removed from
# each line, and lines that are empty after stripping are dropped.
nul_stripped = (line.strip('\x00') for line in clusters)
idlines = [int(token) for token in nul_stripped if token != '']

In [None]:
# Display the cluster sequence lookup keyed by column 'a'.
cluster_seqs_lookup

In [None]:
%%time
cluster_seqs_lookup.loc[idlines].b
repseq_accs = repseq_seqs_lookup.loc[cluster_seqs_lookup.loc[idlines].b]
repseq_ids = repseq_accs[~repseq_accs.index.duplicated(keep='first')].a

In [None]:
# One flag per id-bearing line of the cluster file: False when the line starts
# with a NUL byte, True otherwise.
# Lines that are empty once NUL bytes are stripped carry no id, so they are
# skipped here with the same filter used to build `idlines`. This keeps the flags
# in step with the ids, and startswith() is safe on empty strings, whereas
# indexing the first character raises IndexError on them.
indexes = [
    not line.startswith('\x00')
    for line in clusters
    if line.strip('\x00') != ''
]

# Write the translated ids, restoring the leading NUL byte where the source line had one.
# NOTE(review): repseq_ids is de-duplicated upstream, so it can be shorter than
# `indexes`; confirm that positional pairing is still what is wanted.
with open(root+'euk72_ep.test_clust_edit', 'w') as out:
    for is_plain, n in zip(indexes, repseq_ids):
        if is_plain:
            out.write(str(n)+'\n')
        else:
            out.write('\x00'+str(n)+'\n')

In [None]:
# Display the per-line NUL-prefix flags.
indexes