# Run and analyse antismash results

In [1]:
# import libraries
import os 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
import numpy as np
from collections import OrderedDict
import networkx as nx
import scipy
import math
import collections
import pickle
import plotly.plotly as py
import shutil
import cufflinks as cf

In [2]:
# import directories
# Directory that holds the project-wide output tables (CSV / pickle files).
df_dir = '/home/omkar/Projects/panGenome/data/entero_project/general/'
# NOTE(review): `data_dir` is not assigned anywhere in the visible notebook, and
# `fix_accession_ids` is only defined in a later cell, so running the cells from
# top to bottom fails on this call (see the NameError recorded below) — confirm
# the intended genome root directory and the intended cell order.
fix_accession_ids(data_dir)

NameError: name 'fix_accession_ids' is not defined

In [7]:
def fix_accession_ids(data_dir):
    """Write antiSMASH-ready copies of every genome FASTA with 'accn|' record IDs.

    For each entry ``<pat_id>`` in ``data_dir`` that contains ``<pat_id>.fna``,
    a sibling file ``<pat_id>_antismash.fasta`` is written in which every
    record ID is prefixed with ``accn|`` so that the IDs match the ones used
    in the GFF file. The ``<pat_id>_antismash.fasta`` and
    ``<pat_id>.PATRIC.gff`` files can then be used together in an antiSMASH run.

    Entries without a ``<pat_id>/<pat_id>.fna`` file (stray files, incomplete
    genome folders) are skipped instead of aborting the whole run.

    Parameters
    ----------
    data_dir : str
        Directory whose entries are named after genome (PATRIC) IDs.
    """
    for pat_id in os.listdir(data_dir):
        genome_dir = os.path.join(data_dir, pat_id)
        fasta_file = os.path.join(genome_dir, pat_id + '.fna')
        if not os.path.isfile(fasta_file):
            # Covers both plain files in data_dir and genome folders without a FASTA
            continue

        anti_fasta_file = os.path.join(genome_dir, pat_id + '_antismash.fasta')
        with open(fasta_file) as original, open(anti_fasta_file, 'w') as corrected:
            for record in SeqIO.parse(original, 'fasta'):
                record.id = 'accn|' + record.id
                SeqIO.write(record, corrected, 'fasta')

python run_antismash.py *_antismash.fasta --output-dir as5/ --genefinding-gff3 *.PATRIC.gff -d

# Process antismash output results

In [None]:
# Move each genome's antiSMASH output folder into its genome directory and name it 'as4'
file_list = [entry for entry in os.listdir(data_dir) if 'part' not in entry]

import shutil
as_output_root = '/home/omkar/Projects/panGenome/data/newer_genomes/genomes/as_outpout/as4/'
for pat_id in file_list:
    run_dir_name = pat_id + '_antismash'
    src = os.path.join(as_output_root, run_dir_name)
    if not os.path.isdir(src):
        # No antiSMASH output for this genome
        continue
    dest = os.path.join(data_dir, pat_id)
    # The rename below expects the moved folder at dest/<pat_id>_antismash, i.e. it
    # presumably relies on dest already existing so that shutil.move nests src inside it.
    shutil.move(src, dest)
    os.rename(os.path.join(dest, run_dir_name), os.path.join(dest, 'as4'))

In [None]:
def fix_antismash_filenames(data_dir):
    """Rename antiSMASH GenBank outputs so that they start with the genome ID.

    For every entry ``<pat_id>`` in ``data_dir`` that has an ``as4``
    sub-directory and no ``<pat_id>.final.gbk`` yet, the files in ``as4``
    whose name contains ``gbk`` are renamed:

    * names containing ``final``   -> ``<pat_id>`` + last 10 characters
      (the length of ``.final.gbk``)
    * names containing ``cluster`` -> ``<pat_id>`` + last 15 characters
      (the length of e.g. ``.cluster001.gbk``)

    Entries without an ``as4`` directory are skipped.

    Parameters
    ----------
    data_dir : str
        Directory whose entries are named after genome (PATRIC) IDs.
    """
    final_suffix_len = len('.final.gbk')
    cluster_suffix_len = len('.cluster001.gbk')

    for pat_id in os.listdir(data_dir):
        as_dir = os.path.join(data_dir, pat_id, 'as4')
        if not os.path.isdir(as_dir):
            continue
        if os.path.isfile(os.path.join(as_dir, pat_id + '.final.gbk')):
            # Already renamed in an earlier run
            continue

        for gbk in [name for name in os.listdir(as_dir) if 'gbk' in name]:
            if 'final' in gbk:
                new_name = pat_id + gbk[-final_suffix_len:]
            elif 'cluster' in gbk:
                new_name = pat_id + gbk[-cluster_suffix_len:]
            else:
                continue
            os.rename(os.path.join(as_dir, gbk), os.path.join(as_dir, new_name))

In [None]:
# Rename extra files
# Give the remaining 'accn...' output files the genome ID as their name prefix
for pat_id in file_list:
    as_dir = os.path.join(data_dir, pat_id, 'as4')
    if os.path.isfile(os.path.join(as_dir, pat_id + '.final.embl')):
        # The renamed EMBL file is already there, nothing to do for this genome
        continue
    for old_name in [name for name in os.listdir(as_dir) if 'accn' in name]:
        # Everything before the first '.' is swapped for the genome ID
        old_prefix = old_name.split('.')[0]
        new_name = old_name.replace(old_prefix, pat_id)
        os.rename(os.path.join(as_dir, old_name), os.path.join(as_dir, new_name))


# Analyze antismash output results

In [None]:
# Get genus, organism, patric ids
# Lookup tables keyed by the index of df_entero (used as the PATRIC genome ID below)
patric_ids = df_entero.index.tolist()

pat_genus_dict = dict(zip(patric_ids, df_entero.Genus.tolist()))
pat_genome_dict = dict(zip(patric_ids, df_entero['Genome Name'].tolist()))
pat_plasmids_dict = dict(zip(patric_ids, df_entero.Plasmids.tolist()))
pat_genome_len_dict = dict(zip(patric_ids, df_entero['Genome Length'].tolist()))

# Accumulator lists, one entry per cluster, filled by the cell that scans the
# antiSMASH GenBank files. They have to exist before that cell runs; re-running
# this cell resets them.
genus_list = []
genus_cat_list = []
genome_list = []
genome_len_list = []
pat_list = []
no_plasmids_list = []
no_records_list = []
no_clusters_per_genome_list = []
no_clusters_per_record_list = []
record_accession_list = []
record_len_list = []
record_desc_list = []
cluster_list = []
start_pos_list = []
end_pos_list = []
product_list = []
product_cat_list = []
contig_edge_list = []



In [None]:
# Generate lists for generating dataframe
# For every genome, read its antiSMASH '<patric_id>.final.gbk' and append one entry per
# 'cluster' feature whose contig_edge qualifier is 'False' to the accumulator lists.
# NOTE(review): abundant_genera and abundant_products are not defined in the visible
# cells — they must exist before this cell runs.
for cnt, pat_id in enumerate(pat_genus_dict, start=1):
    gbk_path = os.path.join(data_dir, 'genomes', pat_id, 'as4', pat_id + '.final.gbk')

    # Parse the file once; the records serve both the record count and the feature scan.
    records = list(SeqIO.parse(gbk_path, "genbank"))
    no_records = len(records)

    # Genome-level values are identical for every cluster of this genome.
    genus_name = pat_genus_dict[pat_id]
    genus_cat = genus_name if genus_name in abundant_genera else 'Other'

    no_of_clust_genome = 0
    for record in records:
        no_of_clust_rec = 0
        record_len = len(record.seq)
        # Text between the first '[' and the following '|' of the record description
        record_desc = record.description.split('[')[1].split('|')[0].strip()

        for feature in record.features:
            if feature.type != 'cluster':
                continue
            contig_edge = feature.qualifiers['contig_edge'][0]
            if contig_edge != 'False':
                # Only clusters flagged contig_edge == 'False' are collected
                continue

            no_of_clust_rec += 1
            no_of_clust_genome += 1
            product = feature.qualifiers['product'][0]
            # The cluster number is the text after the first ':' of the first note
            cluster_id = feature.qualifiers['note'][0].split(':')[1].strip()

            genus_list.append(genus_name)
            genus_cat_list.append(genus_cat)
            genome_list.append(pat_genome_dict[pat_id])
            genome_len_list.append(pat_genome_len_dict[pat_id])
            pat_list.append(pat_id)
            product_list.append(product)
            product_cat_list.append(product if product in abundant_products else 'Other')
            contig_edge_list.append(contig_edge)
            # int() yields the same integer as the legacy `.position` accessor
            start_pos_list.append(int(feature.location.start))
            end_pos_list.append(int(feature.location.end))
            cluster_list.append(pat_id + '.cluster0' + "{:02d}".format(int(cluster_id)))
            no_records_list.append(no_records)
            record_accession_list.append(record.id)
            record_desc_list.append(record_desc)
            record_len_list.append(record_len)
            no_plasmids_list.append(pat_plasmids_dict[pat_id])

        # Every collected cluster of this record carries the record's cluster total
        no_clusters_per_record_list.extend([no_of_clust_rec] * no_of_clust_rec)

    # Every collected cluster of this genome carries the genome's cluster total
    no_clusters_per_genome_list.extend([no_of_clust_genome] * no_of_clust_genome)

    # Progress indicator: number of genomes processed so far
    print(cnt)

In [None]:
# Index levels of df_clusters, in order, each paired with the list that fills it
index_levels = [
    ('genus', genus_list),
    ('genus_cat', genus_cat_list),
    ('genome_name', genome_list),
    ('genome_length', genome_len_list),
    ('patric_id', pat_list),
    ('no_of_clusters_per_genome', no_clusters_per_genome_list),
    ('no_of_records', no_records_list),
    ('no_of plasmids', no_plasmids_list),
    ('record_accession', record_accession_list),
    ('record_len', record_len_list),
    ('no_of_clusters_per_record', no_clusters_per_record_list),
    ('cluster_id', cluster_list),
]
level_names = [name for name, _ in index_levels]
tuples = list(zip(*[values for _, values in index_levels]))

index = pd.MultiIndex.from_tuples(tuples, names=level_names)

# One row per cluster; the per-cluster attributes become ordinary columns
df_clusters = pd.DataFrame(index=index)
for column, values in (
    ('product', product_list),
    ('product_cat', product_cat_list),
    ('contig_edge', contig_edge_list),
    ('start_pos', start_pos_list),
    ('end_pos', end_pos_list),
):
    df_clusters[column] = values

In [None]:
# Flag hybrid clusters: a product name containing '-' is marked 'Yes'
df_clusters['hybrid_type'] = ['Yes' if '-' in product_name else 'No'
                              for product_name in df_clusters['product'].tolist()]

# Count clusters per product grouping and store the table as CSV and pickle.
# NOTE(review): 'bigscape_class', 'plasmid_annotation' and 'sequence_type' are not created
# in the visible cells — presumably df_clusters was extended elsewhere; confirm.
group_keys = ['bigscape_class','hybrid_type','product_cat', 'product']
unused_columns = ['start_pos', 'end_pos', 'plasmid_annotation', 'sequence_type']
df_products_count = df_clusters.groupby(group_keys).count().drop(unused_columns, axis=1)
df_products_count.to_csv('/home/omkar/Projects/panGenome/data/entero_project/general/df_product_counts.csv')
df_products_count.to_pickle('/home/omkar/Projects/panGenome/data/entero_project/general/df_product_counts.p')

In [None]:
# Remove clusters with contig edge
# Rows whose contig_edge flag is 'True' are the clusters to move out of the genome folders
edge_clusters = df_clusters_tmp[df_clusters_tmp.contig_edge == 'True']
edge_cluster_ids = edge_clusters.index.get_level_values(11).tolist()

genomes_root = '/home/omkar/Projects/panGenome/data/entero_project/genomes/'
filtered_root = '/home/omkar/Projects/panGenome/data/entero_project/filtered_contig_edge/'

for clust in edge_cluster_ids:
    # clust[:-11] drops the last 11 characters of the cluster ID to get the genome folder name
    gbk_from = os.path.join(genomes_root, clust[:-11], 'as4', clust + '.gbk')
    gbk_to = os.path.join(filtered_root, clust + '.gbk')
    shutil.move(gbk_from, gbk_to)

In [None]:
idx = pd.IndexSlice

# Genome IDs (one entry per cluster row) where the per-genome cluster total differs
# from the per-record cluster total
pat_tgt_list = [
    pat_list[position]
    for position, per_genome in enumerate(no_clusters_per_genome_list)
    if per_genome != no_clusters_per_record_list[position]
]

# Keep only the rows of those genomes (index level 4) and store them
df_cluster_records = df_clusters.loc[idx[:,:,:,:,pat_tgt_list,:,:,:,:,:,:,:],:]

records_out = '/home/omkar/Projects/panGenome/data/entero_project/general/df_clusters_records'
df_cluster_records.to_pickle(records_out + '.p')
df_cluster_records.to_csv(records_out + '.csv')

In [None]:
# Store the cluster table both as pickle and as CSV
clusters_out = '/home/omkar/Projects/panGenome/data/entero_project/general/df_clusters'
df_clusters.to_pickle(clusters_out + '.p')
df_clusters.to_csv(clusters_out + '.csv')

## Figure 1

In [None]:
# Cluster count vs genome property scatter plot with marginal histograms
df_genome_data = df_entero
# df_genome_data = df_genome_data.set_index('Strain_ID')

# Column shown on the x-axis of the scatter plot. The bottom histogram, the title and the
# x-label all follow this choice, so the marginal always describes the plotted variable.
# Set it to 'Genome Length' (in Mb after the rescaling below) for the genome-size version.
x_col = 'GC Content'
x_axis_labels = {'GC Content': 'GC Content(%)', 'Genome Length': 'Genome Length (Mb)'}

scat_data = df_genome_data[['Genome Length', 'no_of_clusters', 'GC Content']].astype(float)
scat_data['Genome Length'] = round(scat_data['Genome Length']/1000000,1)
scat_data['GC Content'] = round(scat_data['GC Content'],1)
scat_data['genus_cat'] = df_genome_data.genus_cat
df = scat_data

# Number of genomes at each (x value, cluster count, genus category); drives the marker size
df_counts = df.groupby([x_col, 'no_of_clusters','genus_cat']).size().reset_index(name='counts')

# NOTE(review): my_palette is not created in this cell — it has to exist already.
cnt_colors = df_counts.genus_cat.map(my_palette)

# Create Fig and gridspec
fig = plt.figure(figsize=(16, 10), dpi= 80)
grid = plt.GridSpec(4, 4, hspace=0.6, wspace=0.2)

# Define the axes: main scatter, histogram to its right, histogram below it
ax_main = fig.add_subplot(grid[:-1, :-1])
ax_right = fig.add_subplot(grid[:-1, -1], yticklabels=[])
ax_bottom = fig.add_subplot(grid[-1, 0:-1], xticklabels=[])

# Scatterplot on main ax
ax_main.scatter(x_col, 'no_of_clusters', data=df_counts, s=df_counts.counts*33, c=cnt_colors, alpha = 0.55,
               edgecolors='gray', linewidths=.5)
# sns.stripplot('Genome Length', 'no_of_clusters', data=df, c=row_colors, jitter=0.25, ax=ax_main)

# Histogram at the bottom: distribution of the x-axis variable, drawn with an inverted y-axis
ax_bottom.hist(df[x_col], 40, histtype='stepfilled', orientation='vertical', color='deeppink')
ax_bottom.invert_yaxis()

# Histogram on the right: distribution of cluster counts, using max(no_of_clusters) + 1 bins
ax_right.hist(df['no_of_clusters'], int(max(df.no_of_clusters) + 1), histtype='stepfilled', orientation='horizontal', color='deeppink')

# Decorations
ax_main.set(title='No. of clusters vs ' + x_col,
            xlabel=x_axis_labels[x_col], ylabel='No of clusters per genome')
ax_main.title.set_fontsize(24)

ax_bottom.set(ylabel='No of genomes')
ax_right.set(xlabel='No of genomes')

# Annotate
from matplotlib.patches import Ellipse
# el = Ellipse((5.4, 21), 1.2, 6, angle=0,facecolor='g',alpha=0.2)

# ax_main.add_artist(el)
# el.set_clip_box(ax_main.bbox)
# ax_main.annotate('Photorhabdus()',
#             xy=(5.8, 22),      # theta, radius
#             xytext=(6.5, 22),   # theta, radius
#             arrowprops=dict(facecolor='black', shrink=0.04),
#             horizontalalignment='left',
#             verticalalignment='bottom', fontsize=16,
#             clip_on=True)  # clip to the axes bounding box

# Same font size for the axis labels and tick labels of all three axes
for axis in (ax_main, ax_right, ax_bottom):
    for item in ([axis.xaxis.label, axis.yaxis.label] + axis.get_xticklabels() + axis.get_yticklabels()):
        item.set_fontsize(18)
plt.show()

In [None]:
# Store the data behind the scatter plot: per-genome values and per-point genome counts
general_dir = '/home/omkar/Projects/panGenome/data/entero_project/general/'
scat_data.to_csv(general_dir + 'scat_data_cluster_len.csv')
df_counts.to_csv(general_dir + 'scat_data_counts_cluster_len.csv')

## Supp Figure 

In [None]:
# Generate figure for genera distribution and number of clusters per genome
idx = pd.IndexSlice

# One counter column per abundant product plus a catch-all 'Other' column
prod_columns = [item for item in abundant_products]
prod_columns.append('Other')
for col in prod_columns:
    df_genome_data.loc[:,col] = 0

# Genome IDs that have at least one row in df_clusters (index level 4). Built once so the
# loop below does not re-list the whole index level for every genome.
genomes_with_clusters = set(df_clusters.index.get_level_values(4))

for pat_id in df_genome_data.index:
    genus_name = pat_genus_dict[pat_id]
    df_genome_data.loc[pat_id, 'genus_cat'] = genus_name if genus_name in abundant_genera else 'Other'

    if pat_id in genomes_with_clusters:
        # All cluster rows of this genome, selected once and reused
        genome_clusters = df_clusters.loc[idx[:,:,:,:,pat_id,:,:,:,:,:,:,:],:]
        # Index level 5 holds the genome's cluster total; it is read from the first row
        df_genome_data.loc[pat_id, 'no_of_clusters'] = genome_clusters.index.get_level_values(5).tolist()[0]
        for product_cat in genome_clusters['product_cat'].tolist():
            df_genome_data.loc[pat_id, product_cat] = df_genome_data.loc[pat_id, product_cat] + 1
    else:
        df_genome_data.loc[pat_id, 'no_of_clusters'] = 0

# Reindexing
# NOTE(review): extracted_columns is not defined in the visible cells — confirm where it comes from.
df_genome_data = df_genome_data.reindex(columns= extracted_columns + ['genus_cat', 'no_of_clusters'] + prod_columns)
df_genome_data.sort_values('genus_cat', inplace=True)

# Everything from the 7th column onwards is used as the genome x product matrix
genome_product_map = df_genome_data.iloc[:,6:]

# Colors according to genera: one tab20 color per genus category, mapped onto every genome row
genus_categories = df_genome_data.genus_cat.unique()
my_palette = dict(zip(genus_categories, sns.color_palette("tab20", len(genus_categories))))
row_colors = df_genome_data.genus_cat.map(my_palette)

reorder_index = []
genus_list = list(abundant_genera)
genus_list.append('Other')

# Cluster the genomes within each genus block and collect the resulting row order
for genus in genus_list:
    genus_block_ids = df_genome_data[df_genome_data.genus_cat == genus].index
    df_genus_block = genome_product_map.loc[genus_block_ids,:]
    r_block = sns.clustermap(df_genus_block, cmap=sns.color_palette("CMRmap_r",10), col_cluster=False, row_cluster=True, row_colors=row_colors,
              linewidths=0, yticklabels=False)
    reorder_index.extend(r_block.data2d.index.tolist())

In [None]:
# Genomes per genus category, ordered like genus_list
genus_count = df_genome_data.groupby('genus_cat').count()['Genus'].reindex(index=genus_list)
group_names = genus_count.index.tolist()
group_size = genus_count.tolist()

# Clusters per genus category, ordered like genus_list
cluster_count = df_clusters.groupby('genus_cat').count()['product'].reindex(index=genus_list)
cluster_size = cluster_count.tolist()


In [None]:
# Genome x product matrix with the rows put in the order stored in reorder_index
genome_product_map = df_genome_data.iloc[:,6:].reindex(index=reorder_index)
sns.set_context("poster", font_scale=1 )

cmap = sns.color_palette("BuPu",20)

# Rows keep the given order; only the product columns are clustered
g = sns.clustermap(genome_product_map, cmap=cmap, col_cluster=True, row_cluster=False, row_colors=row_colors,
              figsize = (10,20),linewidths=0, yticklabels=False)

# Legend entry per genus: genome count and average clusters per genome.
# The zero-sized bars exist only to carry the legend labels and colors.
for genus in genus_list:
    n_genomes = genus_count.loc[genus]
    avg_clusters = round(cluster_count.loc[genus]/n_genomes,1)
    label = genus + ': #'+ str(n_genomes) + ' (#'+str(avg_clusters) + ')'
    g.ax_col_dendrogram.bar(0, 0, color=my_palette[genus], label=label, linewidth=0)
g.ax_col_dendrogram.legend(loc=(1.1,-2), ncol=1)

In [None]:
# Save or load color schemes
genus_colors_path = '/home/omkar/Projects/panGenome/data/entero_project/general/genus_colors.p'

# Load the stored genus -> color mapping. The context manager closes the file even
# when unpickling fails.
# NOTE: pickle.load can execute arbitrary code from the file — only load files you created yourself.
with open(genus_colors_path, "rb") as pickle_in:
    my_palette = pickle.load(pickle_in)

# Write the mapping back to the same file
with open(genus_colors_path, "wb") as pickle_out:
    pickle.dump(my_palette, pickle_out)

In [None]:
# One tab20 color per genus category, then the color of every genome row
genus_categories = df_genome_data.genus_cat.unique()
tab20_colors = sns.color_palette("tab20", len(genus_categories))
my_palette = dict(zip(genus_categories, tab20_colors))
row_colors = df_genome_data.genus_cat.map(my_palette)

In [None]:
# Store the per-genome cluster type table both as pickle and as CSV
cluster_type_out = '/home/omkar/Projects/panGenome/data/entero_project/general/df_cluster_type_dist_per_genome'
df_genome_data.to_pickle(cluster_type_out + '.p')
df_genome_data.to_csv(cluster_type_out + '.csv')
# df_genome_data = pd.read_pickle('/home/omkar/Projects/panGenome/data/entero_project/general/df_genome_data.p')


## Supp Figure 

In [None]:
# Figure with average cluster distribution
# Rows: genus categories, columns: the columns of genome_product_map,
# values: per-genus column sum divided by the number of genomes, rounded to one decimal
df_genus_avg_cluster_type = pd.DataFrame(index=genus_list, columns=genome_product_map.columns)
for genus in genus_list:
    in_genus = df_genome_data.genus_cat == genus
    genus_block_ids = df_genome_data[in_genus].index
    df_genus_block = genome_product_map.loc[genus_block_ids,:]
    n_genomes = df_genus_block.shape[0]
    product_avg = round(df_genus_block.sum(0)/n_genomes, 1)
    df_genus_avg_cluster_type.loc[genus, :] = product_avg

# The frame was created without data, so convert the filled values to float
df_genus_avg_cluster_type = df_genus_avg_cluster_type[df_genus_avg_cluster_type.columns].astype(float)

sns.set_context("poster", font_scale=1 )
cmap = sns.color_palette("CMRmap_r",50)
# cmap = sns.color_palette("cubehelix_r", 50)
# cmap = sns.color_palette("BuPu",30)

g = sns.clustermap(df_genus_avg_cluster_type, annot=True,cmap=cmap, figsize = (30,30),linewidths=0)
# g = sns.clustermap(df_genus_avg_cluster_type, cmap=cmap, figsize = (10,10),linewidths=0)

df_genus_avg_cluster_type.to_csv('/home/omkar/Projects/panGenome/data/entero_project/general/df_genus_avg_cluster_type.csv')

In [None]:
# Genomes per genus category, ordered like genus_list
genus_count = df_genome_data.groupby('genus_cat').count()['Genus'].reindex(index=genus_list)
group_names = genus_count.index.tolist()
group_size = genus_count.tolist()

sns.set(context='poster', style='white', palette='deep', font='sans-serif', font_scale=1)
# Ring-shaped pie chart of genome counts per genus
fig, ax = plt.subplots()
ax.axis('equal')

# Wedge colors follow the genus palette
group_colors = [my_palette[genus_id] for genus_id in group_names]

# NOTE(review): hard-coded for 16 wedges — presumably has to match len(group_size); confirm.
explode = (0,0,0,0,0,0,0,0.4,0.6,0.8,2,1.3,2.2,1.9,2.2,0)
mypie, _ = ax.pie(group_size, radius=4, labels=list(zip(group_size,group_names,)), labeldistance=1.01,
                  explode=explode, rotatelabels=True, colors=group_colors, textprops={'fontsize': 18 })
plt.setp(mypie, width=4, edgecolor='white')
plt.rcParams.update({'font.size': 18, 'font.weight':15})

In [None]:
### Supp Figure

In [None]:
# Keep genera with more than 5 clusters on average or more than 50 genomes
keep_rows = np.logical_or(genus_vs_clusters.Avg_clusters>5, genus_vs_clusters.Genomes>50)
genus_vs_clusters_reduced = genus_vs_clusters[keep_rows]
# Columns from position 11 onwards are the ones plotted
genus_vs_clusters_reduced_avg = genus_vs_clusters_reduced.iloc[:,11:]

data = genus_vs_clusters_reduced_avg

# Row labels of the form '<genus>(<number of genomes>)'; not applied to the plot in this cell
label_tmp_list = [
    genus + '(' + str(genus_vs_clusters.loc[genus,'Genomes']) + ')'
    for genus in data.index
]

sns.set(font_scale=3.4)

r = sns.clustermap(data, cmap='BuPu',figsize=(40,40))



In [None]:
# Keep genera with more than 5 clusters on average or more than 50 genomes
keep_rows = np.logical_or(genus_vs_clusters.Avg_clusters>5, genus_vs_clusters.Genomes>50)
genus_vs_clusters_reduced = genus_vs_clusters[keep_rows]
# Columns from position 11 onwards are the ones plotted
genus_vs_clusters_reduced_avg = genus_vs_clusters_reduced.iloc[:,11:]

data = genus_vs_clusters_reduced_avg

# Row labels of the form '<genus>(<number of genomes>)'
label_tmp_list = [
    genus + '(' + str(genus_vs_clusters.loc[genus,'Genomes']) + ')'
    for genus in data.index
]

sns.set(font_scale=3.4)
# Use the labels as row names so they show up on the annotated heatmap
data.index = label_tmp_list
r = sns.clustermap(data, cmap='BuPu',annot=True,figsize=(40,40))

