# Calculate average nucleotide identity of *Prymnesium parvum* single copy orthogroups

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
#import subprocess
#import sys
import numpy as np
import matplotlib.patches as patches
import glob
#import os
#from scipy import stats
#import re
#from collections import OrderedDict
import statistics
from Bio import SeqIO
from Bio import AlignIO
import itertools

### Define working directory

In [None]:
# Root of the project tree, relative to this notebook. The output cells
# further down build their paths from `Rootdir`, so it has to be defined here.
Rootdir = '../../..'

# Parameter file listing the orthogroups that were used for the species tree.
paramfile = Rootdir + '/figshare/orthofinder/Species_Tree/Supermatrix_in.param'
# Glob pattern for the per-orthogroup alignment files (OG*aln).
alignment_files = Rootdir + '/figshare/orthofinder/MultipleSequenceAlignments_GUIDANCE/OG*aln'

### Extract orthogroups used to make species tree

In [None]:
# Collect one orthogroup ID per line of the param file. The ID is the second
# ', '-separated field of the line, cut at the first ' = '.
st_orthogroups = []
with open(paramfile) as sources:
    for line in sources:
        # A blank (e.g. trailing) line has no ', ' field and would raise
        # IndexError below, so skip it.
        if not line.strip():
            continue
        OG = line.split(', ')[1].split(' = ')[0]
        st_orthogroups.append(OG)

# Displayed as the cell output: number of orthogroups read.
len(st_orthogroups)

### Extract trimmed codon alignments and calculate pairwise nucleotide identity

In [None]:
# Maps 'strainX-strainY' (the two strain names sorted, joined by '-') to the
# list of pairwise identities, one value per orthogroup alignment.
ident_dict = {}

# st_orthogroups is a list; a set gives constant-time membership tests
# inside the file loop.
species_tree_ogs = set(st_orthogroups)

for alignment in glob.glob(alignment_files):
    # Only the trimmed alignments are used.
    if 'trim' not in alignment:
        continue

    # Orthogroup ID = file name up to its first '.'.
    OG = alignment.split('/')[-1].split('.')[0]
    if OG not in species_tree_ogs:
        continue

    records = list(SeqIO.parse(alignment, "fasta"))

    # Every unordered pair of sequences in the alignment.
    for rec_one, rec_two in itertools.combinations(records, 2):
        seq_one = str(rec_one.seq)
        seq_two = str(rec_two.seq)
        aln_len = len(seq_one)

        # Rows of one alignment must have equal length; comparing only a
        # common prefix would silently give a wrong identity.
        if len(seq_two) != aln_len:
            raise ValueError(
                'Sequences %s and %s in %s differ in length (%d vs %d)'
                % (rec_one.id, rec_two.id, alignment, aln_len, len(seq_two))
            )
        # Identity is undefined for a zero-length alignment; skip the pair
        # instead of dividing by zero.
        if aln_len == 0:
            continue

        # NOTE: every column is compared character by character, so a column
        # where both sequences hold the same gap character counts as a match.
        identities = sum(a == b for a, b in zip(seq_one, seq_two))
        pid = identities / aln_len

        # The strain name is the last '_'-separated field of the record ID.
        strainA = rec_one.id.split('_')[-1]
        strainB = rec_two.id.split('_')[-1]

        # Sort the two names so that A-B and B-A share one key.
        strains = '-'.join(sorted([strainA, strainB]))

        ident_dict.setdefault(strains, []).append(pid)



### Prepare dict for plotting

In [None]:
# Summarise each strain pair as mean and standard deviation of its identities
# and store the result twice, once per orientation (A,B) and (B,A), under
# consecutive 'Comb-N' keys.
count = 0
plot_dict = {}
for strains, identities in ident_dict.items():
    # Keys have the form 'strainA-strainB'.
    strainA, strainB = strains.split('-')[:2]

    mean = statistics.mean(identities)
    # statistics.stdev needs at least two data points; with a single value
    # the spread is undefined, so record NaN instead of raising.
    sd = statistics.stdev(identities) if len(identities) > 1 else float('nan')

    for first, second in ((strainA, strainB), (strainB, strainA)):
        count += 1
        plot_dict['Comb-' + str(count)] = [first, second, mean, sd]

### Make dataframe

In [None]:
# Turn the per-pair summary into a table: one row per 'Comb-N' key, with the
# four list entries spread over the named columns.
column_names = ['StrainA', 'StrainB', '%Ident', 'SD']
df = pd.DataFrame.from_dict(plot_dict, orient='index', columns=column_names)
df

### Identify min and max identities

In [None]:
# Smallest value in the '%Ident' column (shown as the cell output).
df.loc[:, '%Ident'].min()

In [None]:
# Largest value in the '%Ident' column (shown as the cell output).
df.loc[:, '%Ident'].max()

### pivot dataframe

In [None]:
# Row/column order of the strains in the matrix.
order=['12B1','UTEX2797','CCMP3037','12A1','CCMP2941','RCC3703','K0081','K0374','RCC3426','KAC39','K0252','RCC191','RCC1433','UTEX995','RCC1436']

# Reshape the long table into a StrainA x StrainB matrix of '%Ident' values.
# The arguments are passed by keyword because pandas >= 2.0 no longer accepts
# them positionally.
df_heat = df.pivot(index="StrainA", columns="StrainB", values="%Ident")

# Put rows and columns in the order given above; strains in `order` that are
# missing from the data become all-NaN rows/columns.
df_heat = df_heat.reindex(index=order, columns=order)

# Write the matrix as a tab-separated table with header and index.
# NOTE(review): relies on `Rootdir` being defined earlier in the session.
df_heat.to_csv(Rootdir+'/figshare/orthofinder/Comparative_Genomics_Statistics/strain_ident_nuc.txt', sep='\t', header=True, index=True)
df_heat

In [None]:
import seaborn as sns
import matplotlib.colors
from matplotlib.colors import LinearSegmentedColormap

# Nine-step palette from pale yellow (#FFFFE5) to dark brown (#662506).
Colours = [
    '#FFFFE5', '#FFF7BC', '#FEE391',
    '#FEC44F', '#FB9A29', '#EC7014',
    '#CC4C02', '#993404', '#662506',
]

# Fix the colour scale to the 0.95-1 identity range.
norm = matplotlib.colors.Normalize(vmin=.95, vmax=1)

# Continuous colormap interpolated between the palette colours.
cmap = LinearSegmentedColormap.from_list("", Colours)

# Draw the strain-by-strain identity matrix and save it as SVG.
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(15, 7))
    ax = sns.heatmap(df_heat, square=True, cmap=cmap, norm=norm)
    heatmap_path = Rootdir+'/figshare/orthofinder/Comparative_Genomics_Statistics/strain_ident_nuc_heatmap.svg'
    plt.savefig(heatmap_path, dpi=500)