In [None]:
import glob
import os
import statistics
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

# Create yn00 control files

In [None]:
# Glob pattern selecting the `*.trim.aln` alignment files for which yn00
# control files are generated.
alnfiles = (
    '../../../figshare/orthofinder/MultipleSequenceAlignments_GUIDANCE/'
    '*.trim.aln'
)

Example yn00 control file

```
seqfile = file.fa * sequence data file name
outfile = file.ks           * main result file
verbose = 0      * 1: detailed output (list sequences), 0: concise output
icode = 0  * 0:universal code; 1:mammalian mt; 2-10:see below
weighting = 0  * weighting pathways between codons (0/1)?
commonf3x4 = 0  * use one set of codon freqs for all pairs (0/1)?
```

In [None]:
# Write one yn00 control file next to every alignment matched by `alnfiles`.
# The control file names the alignment and the result file without a directory,
# so presumably yn00 is meant to be run from inside the alignment directory
# (TODO confirm against how yn00 is launched).
for alnfile in glob.glob(alnfiles):
    alndir = os.path.dirname(alnfile)
    # Orthogroup ID = file name up to the first dot.
    og = os.path.basename(alnfile).split('.')[0]

    # os.path.join keeps the path relative when alndir is empty, whereas
    # alndir + '/' + ... would point at the filesystem root.
    ynfile = os.path.join(alndir, og + '.yn00.ctl')

    # The context manager closes (and flushes) the file even if a write fails.
    with open(ynfile, 'w') as fo:
        fo.write('seqfile = ' + og + '.trim.aln\n')
        fo.write('outfile = ' + og + '.yn00.ks\n')
        fo.write('verbose = 0\n')
        fo.write('icode = 0\n')
        fo.write('weighting = 0\n')
        fo.write('commonf3x4 = 0\n')

In [None]:
! cat ../../../7_Pangenome/1_OrthoFinder/MultipleSequenceAlignments_GUIDANCE/OG0000141.yn00.ks

# Parse yn00 output

In [None]:
# Strain identifiers used as keys on both levels of ksDict.
strains = [
    '12B1', 'CCMP3037', 'UTEX2797', '12A1', 'RCC3703',
    'CCMP2941', 'KAC39', 'RCC3426', 'K0374', 'K0081',
    'K0252', 'RCC1436', 'UTEX995', 'RCC1433', 'RCC191',
]

# ksDict[strainA][strainB] starts as an empty dict for every ordered pair of
# strains (including strainA == strainB); each inner dict is a distinct object.
ksDict = {
    strain1: {strain2: {} for strain2 in strains}
    for strain1 in strains
}

In [None]:
# Glob pattern for the gene-tree files; only orthogroups with a tree are parsed.
treefiles = (
    '../../../figshare/orthofinder/Gene_Trees_IQTREE/'
    '*.ntr.tree'
)
# Glob pattern for the yn00 result files.
ksfiles = (
    '../../../figshare/orthofinder/MultipleSequenceAlignments_GUIDANCE/'
    '*.yn00.ks'
)
# Label of the result line to read from each yn00 output (matched as "<label>:").
ksmodel = 'LPB93'

In [None]:
# Fill ksDict with the pairwise Ks values found in the yn00 result files.
# Only orthogroups that have a file matching `treefiles` are read.
for treefile in glob.glob(treefiles):
    # Orthogroup ID = tree file name up to the first dot.
    og = os.path.basename(treefile).split('.')[0]

    infile = '../../../figshare/orthofinder/MultipleSequenceAlignments_GUIDANCE/' + og + '.yn00.ks'

    gene1 = ''
    gene2 = ''
    ks = -1

    # Context manager closes the handle even if parsing raises.
    with open(infile) as fi:
        for line in fi:

            if 'vs.' in line:
                # Pair header: the two gene names are the first and second
                # parenthesised fields on the line.
                gene1 = line.split('(')[1].split(')')[0]
                gene2 = line.split('(')[2].split(')')[0]

            if ksmodel + ':' in line:
                # Fourth whitespace-separated token of the model line; kept as
                # a string so it is written out exactly as yn00 printed it.
                ks = line.split()[3]

                # Strain = text after the last underscore of the gene name.
                strain1 = gene1.split('_')[-1]
                strain2 = gene2.split('_')[-1]

                # Sorted so the key (and the gene order written downstream) is
                # the same on every run; tuple(set) order depends on string
                # hashing, which changes between interpreter sessions.
                genePair = tuple(sorted({gene1, gene2}))

                # Stored under both orientations of the strain pair.
                ksDict[strain1][strain2][genePair] = [og, ks]
                ksDict[strain2][strain1][genePair] = [og, ks]

In [None]:
# Dump every ksDict entry as a tab-separated table, one row per
# (strain1, strain2, gene pair). Because ksDict stores each pair under both
# orientations, a pair between two different strains appears twice.
outfile = '../figures/ks_pairwise_allstrains_treeogsonly_' + ksmodel + '.Rin'

# Context manager closes (and flushes) the file even if a write fails.
with open(outfile, 'w') as fo:
    fo.write('strain1\tstrain2\tgenes\tog\tks\n')

    for strain1 in ksDict:
        for strain2 in ksDict:
            for genepair, (og, ks) in ksDict[strain1][strain2].items():
                genes = ' '.join(genepair)
                fo.write(f'{strain1}\t{strain2}\t{genes}\t{og}\t{ks}\n')

# UTEX subgenomes

In [None]:
# BED files whose last column is parsed into the A1 and A2 gene sets.
a1bedfile = (
    '../../../figshare/orthofinder/Comparative_Genomics_Statistics/'
    'UTEX2797_sisrels_orange0.bed'
)
a2bedfile = (
    '../../../figshare/orthofinder/Comparative_Genomics_Statistics/'
    'UTEX2797_sisrels_pink0.bed'
)

In [None]:
def _read_bed_genes(bedfile):
    """Return the set of gene IDs found in the last column of a BED file.

    Lines starting with '#' and blank lines are skipped. For every other line
    the last tab-separated column is split on '_'; the gene ID is whatever
    follows 'UTEX2797' in the second '_'-separated field.

    Parameters
    ----------
    bedfile : str
        Path of the tab-separated BED file.

    Returns
    -------
    set of str

    Raises
    ------
    IndexError
        If a data line's last column does not have the expected
        '<x>_UTEX2797<id>...' layout.
    """
    genes = set()
    with open(bedfile) as fi:
        for line in fi:
            # A blank line has no columns to parse; '#' marks a comment line.
            if not line.strip() or line[0] == '#':
                continue
            col = line.rstrip().split('\t')
            genes.add(col[-1].split('_')[1].split('UTEX2797')[1])
    return genes


# Gene sets used to label UTEX2797 genes as A1 or A2.
a1Set = _read_bed_genes(a1bedfile)
a2Set = _read_bed_genes(a2bedfile)

In [None]:
# Write every gene pair involving UTEX2797, labelled with the category
# (A1 / A2 / unknown) of its UTEX2797 gene.
outfile = '../figures/ks_pairwise_UTEX2797subgenomes_support0' + ksmodel + '.Rin'

# Context manager closes (and flushes) the file even if a write fails.
with open(outfile, 'w') as fo:
    fo.write('cat\tstrain2\tgenes\tog\tks\n')

    for strain2 in ksDict['UTEX2797']:
        for genepair in ksDict['UTEX2797'][strain2]:
            ugene = ''
            utype = 'unknown'
            for gene in genepair:
                if 'UTEX2797' in gene:
                    # Compare on the part before the first dot; if both genes
                    # of the pair are UTEX2797 genes, the last one is used.
                    ugene = gene.split('.')[0]

            if ugene in a1Set:
                utype = 'A1'

            # Checked second on purpose-preserving order: a gene listed in
            # both sets ends up labelled A2.
            if ugene in a2Set:
                utype = 'A2'

            genes = ' '.join(list(genepair))
            og = ksDict['UTEX2797'][strain2][genepair][0]
            ks = ksDict['UTEX2797'][strain2][genepair][1]
            fo.write(f'{utype}\t{strain2}\t{genes}\t{og}\t{ks}\n')

In [None]:
# Append the UTEX2797 pairs with a known category to the all-strains table,
# using 'UTEX2797-A1' / 'UTEX2797-A2' in the first column.
# NOTE: the file is opened in append mode, so running this cell more than once
# adds the same rows again; re-create the table first before re-running.
outfile = '../figures/ks_pairwise_allstrains_treeogsonly_' + ksmodel + '.Rin'

# Context manager closes (and flushes) the file even if a write fails.
with open(outfile, 'a') as fo:

    for strain2 in ksDict['UTEX2797']:
        for genepair in ksDict['UTEX2797'][strain2]:
            ugene = ''
            utype = 'unknown'
            for gene in genepair:
                if 'UTEX2797' in gene:
                    # Compare on the part before the first dot; if both genes
                    # of the pair are UTEX2797 genes, the last one is used.
                    ugene = gene.split('.')[0]

            if ugene in a1Set:
                utype = 'UTEX2797-A1'

            # Checked second: a gene listed in both sets ends up labelled A2.
            if ugene in a2Set:
                utype = 'UTEX2797-A2'

            # Pairs whose UTEX2797 gene is in neither set are not written.
            if utype != 'unknown':
                genes = ' '.join(list(genepair))
                og = ksDict['UTEX2797'][strain2][genepair][0]
                ks = ksDict['UTEX2797'][strain2][genepair][1]
                fo.write(f'{utype}\t{strain2}\t{genes}\t{og}\t{ks}\n')

In [None]:
# Strains for which Ks values per UTEX2797 category are collected.
astrains = ['12B1', 'CCMP3037', 'RCC3703', 'CCMP2941']
# Hard-coded file name: it only matches the subgenome table written above
# while ksmodel is 'LPB93'.
infile = '../figures/ks_pairwise_UTEX2797subgenomes_support0LPB93.Rin'

# mydict[strain][category] -> list of Ks values (floats) that pass the filters.
mydict = {}
for strain in astrains:
    mydict[strain] = {}
    mydict[strain]['A1'] = []
    mydict[strain]['A2'] = []

# Context manager closes the handle even if a line fails to parse.
with open(infile) as fi:
    for line in fi:

        col = line.rstrip().split('\t')

        # Blank or truncated lines have no strain column to look at.
        if len(col) < 2:
            continue

        # Second column is the partner strain. The header line is dropped
        # here as well, because 'strain2' is not one of astrains.
        strain = col[1]
        if strain not in astrains:
            continue

        cat = col[0]
        if cat == 'unknown':
            continue

        ks = col[-1]

        # Non-finite values are dropped explicitly; nan would otherwise pass
        # the range test below because every comparison with nan is False.
        if ks in ('inf', '-inf', 'nan', '-nan'):
            continue

        ks = float(ks)

        # Keep only 0 <= Ks <= 0.05.
        if ks > 0.05 or ks < 0:
            continue

        mydict[strain][cat].append(ks)

In [None]:
# Print, per strain and category: the middle element of the sorted values,
# the statistical median, and the number of values.
for strain in mydict:
    for cat in mydict[strain]:
        ks_vals = mydict[strain][cat]
        # In-place sort: the lists stored in mydict stay sorted afterwards.
        ks_vals.sort()

        # An empty list has neither a middle element nor a median; report it
        # instead of raising IndexError / StatisticsError.
        if not ks_vals:
            print(strain, cat, 'NA', 'NA', 0)
            continue

        # Index of the middle element (upper middle for even lengths).
        medish = len(ks_vals) // 2

        median = statistics.median(ks_vals)
        print(strain, cat, ks_vals[medish], median, len(ks_vals))

In [None]:
# Display the Ks values kept for strain CCMP2941 in category A2.
mydict['CCMP2941']['A2']

# Create heatmap

In [None]:
# Glob pattern for the gene-tree files; only orthogroups with a tree are parsed.
treefiles = '../../../figshare/orthofinder/Gene_Trees_IQTREE/*.ntr.tree'
# Glob pattern for the yn00 result files (stray '7' removed from the
# 'figshare' directory name so it matches the paths used elsewhere).
ksfiles = '../../../figshare/orthofinder/MultipleSequenceAlignments_GUIDANCE/*.yn00.ks'
# Label of the result line to read from each yn00 output (matched as "<label>:").
ksmodel = 'LPB93'

In [None]:
def isfloat(num):
    """Return True if ``float(num)`` succeeds, False otherwise.

    Parameters
    ----------
    num : object
        Value to test, typically a string token read from a file.

    Returns
    -------
    bool
        True when ``num`` can be converted with ``float``. Strings such as
        'nan' and 'inf' count as convertible. Values that ``float`` rejects
        with either ValueError (e.g. 'abc') or TypeError (e.g. None) give
        False.
    """
    try:
        float(num)
    except (TypeError, ValueError):
        return False
    return True

In [None]:
# ks_dict['<strainA>-<strainB>'] -> list of Ks values (floats) for that strain
# pair; the two strain names in the key are in sorted order.
ks_dict = {}

for treefile in glob.glob(treefiles):
    # Orthogroup ID = tree file name up to the first dot.
    og = os.path.basename(treefile).split('.')[0]

    infile = '../../../figshare/orthofinder/MultipleSequenceAlignments_GUIDANCE/' + og + '.yn00.ks'

    gene1 = ''
    gene2 = ''
    ks = -1

    # Context manager closes the handle even if parsing raises.
    with open(infile) as fi:
        for line in fi:

            if 'vs.' in line:
                # Pair header: the two gene names are the first and second
                # parenthesised fields on the line.
                gene1 = line.split('(')[1].split(')')[0]
                gene2 = line.split('(')[2].split(')')[0]

            if ksmodel + ':' in line:
                # Fourth whitespace-separated token of the model line.
                ks = line.split()[3]

                # Non-finite estimates are ignored.
                if ks in ('inf', '-inf', 'nan', '-nan'):
                    continue

                ks = float(ks)

                # Keep only 0 <= Ks <= 0.25.
                if 0 <= ks <= 0.25:
                    # Strain = text after the last underscore of the gene name.
                    strain1 = gene1.split('_')[-1]
                    strain2 = gene2.split('_')[-1]

                    # Own name for the key so the global `strains` list is not
                    # replaced by a string.
                    pair_key = '-'.join(sorted([strain1, strain2]))
                    ks_dict.setdefault(pair_key, []).append(ks)

In [None]:
# plot_dict['Comb-<n>'] -> [strain, other strain, median Ks]; each pair of
# different strains is stored in both orientations so the pivoted matrix is
# symmetric. Same-strain pairs are left out.
plot_dict = {}
count = 0
for strains, ks_vals in ks_dict.items():
    # The counter advances for every key, including skipped same-strain keys;
    # it is only used to make the 'Comb-<n>' labels unique.
    count += 1

    parts = strains.split('-')
    strainA = parts[0]
    strainB = parts[1]
    if strainA != strainB:
        median = statistics.median(ks_vals)

        plot_dict['Comb-' + str(count)] = [strainA, strainB, median]
        count += 1
        plot_dict['Comb-' + str(count)] = [strainB, strainA, median]

In [None]:
# Long-format table: one row per ordered strain pair with its median Ks.
df = pd.DataFrame.from_dict(
    plot_dict,
    orient='index',
    columns=['StrainA', 'StrainB', 'KS'],
)
df

In [None]:
# Smallest value in the KS column.
df['KS'].min()

In [None]:
# Largest value in the KS column.
df['KS'].max()

In [None]:
# Row/column order of the strain matrix.
order = ['12B1','UTEX2797','CCMP3037','12A1','CCMP2941','RCC3703','K0081','K0374','RCC3426','KAC39','K0252','RCC191','RCC1433','UTEX995','RCC1436']

# Keyword arguments are required: pandas 2.0 made the parameters of
# DataFrame.pivot keyword-only, so the positional form raises TypeError there.
df_heat = df.pivot(index="StrainA", columns="StrainB", values="KS")

# Reorder rows and columns; strain pairs absent from df (including the
# diagonal) become NaN.
df_heat = df_heat.reindex(index=order, columns=order)
df_heat.to_csv('../../../figshare/orthofinder/Comparative_Genomics_Statistics/strain_pairwise_ks.txt', sep='\t', header=True, index=True)
df_heat

In [None]:
import seaborn as sns
import matplotlib.colors
from matplotlib.colors import LinearSegmentedColormap

# Colour ramp for the heatmap, from the lightest to the darkest shade.
Colours = [
    '#FFF7BC', '#FEE391', '#FEC44F', '#FB9A29',
    '#EC7014', '#CC4C02', '#993404', '#662506',
]

# Fix the colour scale to the Ks range 0 to 0.1.
norm = matplotlib.colors.Normalize(vmin=0, vmax=0.1)

cmap = LinearSegmentedColormap.from_list("", Colours)

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(15, 7))
    # Draw on the axes created above rather than relying on the current axes.
    ax = sns.heatmap(df_heat, square=True, cmap=cmap, norm=norm, ax=ax)
    plt.savefig('../../../figshare/orthofinder/Comparative_Genomics_Statistics/strain_pairwise_ks_heatmap.svg',dpi=500)