In [None]:
%matplotlib inline
import glob
import statistics
from collections import Counter

import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap

In [None]:
# Input/output locations, relative to the directory the notebook is run from.
# The shared directory prefix is kept in one place so the data can be
# relocated by editing a single line.
orthofinder_dir = '../../../figshare/orthofinder'
stats_dir = orthofinder_dir + '/Comparative_Genomics_Statistics'

# OrthoFinder orthogroup table: one line per orthogroup, "OGid: gene gene ..."
ogfile = orthofinder_dir + '/Orthogroups/Orthogroups.txt'
# Gene IDs to ignore, one per line ('#' lines are comments)
skipfile = '../GenesOnContigsToRemove.ids.txt'

# Outputs: pairwise shared-orthogroup matrix (TSV) and its heatmap figure
overlapfile = stats_dir + '/shared_OGs_matrix.txt'
heatfile = stats_dir + '/shared_OGs_heatmap.svg'

In [None]:
# Strain identifiers analysed in this notebook. Gene IDs are matched to these
# names via their second underscore-delimited field, and the list order is
# the row/column order of the shared-orthogroup matrix built further down.
all_strains = [
    '12B1', 'UTEX2797', 'CCMP3037', '12A1', 'CCMP2941',
    'RCC3703', 'K0081', 'K0374', 'RCC3426', 'KAC39',
    'K0252', 'RCC191', 'RCC1433', 'UTEX995', 'RCC1436',
]

In [None]:
# Skip 18 12B1 genes determined to be bacterial contamination at the ends of
# scaffolds 8 and 32.
# `skipfile` lists one gene ID per line; lines starting with '#' are comments.
skiplist = set()
with open(skipfile) as fi:  # context manager closes the file even on error
    for gene in fi:
        gene = gene.rstrip()
        # Ignore comment lines and blank lines (a blank line would otherwise
        # put the empty string into the skip list).
        if not gene or gene[0] == '#':
            continue
        skiplist.add(gene)

## Parse OrthoFinder output

In [None]:
# Classify every orthogroup (OG) in `ogfile` by the number of strains it
# contains once the genes in `skiplist` have been dropped:
#   core      - present in every strain listed in `all_strains`
#   singleton - present in exactly one strain
#   accessory - present in more than one strain but not all of them
# OGs left with no genes after filtering are recorded in `skipogs` and are
# not counted.
core_ogs = 0
accessory_ogs = 0
singleton_ogs = 0

# strain -> list holding one partner-strain name per OG in which the two
# strains co-occur. A strain is also paired with itself, so counting its own
# name gives the number of OGs it belongs to. Kept as a list because the
# heatmap section tallies the entries of these lists.
strain_og_dict = {strain: [] for strain in all_strains}

ogDict = {}      # OG id -> retained (non-skipped) gene IDs
revogDict = {}   # retained gene ID -> OG id
skipogs = set()  # OGs whose genes were all in `skiplist`

singletonSet = set()
accessorySet = set()
coreSet = set()

# Per-strain sets of accessory / singleton OG ids
accessoryDict = {strain: set() for strain in all_strains}
singletonDict = {strain: set() for strain in all_strains}

# An OG is "core" when it covers every strain; derive the threshold from the
# strain list instead of hard-coding it.
n_strains = len(all_strains)

with open(ogfile) as fi:
    for line in fi:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines

        # First field is "OGid:"; the remaining fields are gene IDs.
        og = fields[0].split(':')[0]
        genelist = [gene for gene in fields[1:] if gene not in skiplist]

        strainSet = set()
        for gene in genelist:
            revogDict[gene] = og
            # The strain name is the second underscore-delimited field of the
            # gene ID.
            strainSet.add(gene.split('_')[1])

        if not strainSet:
            skipogs.add(og)
            continue

        ogDict[og] = genelist
        # Record every ordered pair of strains (including self-pairs) that
        # co-occur in this OG.
        for strain1 in strainSet:
            strain_og_dict[strain1].extend(strainSet)

        if len(strainSet) == n_strains:
            core_ogs += 1
            coreSet.add(og)
        elif len(strainSet) == 1:
            singleton_ogs += 1
            singletonSet.add(og)
            for strain in strainSet:
                singletonDict[strain].add(og)
        else:
            accessory_ogs += 1
            accessorySet.add(og)
            for strain in strainSet:
                accessoryDict[strain].add(og)

total_ogs = core_ogs + singleton_ogs + accessory_ogs

## Print pan genome statistics

In [None]:
# Total number of orthogroups counted (core + singleton + accessory)
print(total_ogs)

In [None]:
# Orthogroups that contain genes from all 15 strains
print(core_ogs)

In [None]:
# Orthogroups whose genes all come from a single strain
print(singleton_ogs)

In [None]:
# Orthogroups found in more than one strain but not in all of them
print(accessory_ogs)

In [None]:
# Sanity check: number of distinct accessory OG ids; equals accessory_ogs
# as long as no OG id is repeated in the input file
print(len(accessorySet))

In [None]:
# Write the ids of all core orthogroups, one per line.
outfile = '../../../figshare/orthofinder/Comparative_Genomics_Statistics/core_OGs.txt'

with open(outfile, 'w') as fo:  # closed automatically, even on error
    # Sets have no defined iteration order; sort so the file is identical
    # from one run to the next.
    for og in sorted(coreSet):
        fo.write(og + '\n')

In [None]:
# For every strain: print its accessory and singleton orthogroup counts and
# write the corresponding OG ids to two text files (one id per line).
for strain in all_strains:
    print(strain, len(accessoryDict[strain]), len(singletonDict[strain]))

    # NOTE(review): there is no separator between the strain name and the
    # suffix, so files are named e.g. "12B1accessory_OGs.txt" - confirm this
    # is intended before changing it, since existing outputs use these names.
    outfile1 = '../../../figshare/orthofinder/Comparative_Genomics_Statistics/' + strain + 'accessory_OGs.txt'
    outfile2 = '../../../figshare/orthofinder/Comparative_Genomics_Statistics/' + strain + 'singleton_OGs.txt'

    # Context managers close both files even if a write fails.
    with open(outfile1, 'w') as fo1, open(outfile2, 'w') as fo2:
        # Sorted so the output does not depend on set iteration order.
        for og in sorted(accessoryDict[strain]):
            fo1.write(og + '\n')

        for og in sorted(singletonDict[strain]):
            fo2.write(og + '\n')

```
# Plot per-strain counts as horizontal bars, one facet per `type`.
library(ggplot2)
# Tab-separated input with a header row; the code below uses its `strain`,
# `count` and `type` columns.
# NOTE(review): this file is not written by the Python cells shown here -
# presumably assembled from the counts printed above; confirm.
df = read.table("Orthofinder_stats.Rin", sep = '\t', header = TRUE)

# Fix the order in which strains are drawn by giving the factor explicit levels
df$strain <- factor(df$strain, levels=c('RCC1436', 'UTEX995','RCC1433','RCC191','K0252','KAC39','RCC3426','K0374','K0081','RCC3703','CCMP2941','12A1','CCMP3037','UTEX2797','12B1'))

# Bars use `count` directly (stat="identity"), axes are flipped so strains run
# down the y axis, and each `type` gets its own facet column with free scales
ggplot(data=df, aes(x=strain, y=count, fill=type)) + geom_bar(stat="identity") + coord_flip() + theme_classic() + facet_grid(cols = vars(type), scales = "free")
```

## Create shared orthogroup heatmap

In [None]:
# Build the pairwise shared-orthogroup table as a nested dict:
#   plot_dict[strain1][strain2] = number of orthogroups containing both
#   strains (for strain1 == strain2: the number of orthogroups containing
#   that strain).
plot_dict = {}

for strain1 in all_strains:
    # Tally the partner strains once per strain instead of rescanning the
    # whole list for every pair. A Counter returns 0 for strains that never
    # co-occur, matching what list.count() would give.
    partner_counts = Counter(strain_og_dict[strain1])
    plot_dict[strain1] = {
        strain2: partner_counts[strain2] for strain2 in all_strains
    }

In [None]:
# Outer keys of plot_dict become the rows and inner keys the columns, giving
# a strain-by-strain matrix of shared orthogroup counts.
df = pd.DataFrame.from_dict(plot_dict, orient='index')

# Save as tab-separated text; the header row and index column are written by
# default.
df.to_csv(overlapfile, sep='\t')

In [None]:
# Display the strain-by-strain matrix of shared orthogroup counts
df

#### Min shared per strain

In [None]:
# Column-wise minimum: for each strain, the smallest number of orthogroups
# it shares with any strain in the matrix
df.min()

#### Min shared across all strains

In [None]:
# Smallest value anywhere in the matrix. Uses the pandas reduction rather
# than the Python builtin so the result is vectorised and skips NaN.
df.min().min()

#### Max shared per strain

In [None]:
# Column-wise maximum.
# NOTE(review): the matrix diagonal holds each strain's count against itself
# (the number of orthogroups containing that strain), which can never be
# smaller than a count shared with another strain - so this returns the
# diagonal, not the largest overlap with a *different* strain. Confirm that
# is what is wanted here.
df.max()

#### Max shared across all strains

In [None]:
# Largest value anywhere in the matrix. Uses the pandas reduction rather
# than the Python builtin so the result is vectorised and skips NaN.
# The diagonal (each strain against itself) is included in this maximum.
df.max().max()

_____

## Plotting heatmaps

### Heatmap of shared orthogroups among strains

In [None]:
#Define colors of the heatmap scale
colours = ['#FEFBE9', '#FCF7D5', '#F5F3C1', '#EAF0B5', '#DDECBF',
           '#D0E7CA', '#C2E3D2', '#B5DDD8', '#A8D8DC', '#9BD2E1', 
           '#8DCBE4', '#81C4E7', '#7BBCE7', '#7EB2E4', '#88A5DD', 
           '#9398D2', '#9B8AC4', '#9D7DB2', '#9A709E', '#906388', 
           '#805770', '#684957', '#46353A']

#define scale here, normalize by the approximate max and min printed
# out above
norm = plt.Normalize(18500,28000)

#Create the colormap
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", colours)

#Plot
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(15, 7))
    ax = sns.heatmap(df, square=True, cmap=cmap)
    plt.savefig(heatfile,dpi=500) #save fig

### Plotting with hierarchical clustering
Same data as above, but with rows and columns reordered by hierarchical clustering.

In [None]:
#Uses same colorbar as defined in plot above.
ax = sns.clustermap(df, square=True, cmap=cmap)

#Heatfile2 output name is same as heatmapfile but adding 
#.  'clustered' to filename
heatfile2 = heatfile.replace('.svg', '_clustered.svg')

#Save the plot
plt.savefig(heatfile2, dpi=500)