# Create orthogroup sets to test for functional enrichment

* OGs specific to ALL A/B clade genomes
* OGs unique to A/B clade genomes (they don't need to be present in all strains)
* OGs enriched in A/B clade genomes

In [None]:
import glob
import os

# Names of the orthogroup sets tested for functional enrichment. Each name is
# also used as the sub-directory and file-name prefix by the cells below.
testsets = ['core', 'accessory', 'singleton', 'ATYPE', 'BTYPE', 'CTYPE']

# The cells below write their per-set files under
# ../../../figshare/pangenome/kinfin/<set>/, so those directories must exist
# before the first open(..., 'w'). The 7_Pangenome/3_KinFin/<set> directories
# that this cell originally created are still made, so anything relying on
# that layout keeps working.
output_roots = [
    '../../../7_Pangenome/3_KinFin',
    '../../../figshare/pangenome/kinfin',
]

for testset in testsets:
    for root in output_roots:
        # exist_ok=True makes the cell safe to re-run (same effect as `mkdir -p`)
        os.makedirs(os.path.join(root, testset), exist_ok=True)

### Pan genome sets

In [None]:
# Split the orthogroups into core / accessory / singleton sets according to
# how many distinct strains contribute at least one (non-skipped) gene.
#
# Inputs : ogfile   - one orthogroup per line: "<OG id>: gene gene ..."
#          skipfile - one gene id per line; lines starting with '#' are ignored
# Outputs: corefile, accessfile, singlefile - one orthogroup id per line
ogfile = '../../../figshare/orthofinder/Orthogroups/Orthogroups.txt'
skipfile = '../GenesOnContigsToRemove.ids.txt'
corefile = '../../../figshare/pangenome/kinfin/core/core_orthogroups.txt'
accessfile = '../../../figshare/pangenome/kinfin/accessory/accessory_orthogroups.txt'
singlefile = '../../../figshare/pangenome/kinfin/singleton/singleton_orthogroups.txt'

# An orthogroup is "core" when exactly this many strains are represented.
# NOTE(review): presumably the total number of genomes in the analysis - confirm.
N_STRAINS = 15


# skip 18 12B1 genes determined to be bacterial contamination at the ends of scaffolds 8 and 32
skiplist = set()
with open(skipfile) as fi:
    for gene in fi:
        if gene.startswith('#'):
            continue
        gene = gene.rstrip()
        if gene:  # ignore blank lines instead of storing an empty id
            skiplist.add(gene)


# `with` guarantees all four handles are closed even if a malformed line raises.
with open(ogfile) as fi, \
        open(corefile, 'w') as fo_c, \
        open(accessfile, 'w') as fo_a, \
        open(singlefile, 'w') as fo_s:

    for line in fi:
        genelist = line.rstrip().split()
        if not genelist:
            # blank line: nothing to classify (previously an IndexError on pop)
            continue

        # first token is "<OG id>:"; the remaining tokens are gene ids
        og = genelist.pop(0).split(':')[0]

        # the strain is the second '_'-separated field of the gene id
        strainSet = {gene.split('_')[1] for gene in genelist if gene not in skiplist}
        strainCount = len(strainSet)

        if strainCount == 0:
            # every gene in this orthogroup was on the skip list
            continue
        elif strainCount == N_STRAINS:
            fo_c.write(og + '\n')
        elif strainCount == 1:
            fo_s.write(og + '\n')
        else:
            fo_a.write(og + '\n')

### A-type sets

In [None]:
# Read the ATYPE cluster-metrics table and
#   * collect in aTotal every orthogroup with a taxa count > 0 (used later as
#     the annotation background for this set), and
#   * write to outfile the orthogroups whose type is 'specific' and whose taxa
#     count is at least 3.
afile = '../../../figshare/pangenome/kinfin/kinfin_results/TYPE/TYPE.ATYPE.cluster_metrics.txt'
outfile = '../../../figshare/pangenome/kinfin/ATYPE/ATYPE_orthogroups.txt'

aTotal = set()

# `with` closes both files even if a row fails to parse.
with open(afile) as fi, open(outfile, 'w') as fo:
    for line in fi:
        # skip header/comment rows and blank lines (a blank line used to raise IndexError)
        if line.startswith('#') or not line.strip():
            continue

        col = line.rstrip().split('\t')
        og = col[0]
        ogType = col[2]
        # NOTE(review): column 12 is presumably KinFin's TAXON_count - confirm against the file header
        ogTaxaCount = int(col[12])

        if ogTaxaCount > 0:
            aTotal.add(og)

        if ogType == 'specific' and ogTaxaCount >= 3:
            fo.write(og + '\n')

### B-type sets

In [None]:
# Read the BTYPE cluster-metrics table and
#   * collect in bTotal every orthogroup with a taxa count > 0 (used later as
#     the annotation background for this set), and
#   * write to outfile the orthogroups whose type is 'specific' and whose taxa
#     count is at least 3.
bfile = '../../../figshare/pangenome/kinfin/kinfin_results/TYPE/TYPE.BTYPE.cluster_metrics.txt'
outfile = '../../../figshare/pangenome/kinfin/BTYPE/BTYPE_orthogroups.txt'

bTotal = set()

# `with` closes both files even if a row fails to parse.
with open(bfile) as fi, open(outfile, 'w') as fo:
    for line in fi:
        # skip header/comment rows and blank lines (a blank line used to raise IndexError)
        if line.startswith('#') or not line.strip():
            continue

        col = line.rstrip().split('\t')
        og = col[0]
        ogType = col[2]
        # NOTE(review): column 12 is presumably KinFin's TAXON_count - confirm against the file header
        ogTaxaCount = int(col[12])

        if ogTaxaCount > 0:
            bTotal.add(og)

        if ogType == 'specific' and ogTaxaCount >= 3:
            fo.write(og + '\n')

### C-type sets

In [None]:
# Read the CTYPE cluster-metrics table and
#   * collect in cTotal every orthogroup with a taxa count > 0 (used later as
#     the annotation background for this set), and
#   * write to outfile the orthogroups whose type is 'specific' and whose taxa
#     count is at least 3.
cfile = '../../../figshare/pangenome/kinfin/kinfin_results/TYPE/TYPE.CTYPE.cluster_metrics.txt'
outfile = '../../../figshare/pangenome/kinfin/CTYPE/CTYPE_orthogroups.txt'

cTotal = set()

# `with` closes both files even if a row fails to parse.
with open(cfile) as fi, open(outfile, 'w') as fo:
    for line in fi:
        # skip header/comment rows and blank lines (a blank line used to raise IndexError)
        if line.startswith('#') or not line.strip():
            continue

        col = line.rstrip().split('\t')
        og = col[0]
        ogType = col[2]
        # NOTE(review): column 12 is presumably KinFin's TAXON_count - confirm against the file header
        ogTaxaCount = int(col[12])

        if ogTaxaCount > 0:
            cTotal.add(og)

        if ogType == 'specific' and ogTaxaCount >= 3:
            fo.write(og + '\n')

# Parse GO annotations

Skip annotations that aren't present in at least 50% of taxa in that orthogroup

In [None]:
# Write "<orthogroup>\t<GO id>" annotation tables for the enrichment tests:
# one table with every retained annotation (ALL) and one per clade type,
# restricted to the orthogroups collected in aTotal / bTotal / cTotal by the
# A-, B- and C-type cells (those cells must have been run first).
annfile = '../../../figshare/pangenome/kinfin/kinfin_results/cluster_domain_annotation.GO.txt'
outfile1 = '../../../figshare/pangenome/kinfin/ATYPE/ATYPE_GOs.annot'
outfile2 = '../../../figshare/pangenome/kinfin/BTYPE/BTYPE_GOs.annot'
outfile4 = '../../../figshare/pangenome/kinfin/CTYPE/CTYPE_GOs.annot'
outfile3 = '../../../figshare/pangenome/kinfin/ALL_GOs.annot'

# `with` closes all five files even if a row fails to parse.
with open(annfile) as fi, \
        open(outfile1, 'w') as fo1, \
        open(outfile2, 'w') as fo2, \
        open(outfile3, 'w') as fo3, \
        open(outfile4, 'w') as fo4:

    for line in fi:
        # skip header/comment rows and blank lines (a blank line used to raise IndexError)
        if line.startswith('#') or not line.strip():
            continue

        col = line.rstrip().split('\t')
        og = col[0]
        go = col[2]
        # NOTE(review): column 6 is treated as the fraction of taxa carrying the
        # annotation (per the markdown above) - confirm against the file header
        fraction = float(col[6])

        # keep only annotations meeting the 50% threshold
        if fraction < 0.5:
            continue

        record = og + '\t' + go + '\n'
        fo3.write(record)

        # an orthogroup can belong to more than one clade-type background
        if og in aTotal:
            fo1.write(record)
        if og in bTotal:
            fo2.write(record)
        if og in cTotal:
            fo4.write(record)

In [None]:
obofile = 'go_enrichment_test/go.obo'
program = 'go_enrichment_test/go-enrich'

for testset in testsets:
    setfile = '../../../figshare/pangenome/kinfin/' + testset + '/' + testset + '_orthogroups.txt'
    outfile = '../../../figshare/pangenome/kinfin/' + testset + '/' + testset + '_goenrich.out'

    if testset[0] == 'A' or testset[0] == 'B' or testset[0] == 'C':
        annfile = '../../../figshare/pangenome/kinfin/' + testset + '/' + testset + '_GOs.annot'
        !{program} -a {annfile} --obofile {obofile} -s {setfile} -p 1 -o {outfile}         

    else:
        annfile = '../../../figshare/pangenome/kinfin/ALL_GOs.annot'
        !{program} -a {annfile} --obofile {obofile} -s {setfile} -p 1 -o {outfile}         


# Parse KEGG annotations

Same as with GO. Skip annotations that aren't present in at least 50% of taxa in that orthogroup

In [None]:
# Write "<orthogroup>\t<KEGG id>" annotation tables for the enrichment tests:
# one table with every retained annotation (ALL) and one per clade type,
# restricted to the orthogroups collected in aTotal / bTotal / cTotal by the
# A-, B- and C-type cells (those cells must have been run first).
annfile = '../../../figshare/pangenome/kinfin/kinfin_results/cluster_domain_annotation.KEGG.txt'
outfile1 = '../../../figshare/pangenome/kinfin/ATYPE/ATYPE_KEGG.annot'
outfile2 = '../../../figshare/pangenome/kinfin/BTYPE/BTYPE_KEGG.annot'
outfile3 = '../../../figshare/pangenome/kinfin/ALL_KEGG.annot'
outfile4 = '../../../figshare/pangenome/kinfin/CTYPE/CTYPE_KEGG.annot'

# `with` closes all five files even if a row fails to parse.
with open(annfile) as fi, \
        open(outfile1, 'w') as fo1, \
        open(outfile2, 'w') as fo2, \
        open(outfile3, 'w') as fo3, \
        open(outfile4, 'w') as fo4:

    for line in fi:
        # skip header/comment rows and blank lines (a blank line used to raise IndexError)
        if line.startswith('#') or not line.strip():
            continue

        col = line.rstrip().split('\t')
        og = col[0]
        kegg = col[2]
        # NOTE(review): column 6 is treated as the fraction of taxa carrying the
        # annotation (per the markdown above) - confirm against the file header
        fraction = float(col[6])

        # keep only annotations meeting the 50% threshold
        if fraction < 0.5:
            continue

        record = og + '\t' + kegg + '\n'
        fo3.write(record)

        # an orthogroup can belong to more than one clade-type background
        if og in aTotal:
            fo1.write(record)
        if og in bTotal:
            fo2.write(record)
        if og in cTotal:
            fo4.write(record)

In [None]:
program = 'kegg_enrichment_test/kegg-enrich'

for testset in testsets:
    setfile = '../../../figshare/pangenome/kinfin/' + testset + '/' + testset + '_orthogroups.txt'
    outfile = '../../../figshare/pangenome/kinfin/' + testset + '/' + testset + '_keggenrich.out'

    if testset[0] == 'A' or testset[0] == 'B' or testset[0] == 'C':
        annfile = '../../../figshare/pangenome/kinfin/' + testset + '/' + testset + '_KEGG.annot'
        !{program} -a {annfile} -s {setfile} -o {outfile}         

    else:
        annfile = '../../../figshare/pangenome/kinfin/ALL_KEGG.annot'
        !{program} -a {annfile} -s {setfile} -o {outfile}         

# Combine files

In [None]:
# Concatenate every per-set *enrich.out table into one tab-separated file
# with a single descriptive header row.
# NOTE(review): 'echrichment' in the file name looks like a typo for
# 'enrichment'; the name is kept because other tools may read this path.
outfile = '../../../figshare/pangenome/kinfin/all_echrichment_tests.txt'

header = ['orthogroup set', 'annotation', 'annotation category', 'annotation description', 'no. in set with annotation', 'no. in set with any annotation', 'set frequency', 'total no. with annotation', 'total no. with any annotation', 'total frequency', 'p-value', 'BH adjusted p-value', 'orthogroups in set with annotation']

# `with` closes the output (and each input) even if reading fails part-way.
with open(outfile, 'w') as fo:
    fo.write('\t'.join(header) + '\n')

    for testset in testsets:
        pattern = '../../../figshare/pangenome/kinfin/' + testset + '/*enrich.out'

        # glob.glob returns matches in arbitrary order; sorting makes the row
        # order of the combined file reproducible between runs and machines.
        for infile in sorted(glob.glob(pattern)):
            print(infile)

            with open(infile) as fi:
                for line in fi:
                    col = line.rstrip().split('\t')

                    # rows whose first field is 'set' are dropped - presumably
                    # each input's own header row, replaced by `header` above
                    if col[0] == 'set':
                        continue

                    # a final line without a newline would otherwise be fused
                    # with the first row of the next input file
                    if not line.endswith('\n'):
                        line += '\n'

                    fo.write(line)

# Accessory genome KEGG enrichment plot

```
library(ggplot2)

# Colour ramp used for the -log2(p-value) fill scale.
fill_colours <- c('#7BBCE7', '#7EB2E4', '#88A5DD', '#9398D2', '#9B8AC4', '#9D7DB2', '#9A709E', '#906388', '#805770', '#684957')

# Tab-separated table with (at least) the columns cat, count and pval.
df <- read.table("Desktop/accessory_keggenrich.Rin", sep = '\t', header = TRUE)

# Horizontal bars: one per category, length = count as a percentage of 1101
# (NOTE(review): presumably the number of annotated accessory orthogroups - confirm).
ggplot(data = df, aes(x = cat, y = count/1101*100, fill = -log2(pval))) +
    geom_bar(stat = "identity") +
    coord_flip() +
    theme_classic() +
    scale_fill_gradientn(colours = fill_colours)
```

# Accessory genome GO enrichment plot

```
library(ggplot2)

# Colour ramp used for the -log2(p-value) colour scale.
point_colours <- c('#7BBCE7', '#7EB2E4', '#88A5DD', '#9398D2', '#9B8AC4', '#9D7DB2', '#9A709E', '#906388', '#805770', '#684957')

# Tab-separated table with (at least) the columns cat, id, freq and pval.
df <- read.table("Desktop/accessory_goenrich.Rin", sep = '\t', header = TRUE)

# Bubble plot: one point per (cat, id) pair, sized by freq (as a percentage)
# and coloured by -log2(p-value); x-axis labels are rotated to stay readable.
ggplot(data = df, aes(x = cat, y = id, size = freq*100, color = -log2(pval))) +
    geom_point() +
    theme_classic() +
    scale_color_gradientn(colours = point_colours) +
    scale_size(range = c(2, 25)) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
```