In [None]:
import glob
from Bio import SeqIO
import statistics


In [None]:
# Strain identifiers used as keys throughout the notebook.
strains = [
    'K0081',
    'K0374',
    'RCC191',
    'CCMP3037',
    '12A1',
    'UTEX995',
    'RCC3703',
    '12B1',
    'RCC1436',
    'KAC39',
    'CCMP2941',
    'UTEX2797',
    'RCC1433',
    'RCC3426',
    'K0252',
]

In [None]:
# Input locations, all relative to the notebook's working directory.

# Glob pattern for the per-strain gene annotations (GFF3).
gff_files = '../../figshare/annotation/genes_illumina_assembly/*.gff3'

# Glob pattern for the per-orthogroup alignment files.
alignment_files = '../../figshare/orthofinder/MultipleSequenceAlignments_GUIDANCE/OG*aln'

# Parameter file from which the species-tree orthogroup IDs are read.
paramfile = '../../figshare/orthofinder/Species_Tree/Supermatrix_in.param'

In [None]:
# Orthogroup IDs named in the parameter file: for every line, take the second
# ', '-separated field and keep the part before ' = '.
st_orthogroups = []
with open(paramfile) as param_handle:
    st_orthogroups.extend(
        entry.split(', ')[1].split(' = ')[0]
        for entry in param_handle
    )

# Cell output: number of orthogroup IDs read.
len(st_orthogroups)

In [None]:
# strain -> set of gene IDs found in the alignments of the listed orthogroups.
scogDict = {name: set() for name in strains}

for aln_path in glob.glob(alignment_files):
    # Only paths containing 'trim' are considered.
    if 'trim' in aln_path:
        # Orthogroup ID = file name up to the first '.'
        og_id = aln_path.split('/')[-1].split('.')[0]
        if og_id in st_orthogroups:
            for seq_record in SeqIO.parse(aln_path, "fasta"):
                # Strain is the text after the last '_' of the record ID;
                # the gene ID is the text before the first '.'.
                record_strain = seq_record.id.split('_')[-1]
                record_gene = seq_record.id.split('.')[0]
                scogDict[record_strain].add(record_gene)


In [None]:
# Report how many gene IDs were collected for each strain.
# Index with the loop variable: the previous code used `record_strain`, a
# leftover from the alignment-parsing cell, so every row showed the same count.
for strain in scogDict:
    print(strain, len(scogDict[strain]))

In [None]:
# For every GFF3 annotation matched by `gff_files`, write `<strain>_scogs.bed`
# containing one tab-separated line (scaffold, start, stop, gene ID) per
# 'gene' feature whose ID is in scogDict[strain].
#
# NOTE(review): start/stop are copied from the GFF columns unchanged. GFF3
# coordinates are 1-based while BED starts are 0-based -- confirm whether the
# start should be shifted by one before the file is used as BED.
for gff_file in glob.glob(gff_files):
    # Strain name = file name up to the first '.'
    strain = gff_file.split('/')[-1].split('.')[0]
    outfile = strain + '_scogs.bed'

    count = 0  # genes written for this strain

    # Context managers close both handles even if a line fails to parse.
    with open(gff_file) as fi, open(outfile, 'w') as fo:
        for line in fi:
            col = line.rstrip().split('\t')

            # Skip comment/directive lines and anything without the nine
            # GFF columns; indexing them would raise IndexError.
            if line.startswith('#') or len(col) < 9:
                continue

            # Scaffold names are rewritten as 'scaf<seqid>-<strain>'.
            new_scafid = 'scaf' + col[0] + '-' + strain
            ftype = col[2]

            if ftype == 'gene':
                # Gene ID = value of the first attribute, up to the first ';'
                gid = col[8].split('=')[1].split(';')[0]
                if gid in scogDict[strain]:
                    start = col[3]
                    stop = col[4]
                    count += 1

                    fo.write(new_scafid + '\t' + start + '\t' + stop + '\t' + gid + '\n')

In [None]:
# Write UTEX2797_scogs.bed from the scaffolded-assembly annotation: one
# tab-separated line (scaffold, start, stop, gene ID) per 'gene' feature whose
# ID is in scogDict[strain]. The output is opened with 'w', so any existing
# file of the same name is replaced.
#
# NOTE(review): start/stop are copied from the GFF columns unchanged. GFF
# coordinates are 1-based while BED starts are 0-based -- confirm whether the
# start should be shifted by one before the file is used as BED.
strain = 'UTEX2797'
# Fixed: the path literal previously ended with a doubled quote (SyntaxError).
gff_file = '../../figshare/annotation/genes_scaffolded_assembly/UTEX2797_v1.gff'
outfile = strain + '_scogs.bed'

count = 0  # genes written for this strain

# Context managers close both handles even if a line fails to parse.
with open(gff_file) as fi, open(outfile, 'w') as fo:
    for line in fi:
        col = line.rstrip().split('\t')

        # Skip comment/directive lines and anything without the nine GFF
        # columns; indexing them would raise IndexError.
        if line.startswith('#') or len(col) < 9:
            continue

        # Scaffold names are used as they appear in the annotation.
        new_scafid = col[0]
        ftype = col[2]

        if ftype == 'gene':
            # Gene ID = text following '=UTEX2797' in the attributes, up to
            # the first ';' (i.e. the strain prefix is dropped from the ID).
            gid = col[8].split('=UTEX2797')[1].split(';')[0]
            if gid in scogDict[strain]:
                start = col[3]
                stop = col[4]
                count += 1

                fo.write(new_scafid + '\t' + start + '\t' + stop + '\t' + gid + '\n')

In [None]:
# Write 12B1_scogs.bed from the scaffolded-assembly annotation: one
# tab-separated line (scaffold, start, stop, gene ID) per 'gene' feature whose
# ID is in scogDict[strain]. The output is opened with 'w', so any existing
# file of the same name is replaced.
#
# NOTE(review): start/stop are copied from the GFF columns unchanged. GFF
# coordinates are 1-based while BED starts are 0-based -- confirm whether the
# start should be shifted by one before the file is used as BED.
strain = '12B1'
gff_file = '../../figshare/annotation/genes_scaffolded_assembly/12B1_v1.gff'
outfile = strain + '_scogs.bed'

count = 0  # genes written for this strain

# Context managers close both handles even if a line fails to parse.
with open(gff_file) as fi, open(outfile, 'w') as fo:
    for line in fi:
        col = line.rstrip().split('\t')

        # Skip comment/directive lines and anything without the nine GFF
        # columns; indexing them would raise IndexError.
        if line.startswith('#') or len(col) < 9:
            continue

        # Scaffold names are used as they appear in the annotation.
        new_scafid = col[0]
        ftype = col[2]

        if ftype == 'gene':
            # Gene ID = text following '=12B1' in the attributes, up to the
            # first ';' (i.e. the strain prefix is dropped from the ID).
            gid = col[8].split('=12B1')[1].split(';')[0]
            if gid in scogDict[strain]:
                start = col[3]
                stop = col[4]
                count += 1

                fo.write(new_scafid + '\t' + start + '\t' + stop + '\t' + gid + '\n')

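Mean read depth over each gene interval was then computed outside this notebook with bedtools (`STRAIN` is the strain name):

`bedtools coverage -mean -a ${STRAIN}_scogs.bed -b ${STRAIN}_self_refv1.sorted.bam > ${STRAIN}_scogs_mean_cov.bed`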

In [None]:
# Combine the per-strain mean-coverage BED files into one tab-separated table
# (columns: strain, gene, cov) and print each strain's median coverage.
outfile = 'figures/scog_coverage.Rin'

# Context managers close the handles even if a line fails to parse.
with open(outfile, 'w') as fo:
    fo.write('strain\tgene\tcov\n')

    for bedfile in glob.glob('../../figshare/orthofinder/SCOG_Read_Depth/*scogs_mean_cov.bed'):
        # Derive the strain from the file name only. Splitting the whole path
        # on '_' stops inside the 'SCOG_Read_Depth' directory name and labels
        # every row with the same path fragment instead of the strain.
        strain = bedfile.split('/')[-1].split('_')[0]
        covList = []

        with open(bedfile) as fi:
            for line in fi:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                scaf, start, stop, gene, cov = line.rstrip().split('\t')

                fo.write(strain + '\t' + gene + '\t' + cov + '\n')
                covList.append(float(cov))

        # statistics.median raises on an empty sequence; report NaN for a
        # file with no rows instead of aborting the whole table.
        median_cov = statistics.median(covList) if covList else float('nan')
        print(strain, median_cov)