### Notebook for tables and subfigures
This is a short notebook that tries to capture all the things done to get the values for the tables

#### Location of table draft
The table draft is in the shared google drive folder SOMEWHERE/GRRC_collaboration/DK0911_genome_report_manuscript/Figures_and_Tables

#### Table 1 or first Table for now
This table summarizes the gene content and TE coverage and such.

In [1]:
import pandas as pd
from Bio import SeqIO
from Bio import SeqUtils
import os
import re

In [2]:
#now get the basefolder of the final genome assembly
BASEFOLDER = '/home/benjamin/genome_assembly/Warrior/genome_v04'
TABLE_tmp = os.path.join(BASEFOLDER, '../DK0911_v04/TABLE_tmp')
if not os.path.exists(TABLE_tmp):
    os.makedirs(TABLE_tmp)
p_name = 'DK_0911_v04_p_ctg'
h_name = 'DK_0911_v04_h_ctg'
p_BUSCO_file = '/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_p_ctg/funannotate/parsed/annotations.busco.txt'
h_BUSCO_file = '/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_h_ctg/funannotate/parsed/annotations.busco.txt'
p_effector2_fa_fn = '/home/benjamin/genome_assembly/Warrior/DK0911/Secretome/DK_0911_v04_p_ctg.EffectorP2.0.fa'
h_effector2_fa_fn = '/home/benjamin/genome_assembly/Warrior/DK0911/Secretome/DK_0911_v04_h_ctg.EffectorP2.0.fa'

In [3]:
#thats what is in that folder
!ls {BASEFOLDER}

DK_0911_v04_h_ctg.104Ep_DK0911p.REPET.gff3
DK_0911_v04_h_ctg.anno.gff3
DK_0911_v04_h_ctg.cds.fa
DK_0911_v04_h_ctg.fa
DK_0911_v04_h_ctg.fa.fai
DK_0911_v04_h_ctg.gene.fa
DK_0911_v04_h_ctg.genome_file
DK_0911_v04_h_ctg.protein.fa
DK_0911_v04LT_h_ctg.cds.fa
DK_0911_v04LT_h_ctg.gene.bed
DK_0911_v04LT_h_ctg.gene.fa
DK_0911_v04LT_h_ctg.protein.fa
DK_0911_v04LT.nr_id_80.protein.fa
DK_0911_v04LT_p_ctg.cds.fa
DK_0911_v04LT_p_ctg.gene.bed
DK_0911_v04LT_p_ctg.gene.fa
DK_0911_v04LT_p_ctg.protein.fa
DK_0911_v04LT_ph_ctg.cds.fa
DK_0911_v04LT_ph_ctg.protein.fa
DK_0911_v04_p_ctg.104Ep_DK0911p.REPET.gff3
DK_0911_v04_p_ctg.104Ep_DK0911p.REPET.superfamily.sorted.gff
DK_0911_v04_p_ctg.anno.gff3
DK_0911_v04_p_ctg.cds.fa
DK_0911_v04_p_ctg.fa
DK_0911_v04_p_ctg.fa.fai
DK_0911_v04_p_ctg.gene.fa
DK_0911_v04_p_ctg.gene.gff3
DK_0911_v04_p_ctg.genome_file
DK_0911_v04_p_ctg.protein.fa
DK_0911_v04_p_ctg.pwh.anno.gff3
DK_0911_v04_ph_ctg.anno.gff3
DK_0911_v04_ph_ctg.anno.gtf
DK_0911_v04_

In [4]:
def get_pwh_and_pwoh(p_contig_file, h_contig_file):
    """Function that reads primary and haplotig file and returns the following.
    p_contigs, h_contigs, pwh_list, pwoh_list 
    Naming convention needs to follow pcontig_xxx and hcontig_xxx_xxx."""
    pwh_list = []
    pwoh_list = []
    h_contigs = []
    h_to_p_list = []
    for seq in SeqIO.parse(h_contig_file, 'fasta'):
        h_contigs.append(seq.id)
        h_to_p_list.append(seq.id[:-4].replace('h','p'))
    h_to_p_list = list(set(h_to_p_list))
    p_contigs = []
    for seq in SeqIO.parse(p_contig_file, 'fasta'):
        p_contigs.append(seq.id)
        if seq.id in  h_to_p_list:
            pwh_list.append(seq.id)
        else:
            pwoh_list.append(seq.id)
    return p_contigs, h_contigs, pwh_list, pwoh_list 

In [5]:
def summary_genome_df(genome_fn):
    contig_length = []
    contig_id =[]
    contig_GC = []
    assembly_id = []
    for x in SeqIO.parse(genome_fn, 'fasta'):
        contig_id.append(x.id)
        contig_length.append(len(x.seq))
        contig_GC.append(SeqUtils.GC(x.seq))
        assembly_id.append(genome_fn)
    s1 = pd.Series(contig_id, name='ID')
    s2 = pd.Series(contig_length, name='Length')
    s3 = pd.Series(contig_GC, name='%GC')
    s4 = pd.Series(assembly_id, name="Assembly")
    contig_df = pd.concat([s1, s2, s3, s4], axis=1)
    return contig_df

In [6]:
p_contig_file = os.path.join(BASEFOLDER, '%s.fa' % p_name)
h_contig_file = os.path.join(BASEFOLDER, '%s.fa' % h_name)
p_contigs, h_contigs, pwh_list, pwoh_list = get_pwh_and_pwoh(p_contig_file, h_contig_file)
p_contig_df = summary_genome_df(p_contig_file)
h_contig_df = summary_genome_df(h_contig_file)

In [18]:
#N50 calcuations
p_contig_df.sort_values('Length', ascending=False, inplace = True)
p_contig_df['cumSum'] = p_contig_df.Length.cumsum()
p_contig_df[p_contig_df.cumSum > 0.5*p_contig_df.cumSum.max() ]['Length']

16    1543748
17    1541929
18    1432633
19    1403510
20    1341714
21    1267121
22    1266820
23    1097733
25    1025863
26     985869
13     975977
28     971047
27     970245
29     968355
30     910852
32     905432
31     900669
33     883174
34     873993
35     811016
38     738750
39     737851
36     736666
37     722814
40     716704
41     628565
44     605520
43     589307
45     587382
46     555801
       ...   
67     181555
68     173893
69     169609
70     152492
71     138502
72     137558
73     117314
57     116323
74     109931
75     100797
76      98178
77      81214
78      80580
79      72512
81      70543
80      70079
82      56462
42      55741
84      48536
85      44208
83      40668
86      30698
87      29712
24      25437
88      23312
89      23178
90      23117
91      22424
92      20280
93      20182
Name: Length, Length: 79, dtype: int64

In [7]:
pwh_list

['pcontig_000',
 'pcontig_001',
 'pcontig_002',
 'pcontig_003',
 'pcontig_004',
 'pcontig_005',
 'pcontig_006',
 'pcontig_008',
 'pcontig_009',
 'pcontig_010',
 'pcontig_011',
 'pcontig_012',
 'pcontig_013',
 'pcontig_014',
 'pcontig_015',
 'pcontig_016',
 'pcontig_017',
 'pcontig_018',
 'pcontig_019',
 'pcontig_020',
 'pcontig_021',
 'pcontig_022',
 'pcontig_023',
 'pcontig_024',
 'pcontig_025',
 'pcontig_026',
 'pcontig_027',
 'pcontig_028',
 'pcontig_029',
 'pcontig_030',
 'pcontig_031',
 'pcontig_032',
 'pcontig_033',
 'pcontig_034',
 'pcontig_035',
 'pcontig_036',
 'pcontig_037',
 'pcontig_038',
 'pcontig_039',
 'pcontig_040',
 'pcontig_041',
 'pcontig_042',
 'pcontig_043',
 'pcontig_045',
 'pcontig_046',
 'pcontig_047',
 'pcontig_048',
 'pcontig_049',
 'pcontig_050',
 'pcontig_051',
 'pcontig_052',
 'pcontig_053',
 'pcontig_054',
 'pcontig_055',
 'pcontig_057',
 'pcontig_058',
 'pcontig_060',
 'pcontig_063',
 'pcontig_064',
 'pcontig_065',
 'pcontig_066',
 'pcontig_067',
 'pconti

In [8]:
#GC content quickly
pwh_contig_df = p_contig_df[p_contig_df.ID.isin(pwh_list)]
pwoh_contig_df = p_contig_df[p_contig_df.ID.isin(pwoh_list)]

In [9]:
pwoh_contig_df

Unnamed: 0,ID,Length,%GC,Assembly
57,pcontig_061,116323,45.089105,/home/benjamin/genome_assembly/Warrior/genome_...
71,pcontig_077,138502,43.18999,/home/benjamin/genome_assembly/Warrior/genome_...
74,pcontig_080,109931,44.419681,/home/benjamin/genome_assembly/Warrior/genome_...
86,pcontig_099,30698,44.797707,/home/benjamin/genome_assembly/Warrior/genome_...
88,pcontig_104,23312,40.614276,/home/benjamin/genome_assembly/Warrior/genome_...
89,pcontig_105,23178,42.009664,/home/benjamin/genome_assembly/Warrior/genome_...
92,pcontig_109,20280,44.112426,/home/benjamin/genome_assembly/Warrior/genome_...
93,pcontig_110,20182,42.770786,/home/benjamin/genome_assembly/Warrior/genome_...


In [132]:
pwoh_contig_df['l*GC'] = pwoh_contig_df['Length'] *  pwoh_contig_df['%GC']
pwoh_contig_df['l*GC'].sum()/pwoh_contig_df['Length'].sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


43.8705157066869

In [133]:
pwh_contig_df['l*GC'] = pwh_contig_df['Length'] *  pwh_contig_df['%GC']
pwh_contig_df['l*GC'].sum()/pwh_contig_df['Length'].sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


44.42511496789157

In [134]:
h_contig_df['l*GC'] = h_contig_df['Length'] *  h_contig_df['%GC']
h_contig_df['l*GC'].sum()/h_contig_df['Length'].sum()

44.43558624193618

In [6]:
#haplotig sequence length
h_length = sum([len(x.seq) for x in SeqIO.parse(h_contig_file, 'fasta')])
print(len(h_contigs))
print(h_length)

1176
52126201


In [7]:
#pwh sequence length
pwh_length = sum([len(x.seq) for x in SeqIO.parse(p_contig_file, 'fasta') if x.id in pwh_list])
print(len(pwh_list))
print(pwh_length)

86
73945211


In [8]:
#pwoh sequence length
pwoh_length = sum([len(x.seq) for x in SeqIO.parse(p_contig_file, 'fasta') if x.id in pwoh_list])
print(len(pwoh_list))
print(pwoh_length)

8
482406


In [9]:
print(len([x.id for x in SeqIO.parse(p_contig_file, 'fasta')]))

94


In [10]:
!grep -v '^>' {p_contig_file} | wc

1240505 1240505 75668122


In [11]:
print(pwh_length+pwoh_length)

74427617


In [12]:
l = 0
for x in SeqIO.parse(p_contig_file, 'fasta'):
    if len(x.seq) == 0:
        print(x.id)
print(l)

0


In [112]:
p_anno_gff = os.path.join(BASEFOLDER, 'DK_0911_v04_p_ctg.anno.gff3')
pwh_anno_gff = os.path.join(TABLE_tmp, 'DK_0911_v04_p_ctg.pwh.anno.gff3')
pwoh_anno_gff = os.path.join(TABLE_tmp, 'DK_0911_v04_p_ctg.pwoh.anno.gff3')
p_REPET_gff = os.path.join(BASEFOLDER, 'DK_0911_v04_p_ctg.104Ep_DK0911p.REPET.gff3')
h_REPET_gff = os.path.join(BASEFOLDER, 'DK_0911_v04_h_ctg.104Ep_DK0911p.REPET.gff3')
pwh_REPET_gff = os.path.join(TABLE_tmp, 'DK_0911_v04_p_ctg.104Ep_DK0911p.pwh.REPET.gff3')
pwoh_REPET_gff = os.path.join(TABLE_tmp, 'DK_0911_v04_p_ctg.104Ep_DK0911p.pwoh.REPET.gff3')
h_anno_gff = os.path.join(BASEFOLDER, "DK_0911_v04_h_ctg.anno.gff3")

In [111]:
BASEFOLDER

'/home/benjamin/genome_assembly/Warrior/genome_v04'

In [14]:
in_fn = p_anno_gff
out_fn = pwh_anno_gff
subset = pwh_list

In [15]:
with open(in_fn, 'r') as in_fh:
    with open(out_fn, 'w') as out_fh:
        for line in in_fh:
            if any(x in line.split('\t') for x in subset):
                line.rstrip()
                print(line, file=out_fh)

In [16]:
!cut -f 1 {out_fn} | sort | uniq | wc

     87      86    1033


In [17]:
def grep(in_fn, out_fn, subset):
    """Functon that takes an infile and greps each line where one column exactly matches the subset"""
    with open(in_fn, 'r') as in_fh:
        with open(out_fn, 'w') as out_fh:
            for line in in_fh:
                if any(x in line.split('\t') for x in subset):
                    line.rstrip()
                    print(line, file=out_fh)

In [18]:
grep(p_anno_gff, pwh_anno_gff, pwh_list)

In [19]:
grep(p_anno_gff, pwoh_anno_gff, pwoh_list)

In [20]:
grep(p_REPET_gff,pwh_REPET_gff,  pwh_list)

In [21]:
grep(p_REPET_gff,pwoh_REPET_gff,  pwoh_list)

In [22]:
!report_gff3_statistics.py -i {pwh_anno_gff}

Assembly count	86
Assembly length	N/A (no FASTA data in GFF?)

Gene count	15003
Gene length (mean)	1552.9
Gene length (sum)	23298795

mRNA count	15003
mRNA length (mean)	1552.9
mRNA length (sum)	23298795
mRNAs per gene (mean)	1.0

exon count	64729
exon length (mean)	273.4
exon length (sum)	17697858
exons per mRNA (mean)	4.3

CDS count	64729
CDS length (mean)	273.4
CDS fragment length (sum)	17697858
CDS per mRNA (mean)	4.3

# CDS fragment composition profile: count<tab>percentage
mRNAs with 1 CDS	3112	20.7
mRNAs with 2 CDS	2617	17.4
mRNAs with 3 CDS	2286	15.2
mRNAs with 4 CDS	1707	11.4
mRNAs with 5 CDS	1426	9.5
mRNAs with 6 CDS	962	6.41
mRNAs with 7 CDS	704	4.69
mRNAs with 8 CDS	496	3.31
mRNAs with 9 CDS	400	2.67
mRNAs with 10 CDS	355	2.37
mRNAs with 11 CDS	201	1.34
mRNAs with 12 CDS	166	1.11
mRNAs with 13 CDS	123	0.82
mRNAs with 14 CDS	100	0.667
mRNAs with 15 CDS	73	0.487
mRNAs with 16 CDS	51	0.34
mRNAs with 17 CDS	46	0.307
mRNAs with 18 CDS	39	0.26
mRNAs with 19 CDS	28	0.187
mRNAs wit

In [23]:
!report_gff3_statistics.py -i {pwoh_anno_gff}

Assembly count	7
Assembly length	N/A (no FASTA data in GFF?)

Gene count	67
Gene length (mean)	1032.3
Gene length (sum)	69164

mRNA count	67
mRNA length (mean)	1032.3
mRNA length (sum)	69164
mRNAs per gene (mean)	1.0

exon count	183
exon length (mean)	320.3
exon length (sum)	58615
exons per mRNA (mean)	2.7

CDS count	183
CDS length (mean)	320.3
CDS fragment length (sum)	58615
CDS per mRNA (mean)	2.7

# CDS fragment composition profile: count<tab>percentage
mRNAs with 1 CDS	26	38.8
mRNAs with 2 CDS	15	22.4
mRNAs with 3 CDS	12	17.9
mRNAs with 4 CDS	5	7.46
mRNAs with 5 CDS	2	2.99
mRNAs with 6 CDS	2	2.99
mRNAs with 7 CDS	3	4.48
mRNAs with 14 CDS	2	2.99


In [24]:
!report_gff3_statistics.py -i {h_anno_gff}

Assembly count	1125
Assembly length	N/A (no FASTA data in GFF?)

Gene count	10870
Gene length (mean)	1419.3
Gene length (sum)	15428273

mRNA count	10870
mRNA length (mean)	1419.3
mRNA length (sum)	15428273
mRNAs per gene (mean)	1.0

exon count	44756
exon length (mean)	270.1
exon length (sum)	12087793
exons per mRNA (mean)	4.1

CDS count	44756
CDS length (mean)	270.1
CDS fragment length (sum)	12087793
CDS per mRNA (mean)	4.1

# CDS fragment composition profile: count<tab>percentage
mRNAs with 1 CDS	2452	22.6
mRNAs with 2 CDS	1942	17.9
mRNAs with 3 CDS	1712	15.7
mRNAs with 4 CDS	1161	10.7
mRNAs with 5 CDS	994	9.14
mRNAs with 6 CDS	672	6.18
mRNAs with 7 CDS	471	4.33
mRNAs with 8 CDS	364	3.35
mRNAs with 9 CDS	249	2.29
mRNAs with 10 CDS	237	2.18
mRNAs with 11 CDS	150	1.38
mRNAs with 12 CDS	101	0.929
mRNAs with 13 CDS	87	0.8
mRNAs with 14 CDS	75	0.69
mRNAs with 15 CDS	46	0.423
mRNAs with 16 CDS	24	0.221
mRNAs with 17 CDS	30	0.276
mRNAs with 18 CDS	28	0.258
mRNAs with 19 CDS	18	0.166
mRNAs wi

In [25]:
!report_gff_intron_and_intergenic_stats.py -i {pwh_anno_gff} -f {p_contig_file}


Molecule count: 86
Gene count: 15003

Total molecule bases: 73945211 bp
Empty molecule bases: 0 bp
Intergenic space count: 15089
Average intergenic space distance: 3356.5 bp
Median intergenic space distance: 1740 bp
Minimum intergenic space distance: 0 bp
Maximum intergenic space distance: 61009 bp

Intron count: 49726
Intron space count: 5600937
Average intron size: 112.6 bp
Median intron size: 82 bp
Minimum intron size: 24 bp
Maximum intron size: 170177 bp



In [26]:
!report_gff_intron_and_intergenic_stats.py -i {pwoh_anno_gff} -f {p_contig_file}


Molecule count: 7
Gene count: 67

Total molecule bases: 459228 bp
Empty molecule bases: 0 bp
Intergenic space count: 74
Average intergenic space distance: 5271.1 bp
Median intergenic space distance: 3202 bp
Minimum intergenic space distance: 126 bp
Maximum intergenic space distance: 40346 bp

Intron count: 116
Intron space count: 10549
Average intron size: 90.9 bp
Median intron size: 80 bp
Minimum intron size: 61 bp
Maximum intron size: 368 bp



In [27]:
!report_gff_intron_and_intergenic_stats.py -i {h_anno_gff} -f {h_contig_file}


Molecule count: 1125
Gene count: 10870

Total molecule bases: 51739088 bp
Empty molecule bases: 0 bp
Intergenic space count: 11995
Average intergenic space distance: 3027.2 bp
Median intergenic space distance: 1603 bp
Minimum intergenic space distance: 0 bp
Maximum intergenic space distance: 65202 bp

Intron count: 33886
Intron space count: 3340480
Average intron size: 98.6 bp
Median intron size: 82 bp
Minimum intron size: 41 bp
Maximum intron size: 47904 bp



In [40]:
def coverage_in_bases(input_gff):
    !bedtools sort -i {input_gff} > {'%s_sort' % input_gff}
    !bedtools merge -i {'%s_sort' % input_gff} > {'%s_merged' % input_gff}
    tmp_df = pd.read_csv('%s_merged' % input_gff, sep='\t', header=None)
    tmp_df['cov'] = tmp_df[2] - tmp_df[1]
    print(sum(tmp_df['cov']))
    return sum(tmp_df['cov'])

In [41]:
coverage_in_bases(pwh_REPET_gff)/pwh_length * 100

41422635


56.018009063494326

In [42]:
coverage_in_bases(pwoh_REPET_gff)/pwoh_length *100

390226


80.89161411756902

In [116]:
coverage_in_bases(h_REPET_gff)/h_length *100

27006141


51.809148723498957

In [43]:
pwh_gene_LT = []

In [45]:
s = 'locus_tag=DK0911_17082;Name=EVM prediction pcontig_061.1'

In [65]:
pwoh_gene_LTs = []
with open(pwoh_anno_gff) as fh:
    pattern = re.compile('.*locus_tag=([A-Z,_,0-9]*);')
    for line in fh:
        if len(line) < 4:
            continue
        line_list = line.split('\t')
        if line_list[2] == 'gene':
            pwoh_gene_LTs.append(re.findall(pattern, line_list[8])[0])
print(len(pwoh_gene_LTs))

67


In [66]:
pwh_gene_LTs = []
with open(pwh_anno_gff) as fh:
    pattern = re.compile('.*locus_tag=([A-Z,_,0-9]*);')
    for line in fh:
        if len(line) < 4:
            continue
        line_list = line.split('\t')
        if line_list[2] == 'gene':
            pwh_gene_LTs.append(re.findall(pattern, line_list[8])[0])
print(len(pwh_gene_LTs))

15003


In [108]:
h_gene_LTs = [seq.id for seq in SeqIO.parse(os.path.join(BASEFOLDER, 'DK_0911_v04LT_h_ctg.protein.fa'), 'fasta') ]

In [69]:
p_BUSCOs = pd.read_csv(p_BUSCO_file, sep='\t', header=None)[0].tolist()
h_BUSCOs = pd.read_csv(h_BUSCO_file, sep='\t', header=None)[0].tolist()

In [71]:
len(h_BUSCOs)

886

In [72]:
print(len([x for x in p_BUSCOs if x in pwh_gene_LTs]))

1290


In [73]:
print(len([x for x in p_BUSCOs if x in pwoh_gene_LTs]))

2


In [81]:
p_effectors = [seq.id for seq in SeqIO.parse(p_effector2_fa_fn, 'fasta')]
h_effectors = [seq.id for seq in SeqIO.parse(h_effector2_fa_fn, 'fasta')]

In [78]:
print(len([x for x in p_effectors if x in pwh_gene_LTs]))

816


In [79]:
print(len([x for x in p_effectors if x in pwoh_gene_LTs]))

3


In [80]:
len(p_effectors)

819

In [82]:
len(h_effectors)

557

### Moving on to next Table that describes the annotation of the different categories

In [83]:
p_anno_folder = '/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_p_ctg/combined'
h_anno_folder = '/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_h_ctg/combined'
file_endings =['.GO_combined.tablist', '.iprscan.tablist', '.Pfam.tablist', '.OGs.tablist', '.KEGG_combined.tablist','.merops.tablist', '.dbCAN.tablist','.SignalP3.tablist']

In [96]:
def count_and_mean_lenght(fa_fn, ID_list):
    len_list = [len(seq.seq) for seq in SeqIO.parse(fa_fn, 'fasta') if seq.id in ID_list]
    print('\n\nCount and mean lenght\n\n')
    print('%i, %i'%(len(len_list), sum(len_list)/len(len_list)))

In [101]:
def proteins_w_anno(anno_tab_fn, ID_list):
    anno_ids = pd.read_csv(anno_tab_fn, sep='\t', header=None)[0].tolist()
    len_all = len(ID_list)
    match = len([x for x in anno_ids if x in ID_list])
    print('%s: %s/%.2f' % (anno_tab_fn, match, (match/len_all*100)))
    return anno_ids

In [106]:
genome_prefix = 'DK_0911_v04LT_p_ctg'
gene_lists = [(pwh_gene_LTs+pwoh_gene_LTs),p_BUSCOs, p_effectors]
anno_base_folder = p_anno_folder
protein_fn = os.path.join(BASEFOLDER, 'DK_0911_v04LT_p_ctg.protein.fa')
all_anno_ids = []
for gene_list in gene_lists:
    count_and_mean_lenght( protein_fn, gene_list)
    for file_ending in file_endings:
        fn = os.path.join(anno_base_folder, '%s%s' % (genome_prefix, file_ending))
        ids = proteins_w_anno(fn, gene_list)
        if file_ending != '.SignalP3.tablist':
            all_anno_ids = all_anno_ids + ids
    no_anno_num = len(gene_list) - len([x for x in gene_list if x in all_anno_ids])
    print('No hits at all: %s/%.2f' % (no_anno_num, (no_anno_num/len(gene_list)*100)))
    print("\n\n######Next List######\n\n")



Count and mean lenght


15070, 392
/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_p_ctg/combined/DK_0911_v04LT_p_ctg.GO_combined.tablist: 5642/37.44
/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_p_ctg/combined/DK_0911_v04LT_p_ctg.iprscan.tablist: 6448/42.79
/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_p_ctg/combined/DK_0911_v04LT_p_ctg.Pfam.tablist: 5722/37.97
/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_p_ctg/combined/DK_0911_v04LT_p_ctg.OGs.tablist: 7295/48.41
/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_p_ctg/combined/DK_0911_v04LT_p_ctg.KEGG_combined.tablist: 2511/16.66
/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_p_ctg/combined/DK_0911_v04LT_p_ctg.merops.tablist: 262/1.74
/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_p_ct

In [104]:
no_anno_num

5487

In [109]:
genome_prefix = 'DK_0911_v04LT_h_ctg'
gene_lists = [h_gene_LTs,h_BUSCOs, h_effectors]
anno_base_folder = h_anno_folder
protein_fn = os.path.join(BASEFOLDER, 'DK_0911_v04LT_h_ctg.protein.fa')
all_anno_ids = []
for gene_list in gene_lists:
    count_and_mean_lenght( protein_fn, gene_list)
    for file_ending in file_endings:
        fn = os.path.join(anno_base_folder, '%s%s' % (genome_prefix, file_ending))
        ids = proteins_w_anno(fn, gene_list)
        if file_ending != '.SignalP3.tablist':
            all_anno_ids = all_anno_ids + ids
    no_anno_num = len(gene_list) - len([x for x in gene_list if x in all_anno_ids])
    print('No hits at all: %s/%.2f' % (no_anno_num, (no_anno_num/len(gene_list)*100)))
    print("\n\n######Next List######\n\n")



Count and mean lenght


10870, 370
/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_h_ctg/combined/DK_0911_v04LT_h_ctg.GO_combined.tablist: 3948/36.32
/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_h_ctg/combined/DK_0911_v04LT_h_ctg.iprscan.tablist: 4425/40.71
/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_h_ctg/combined/DK_0911_v04LT_h_ctg.Pfam.tablist: 3891/35.80
/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_h_ctg/combined/DK_0911_v04LT_h_ctg.OGs.tablist: 5202/47.86
/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_h_ctg/combined/DK_0911_v04LT_h_ctg.KEGG_combined.tablist: 1789/16.46
/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_h_ctg/combined/DK_0911_v04LT_h_ctg.merops.tablist: 181/1.67
/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_h_ct