# 2. After tximport in R

* Script: full_model.Rmd
* Input: "~/Documents/UCDavis/Whitehead/RNAseq_15killifish/salmon_denovo_by_species/"
* Output: "~/Documents/UCDavis/Whitehead/counts_matrices/*_counts.csv" for each species 

# 3. Merge annotations for each species, connecting Trinity contigs/genes to annotation

# Using dammit's `GFF3Parser` function 
1. Digests the gff3 file for each species (output from dammit, downloaded from the farm cluster)
2. Sorts each contig by E-value score
3. Assigns the lowest E-value score for each contig 
4. Separately, for each contig, saves gene names from the NCBI annotated F. heteroclitus genome

In [1]:
import os
import pandas as pd
# requires dammit env
# source activate dammit
from dammit.fileio.gff3 import GFF3Parser

In [125]:
counts_matrices = "/Users/johnsolk/Documents/UCDavis/Whitehead/counts_matrices/"
counts_files = os.listdir(counts_matrices)
gene_out_dir = "/Users/johnsolk/Documents/UCDavis/Whitehead/contig_gene_name_16Oct2018_filtnew/"
print(counts_files)

['F_heteroclitusMDPP_counts.csv', 'F_parvapinis_counts.csv', 'L_parva_counts.csv', 'F_notatus_counts.csv', 'F_heteroclitusMDPL_counts.csv', 'F_grandis_counts.csv', 'F_zebrinus_counts.csv', 'A_xenica_counts.csv', 'F_olivaceus_counts.csv', 'F_catanatus_counts.csv', 'F_sciadicus_counts.csv', 'F_rathbuni_counts.csv', 'F_chrysotus_counts.csv', 'F_diaphanus_counts.csv', 'L_goodei_counts.csv', 'F_nottii_counts.csv', 'F_similis_counts.csv']


In [128]:
# 10/15/2018
# keep only F. het
# then pick one per contig
# if a contig does not have an F. het annotation, 
# keep track of this? but then drop it
for counts_file in counts_files:
    if counts_file != ".DS_Store":
        species = counts_file.split("_")[0]+"_"+counts_file.split("_")[1]
        print("========")
        print(species)
        print("========")
        gene_out = gene_out_dir + species + "_gene_counts_annotations_filt.csv"
        table = pd.read_csv("/Users/johnsolk/Documents/UCDavis/Whitehead/counts_matrices/"+counts_file)
        print('Number of Trinity "genes" (this is how we summarized expression):')
        print(table.shape)
        table = table.rename(columns={'Unnamed: 0': 'Gene'})
        # if countsvalue is >5 in any column, then keep
        table_filt = table[(table.iloc[:,1:] > 5).any(1)]
        print('Contigs filtered:')
        print(table_filt.shape)
        name = "/Users/johnsolk/Documents/UCDavis/Whitehead/gff_annotations/"+species+".trinity_out.Trinity.fasta.dammit.gff3"
        conversion_contig = "/Users/johnsolk/Documents/UCDavis/Whitehead/contig_gene_23June2018/"+species+"_contig_gene.csv"
        conversion_dammit = "/Users/johnsolk/Documents/UCDavis/Whitehead/dammit_conversions/"+species+".trinity_out.Trinity.fasta.dammit.namemap.csv"
        annotations = GFF3Parser(filename=name).read()
        annotations = annotations.dropna(subset=['Name'])
        # keeps track of how long the annotation is
        annotations["length"] = annotations["end"].subtract(annotations["start"], fill_value=0)
        conversions_dammit = pd.read_csv(conversion_dammit)
        conversions_contig = pd.read_csv(conversion_contig)
        conversions_dammit['Name'], conversions_dammit['info'] = conversions_dammit['original'].str.split(' ', 1).str
        conversions_dammit = conversions_dammit[['Name','renamed']]
        conversions_dammit.columns = ['Name','seqid']
        coversions_contig = conversions_contig[['Name','Gene']]
        # merge filtered counts with Trinity contig and gene ID
        merged_table = pd.merge(table_filt,conversions_contig,on="Gene",how='left')
        # merge with dammit transcript ID
        merged_table = pd.merge(merged_table,conversions_dammit,on="Name",how='left')
        # merge with annotation names
        fhet = annotations[annotations['Name'].str.startswith("gi")]
        # pick best e-value match
        fhet_filtered = fhet.sort_values(by=['seqid','score'],ascending=True).query('score < 1e-05').drop_duplicates(subset='seqid')[['seqid', 'Name','start','end','length']]
        print("Unique Fhet gene names (one name per contig):")
        print(len(fhet_filtered.Name.unique()))
        fhet_merged_table = pd.merge(merged_table,fhet_filtered,on='seqid',how='inner')
        # gets rid of contigs without any F. het annotation
        # we probably lose ~2000 because of low counts
        # more "Genes" than annotations because the contigs are probably fragmented across a gene
        print("Unique Fhet gene names (one name per contig):")
        print(len(fhet_merged_table.Name_y.unique()))
        fhet_merged_table = fhet_merged_table.drop('Unnamed: 0', 1)
        fhet_merged_table = fhet_merged_table.rename(columns = {'Name_x':'TrinityContig','Name_y':'Fhet_GeneName'}) 
        fhet_merged_table['split1'], fhet_merged_table['split2'],fhet_merged_table['split3'],fhet_merged_table['NCBIproteinID'],fhet_merged_table['NCBIproteinName'] = fhet_merged_table['Fhet_GeneName'].str.split('|', 5).str
        fhet_merged_table = fhet_merged_table.drop('split1',1)
        fhet_merged_table = fhet_merged_table.drop('split2',1)
        fhet_merged_table = fhet_merged_table.drop('split3',1)
        print("Unique Fhet annotations, contigs with expression")
        print(len(fhet_merged_table.Fhet_GeneName.unique()))
        print('Unique NCBI protein ID')
        print(len(fhet_merged_table.NCBIproteinID.unique()))
        print('Unique Trinity "genes"')
        print(len(fhet_merged_table.Gene.unique()))
        print('Unique Trinity "transcripts" (contigs) ')
        print(len(fhet_merged_table.TrinityContig.unique()))
        fhet = fhet_merged_table.sort_values(by=['Gene','length'],ascending=False).drop_duplicates(subset='Gene')
        print('Collaposed genes')
        print(fhet.shape)
        fhet_collapse = fhet.groupby(['NCBIproteinID'])[list(fhet.columns)[1:len(list(fhet.columns))-1]].agg('sum')
        print('Collaposed proteins')
        print(fhet_collapse.shape)
        fhet_collapse.head()
        fhet_collapse.to_csv(gene_out)

F_heteroclitusMDPP
Number of Trinity "genes" (this is how we summarized expression):
(496133, 10)
Contigs filtered:
(174264, 10)


  dtype=dict(self.columns)):


Unique Fhet gene names (one name per contig):
22641
Unique Fhet gene names (one name per contig):
20558
Unique Fhet annotations, contigs with expression
20558
Unique NCBI protein ID
20558
Unique Trinity "genes"
27387
Unique Trinity "transcripts" (contigs) 
66797
Collaposed genes
(27387, 18)
Collaposed proteins
(18626, 12)
F_parvapinis
Number of Trinity "genes" (this is how we summarized expression):
(279009, 9)
Contigs filtered:
(82915, 9)
Unique Fhet gene names (one name per contig):
20212
Unique Fhet gene names (one name per contig):
18293
Unique Fhet annotations, contigs with expression
18293
Unique NCBI protein ID
18293
Unique Trinity "genes"
21139
Unique Trinity "transcripts" (contigs) 
55189
Collaposed genes
(21139, 17)
Collaposed proteins
(16307, 11)
L_parva
Number of Trinity "genes" (this is how we summarized expression):
(275950, 10)
Contigs filtered:
(84392, 10)
Unique Fhet gene names (one name per contig):
22218
Unique Fhet gene names (one name per contig):
20629
Unique Fhet

# Test one species, F. heteroclitus MDPP

In [48]:
gene_out = gene_out_dir + "F_heteroclitusMDPP" + "_gene_counts_annotations_filt.csv"
table = pd.read_csv("/Users/johnsolk/Documents/UCDavis/Whitehead/counts_matrices/"+"F_heteroclitusMDPP_counts.csv")
print('Number of Trinity "genes" (this is how we summarized expression):')
print(table.shape)
table = table.rename(columns={'Unnamed: 0': 'Gene'})

Number of Trinity "genes" (this is how we summarized expression):
(496133, 10)


In [49]:
# if countsvalue is >5 in any column, then keep
table_filt = table[(table.iloc[:,1:] > 5).any(1)]
print('Contigs filtered:')
print(table_filt.shape)

Contigs filtered:
(174264, 10)


In [50]:
name = "/Users/johnsolk/Documents/UCDavis/Whitehead/gff_annotations/"+"F_heteroclitusMDPP"+".trinity_out.Trinity.fasta.dammit.gff3"
conversion_contig = "/Users/johnsolk/Documents/UCDavis/Whitehead/contig_gene_23June2018/"+"F_heteroclitusMDPP"+"_contig_gene.csv"
conversion_dammit = "/Users/johnsolk/Documents/UCDavis/Whitehead/dammit_conversions/"+"F_heteroclitusMDPP"+".trinity_out.Trinity.fasta.dammit.namemap.csv"

In [51]:
annotations = GFF3Parser(filename=name).read()
annotations = annotations.dropna(subset=['Name'])

  dtype=dict(self.columns)):


In [52]:
# keeps track of how long the annotation is
annotations["length"] = annotations["end"].subtract(annotations["start"], fill_value=0)

In [53]:
#pickonename = annotations.sort_values(by=['seqid', 'score'], ascending=True).query('score < 1e-05').drop_duplicates(subset='seqid')[['seqid', 'Name','Note','database','Dbxref','start','end','length']]
#pickonename = pickonename.dropna(axis=0,how="all")
#print('Number of contigs with annotations (one annotation/contig, sorted by E-value < 1e-05 and picked the lowest):')
#print(pickonename.shape)

In [54]:
conversions_dammit = pd.read_csv(conversion_dammit)
conversions_contig = pd.read_csv(conversion_contig)

In [55]:
conversions_dammit['Name'], conversions_dammit['info'] = conversions_dammit['original'].str.split(' ', 1).str
conversions_dammit = conversions_dammit[['Name','renamed']]
conversions_dammit.columns = ['Name','seqid']
coversions_contig = conversions_contig[['Name','Gene']]

In [56]:
# merge filtered counts with Trinity contig and gene ID
merged_table = pd.merge(table_filt,conversions_contig,on="Gene",how='left')
# merge with dammit transcript ID
merged_table = pd.merge(merged_table,conversions_dammit,on="Name",how='left')

In [57]:
merged_table.head()

Unnamed: 0.1,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant,Unnamed: 0,Name,seqid
0,TRINITY_DN0_c0_g1,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,604146,TRINITY_DN0_c0_g1_i1,Transcript_604149
1,TRINITY_DN1_c0_g1,0.0,2.0,0.0,11.0,0.0,0.0,2.0,0.0,0.0,604184,TRINITY_DN1_c0_g1_i1,Transcript_604187
2,TRINITY_DN10000_c1_g1,0.0,0.0,0.0,13.0,1.0,0.0,4.0,2.0,0.0,631886,TRINITY_DN10000_c1_g1_i1,Transcript_631889
3,TRINITY_DN100010_c1_g1,0.0,0.0,0.0,7.0,0.0,0.0,9.0,1.0,0.0,189616,TRINITY_DN100010_c1_g1_i1,Transcript_189616
4,TRINITY_DN100017_c0_g1,1.0,0.0,0.0,11.0,0.0,0.0,1.0,1.0,0.0,189637,TRINITY_DN100017_c0_g1_i1,Transcript_189637


### merge with annotation names

In [83]:
fhet = annotations[annotations['Name'].str.startswith("gi")]
fhet_filtered = fhet.sort_values(by=['seqid','score'],ascending=True).query('score < 1e-05').drop_duplicates(subset='seqid')[['seqid', 'Name','start','end','length']]
print("Unique Fhet gene names (one name per contig):")
print(len(fhet_filtered.Name.unique()))

Unique Fhet gene names (one name per contig):
22641


In [84]:
fhet_filtered.head()

Unnamed: 0,seqid,Name,start,end,length
34,Transcript_100053,gi|831512517|ref|XP_012714591.1| PREDICTED: br...,0,124,124
134,Transcript_100263,gi|831554709|ref|XP_012729460.1| PREDICTED: mu...,0,122,122
146,Transcript_100278,gi|831477084|ref|XP_012712778.1| PREDICTED: ub...,0,70,70
148,Transcript_100279,gi|831477084|ref|XP_012712778.1| PREDICTED: ub...,0,70,70
159,Transcript_100291,gi|831575409|ref|XP_012736406.1| PREDICTED: ne...,0,88,88


In [85]:
fhet_merged_table = pd.merge(merged_table,fhet_filtered,on='seqid',how='inner')
fhet_merged_table.head()
# gets rid of contigs without any F. het annotation

Unnamed: 0.1,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant,Unnamed: 0,Name_x,seqid,Name_y,start,end,length
0,TRINITY_DN100767_c0_g1,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,1.0,221533,TRINITY_DN100767_c0_g1_i1,Transcript_221533,gi|831525541|ref|XP_012719085.1| PREDICTED: zo...,9,56,47
1,TRINITY_DN10175_c0_g1,1.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,9.0,605739,TRINITY_DN10175_c0_g1_i1,Transcript_605742,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,220,220
2,TRINITY_DN10249_c0_g1,0.0,0.0,0.0,1.0,0.0,0.0,17.0,0.0,0.0,618554,TRINITY_DN10249_c0_g1_i1,Transcript_618557,gi|831577816|ref|XP_012737259.1| PREDICTED: ez...,0,45,45
3,TRINITY_DN102633_c0_g1,0.0,2.0,0.0,9.0,2.0,1.0,5.0,0.0,0.0,201922,TRINITY_DN102633_c0_g1_i1,Transcript_201922,gi|831498824|ref|XP_012709872.1| PREDICTED: co...,30,68,38
4,TRINITY_DN102877_c0_g1,125.0,79.0,78.0,52.0,55.0,105.0,40.0,92.0,72.0,143501,TRINITY_DN102877_c0_g1_i1,Transcript_143501,gi|831491323|ref|XP_012707217.1| PREDICTED: LY...,0,26,26


In [86]:
fhet_merged_table.shape

(66797, 17)

In [87]:
# we probably lose ~2000 because of low counts
# more "Genes" than annotations because the contigs are probably fragmented across a gene
print("Unique Fhet gene names (one name per contig):")
print(len(fhet_merged_table.Name_y.unique()))

Unique Fhet gene names (one name per contig):
20558


In [88]:
fhet_merged_table = fhet_merged_table.drop('Unnamed: 0', 1)

In [89]:
fhet_merged_table = fhet_merged_table.rename(columns = {'Name_x':'TrinityContig','Name_y':'Fhet_GeneName'})                                           

In [90]:
fhet_merged_table.head()

Unnamed: 0,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant,TrinityContig,seqid,Fhet_GeneName,start,end,length
0,TRINITY_DN100767_c0_g1,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,1.0,TRINITY_DN100767_c0_g1_i1,Transcript_221533,gi|831525541|ref|XP_012719085.1| PREDICTED: zo...,9,56,47
1,TRINITY_DN10175_c0_g1,1.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,9.0,TRINITY_DN10175_c0_g1_i1,Transcript_605742,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,220,220
2,TRINITY_DN10249_c0_g1,0.0,0.0,0.0,1.0,0.0,0.0,17.0,0.0,0.0,TRINITY_DN10249_c0_g1_i1,Transcript_618557,gi|831577816|ref|XP_012737259.1| PREDICTED: ez...,0,45,45
3,TRINITY_DN102633_c0_g1,0.0,2.0,0.0,9.0,2.0,1.0,5.0,0.0,0.0,TRINITY_DN102633_c0_g1_i1,Transcript_201922,gi|831498824|ref|XP_012709872.1| PREDICTED: co...,30,68,38
4,TRINITY_DN102877_c0_g1,125.0,79.0,78.0,52.0,55.0,105.0,40.0,92.0,72.0,TRINITY_DN102877_c0_g1_i1,Transcript_143501,gi|831491323|ref|XP_012707217.1| PREDICTED: LY...,0,26,26


In [91]:
fhet_merged_table['split1'], fhet_merged_table['split2'],fhet_merged_table['split3'],fhet_merged_table['NCBIproteinID'],fhet_merged_table['NCBIproteinName'] = fhet_merged_table['Fhet_GeneName'].str.split('|', 5).str
fhet_merged_table = fhet_merged_table.drop('split1',1)
fhet_merged_table = fhet_merged_table.drop('split2',1)
fhet_merged_table = fhet_merged_table.drop('split3',1)

In [92]:
fhet_merged_table.head()

Unnamed: 0,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant,TrinityContig,seqid,Fhet_GeneName,start,end,length,NCBIproteinID,NCBIproteinName
0,TRINITY_DN100767_c0_g1,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,1.0,TRINITY_DN100767_c0_g1_i1,Transcript_221533,gi|831525541|ref|XP_012719085.1| PREDICTED: zo...,9,56,47,XP_012719085.1,PREDICTED: zona pellucida sperm-binding prote...
1,TRINITY_DN10175_c0_g1,1.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,9.0,TRINITY_DN10175_c0_g1_i1,Transcript_605742,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,220,220,XP_012706852.1,PREDICTED: titin-like [Fundulus heteroclitus]
2,TRINITY_DN10249_c0_g1,0.0,0.0,0.0,1.0,0.0,0.0,17.0,0.0,0.0,TRINITY_DN10249_c0_g1_i1,Transcript_618557,gi|831577816|ref|XP_012737259.1| PREDICTED: ez...,0,45,45,XP_012737259.1,PREDICTED: ezrin-like [Fundulus heteroclitus]
3,TRINITY_DN102633_c0_g1,0.0,2.0,0.0,9.0,2.0,1.0,5.0,0.0,0.0,TRINITY_DN102633_c0_g1_i1,Transcript_201922,gi|831498824|ref|XP_012709872.1| PREDICTED: co...,30,68,38,XP_012709872.1,PREDICTED: collagen alpha-1(XXVII) chain B-li...
4,TRINITY_DN102877_c0_g1,125.0,79.0,78.0,52.0,55.0,105.0,40.0,92.0,72.0,TRINITY_DN102877_c0_g1_i1,Transcript_143501,gi|831491323|ref|XP_012707217.1| PREDICTED: LY...,0,26,26,XP_012707217.1,PREDICTED: LYR motif-containing protein 4 iso...


In [93]:
fhet_merged_table.shape

(66797, 18)

In [94]:
titin = fhet_merged_table.loc[fhet_merged_table['NCBIproteinID'] == 'XP_012706852.1']
titin.head()

Unnamed: 0,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant,TrinityContig,seqid,Fhet_GeneName,start,end,length,NCBIproteinID,NCBIproteinName
1,TRINITY_DN10175_c0_g1,1.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,9.0,TRINITY_DN10175_c0_g1_i1,Transcript_605742,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,220,220,XP_012706852.1,PREDICTED: titin-like [Fundulus heteroclitus]
465,TRINITY_DN146980_c0_g1,2.0,2.0,1.0,3.0,0.0,8.0,3.0,6.0,27.0,TRINITY_DN146980_c0_g1_i1,Transcript_91068,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,497,497,XP_012706852.1,PREDICTED: titin-like [Fundulus heteroclitus]
1016,TRINITY_DN170990_c2_g1,3.0,1.0,2.0,3.0,3.0,1.0,2.0,0.0,7.0,TRINITY_DN170990_c2_g1_i1,Transcript_202201,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,293,293,XP_012706852.1,PREDICTED: titin-like [Fundulus heteroclitus]
1022,TRINITY_DN171266_c0_g2,8.0,0.0,0.0,0.0,1.0,2.0,2.0,2.5,0.0,TRINITY_DN171266_c0_g2_i1,Transcript_175700,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,309,309,XP_012706852.1,PREDICTED: titin-like [Fundulus heteroclitus]
1367,TRINITY_DN179170_c2_g1,4.0,6.0,0.0,2.0,2.0,1.0,2.0,3.0,3.0,TRINITY_DN179170_c2_g1_i1,Transcript_54728,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,218,218,XP_012706852.1,PREDICTED: titin-like [Fundulus heteroclitus]


In [102]:
# sort by length descending, drop duplicate genes
titin_gene = titin.sort_values(by=['Gene','length'],ascending=False).drop_duplicates(subset='Gene')
#titin_gene.head(20)

In [118]:
# now, collapse matrix by protein, sum expression for all fragments for each sample
# colSum[2:10] for unique proteins 'NCBIproteinID'
titin_collapse = titin_gene.groupby(['NCBIproteinID'])[list(titin_gene.columns)[1:9]].agg('sum')
#titin_collapse = titin_gene.groupby(['NCBIproteinID']).sum()
titin_collapse.head()

Unnamed: 0_level_0,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant
NCBIproteinID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
XP_012706852.1,9267.948719,5022.968313,2300.499999,7129.471635,2996.556833,4430.794692,3312.109758,7756.500001


In [122]:
# now do this on everything
fhet = fhet_merged_table.sort_values(by=['Gene','length'],ascending=False).drop_duplicates(subset='Gene')
print(fhet.shape)
fhet_collapse = fhet.groupby(['NCBIproteinID'])[list(fhet.columns)[1:9]].agg('sum')
print(fhet_collapse.shape)
fhet_collapse.head()

(27387, 18)
(18626, 8)


Unnamed: 0_level_0,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant
NCBIproteinID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
XP_012704699.1,277.294571,329.0,165.227518,250.0,143.160386,170.0,110.590974,213.904699
XP_012704700.1,373.0,478.0,242.000001,409.0,265.202322,334.0,207.035645,505.69083
XP_012704702.1,664.999999,837.0,476.0,823.0,520.699682,695.999999,304.0,526.0
XP_012704705.1,892.841678,851.0,514.0,727.999999,353.0,727.0,276.0,731.000001
XP_012704706.1,7.0,5.0,3.0,10.0,0.0,1.0,4.0,2.0


In [45]:
print("Unique Fhet annotations, contigs with expression")
print(len(fhet_merged_table.Fhet_GeneName.unique()))
print('Unique NCBI protein ID')
print(len(fhet_merged_table.NCBIproteinID.unique()))
print('Unique Trinity "genes"')
print(len(fhet_merged_table.Gene.unique()))
print('Unique Trinity "transcripts" (contigs) ')
print(len(fhet_merged_table.TrinityContig.unique()))
# fragments
#fhet_merged_table.to_csv(gene_out)

Unique Fhet annotations, contigs with expression
20877
Unique NCBI protein ID
20877
Unique Trinity "genes"
27387
Unique Trinity "transcripts" (contigs) 
66797
