# 2. After tximport in R

* Script: full_model.Rmd
* Input: "~/Documents/UCDavis/Whitehead/RNAseq_15killifish/salmon_denovo_by_species/"
* Output: "~/Documents/UCDavis/Whitehead/counts_matrices/*_counts.csv" for each species 

# 3. Merge annotations for each species, connecting Trinity contigs/genes to annotation

# Using dammit's `GFF3Parser` function 
1. Digests the gff3 file for each species (output from dammit, downloaded from the farm cluster)
2. Sorts each contig by E-value score
3. Assigns the lowest E-value score for each contig 
4. Separately, for each contig, saves gene names from the NCBI annotated F. heteroclitus genome

In [2]:
import os
import pandas as pd
# requires dammit env
# source activate dammit
from dammit.fileio.gff3 import GFF3Parser

In [3]:
counts_matrices = "/Users/johnsolk/Documents/UCDavis/Whitehead/counts_matrices_July2019/"
counts_files = os.listdir(counts_matrices)
gene_out_dir = "/Users/johnsolk/Documents/UCDavis/Whitehead/contig_gene_name_July2019_filtnew/"
print(counts_files)

['F_heteroclitusMDPP_counts.csv', 'F_parvapinis_counts.csv', 'L_parva_counts.csv', 'F_olivaceous_counts.csv', 'F_notatus_counts.csv', 'F_heteroclitusMDPL_counts.csv', 'F_grandis_counts.csv', 'F_zebrinus_counts.csv', 'A_xenica_counts.csv', 'F_catanatus_counts.csv', 'F_sciadicus_counts.csv', 'F_rathbuni_counts.csv', 'F_chrysotus_counts.csv', 'F_diaphanus_counts.csv', 'L_goodei_counts.csv', 'F_nottii_counts.csv', 'F_similis_counts.csv']


In [121]:
# 08/01/2019
# keep only F. het
# then pick one per contig
# if a contig does not have an F. het annotation, 
# keep track of this? but then drop it
for counts_file in counts_files:
    if counts_file != ".DS_Store":
        species = counts_file.split("_")[0]+"_"+counts_file.split("_")[1]
        print("========")
        print(species)
        print("========")
        gene_out = gene_out_dir + species + "_gene_counts_annotations_filt.csv"
        table = pd.read_csv("/Users/johnsolk/Documents/UCDavis/Whitehead/counts_matrices_July2019/"+counts_file)
        print('Number of Trinity "genes" (this is how we summarized expression):')
        print(table.shape)
        table = table.rename(columns={'Unnamed: 0': 'Gene'})
        # make sure expression counts >5
        # if countsvalue is >5 in any column, then keep
        table_filt = table[(table.iloc[:,1:] > 5).any(1)]
        print('Contigs filtered for expression (each row must have 5 counts):')
        print(table_filt.shape)
        name = "/Users/johnsolk/Documents/UCDavis/Whitehead/gff_annotations/"+species+".trinity_out.Trinity.fasta.dammit.fixed.gff3"
        conversion_contig = "/Users/johnsolk/Documents/UCDavis/Whitehead/counts_collapsed_July2019/"+species+"_contig_gene.csv"
        conversion_dammit = "/Users/johnsolk/Documents/UCDavis/Whitehead/dammit_conversions/"+species+".trinity_out.Trinity.fasta.dammit.namemap.csv"
        annotations = GFF3Parser(filename=name).read()
        annotations = annotations.dropna(subset=['Name'])
        # keeps track of how long the annotation is
        annotations["length"] = annotations["end"].subtract(annotations["start"], fill_value=0)
        conversions_dammit = pd.read_csv(conversion_dammit)
        conversions_contig = pd.read_csv(conversion_contig)
        conversions_dammit['Name'], conversions_dammit['info'] = conversions_dammit['original'].str.split(' ', 1).str
        conversions_dammit = conversions_dammit[['Name','renamed']]
        conversions_dammit.columns = ['Name','seqid']
        coversions_contig = conversions_contig[['Name','Gene']]
        # merge filtered counts with Trinity contig and gene ID
        merged_table = pd.merge(table_filt,conversions_contig,on="Gene",how='left')
        # merge with dammit transcript ID
        merged_table = pd.merge(merged_table,conversions_dammit,on="Name",how='left')
        # merge with annotation names
        fhet = annotations[annotations['Name'].str.startswith("ENSFHEP")]
        # pick best e-value match
        fhet_filtered = fhet.sort_values(by=['seqid','length'],ascending=False).drop_duplicates(subset='seqid')[['seqid', 'Name','start','end','length']]
        print("Unique Fhet gene names (one name per contig):")
        print(len(fhet_filtered.Name.unique()))
        fhet_merged_table = pd.merge(merged_table,fhet_filtered,on='seqid',how='inner')
        # gets rid of contigs without any F. het annotation
        # we probably lose ~2000 because of low counts
        # more "Genes" than annotations because the contigs are probably fragmented across a gene
        print("Unique Fhet gene names (one name per contig):")
        print(len(fhet_merged_table.Name_y.unique()))
        
        
        #fhet_merged_table = fhet_merged_table.drop('Unnamed: 0', 1)
        #fhet_merged_table = fhet_merged_table.rename(columns = {'Name_x':'TrinityContig','Name_y':'Fhet_GeneName'}) 
        #fhet_merged_table['split1'], fhet_merged_table['split2'],fhet_merged_table['split3'],fhet_merged_table['NCBIproteinID'],fhet_merged_table['NCBIproteinName'] = fhet_merged_table['Fhet_GeneName'].str.split('|', 5).str
        #fhet_merged_table = fhet_merged_table.drop('split1',1)
        #fhet_merged_table = fhet_merged_table.drop('split2',1)
        #fhet_merged_table = fhet_merged_table.drop('split3',1)
        #print("Unique Fhet annotations, contigs with expression")
        #print(len(fhet_merged_table.Fhet_GeneName.unique()))
        #print('Unique NCBI protein ID')
        #print(len(fhet_merged_table.NCBIproteinID.unique()))
        #print('Unique Trinity "genes"')
        #print(len(fhet_merged_table.Gene.unique()))
        #print('Unique Trinity "transcripts" (contigs) ')
        #print(len(fhet_merged_table.TrinityContig.unique()))
        #fhet = fhet_merged_table.sort_values(by=['Gene','length'],ascending=False).drop_duplicates(subset='Gene')
        #print('Collaposed genes')
        #print(fhet.shape)
        #fhet_collapse = fhet.groupby(['NCBIproteinID'])[list(fhet.columns)[1:len(list(fhet.columns))-1]].agg('sum')
        #print('Collaposed proteins')
        #print(fhet_collapse.shape)
        #fhet_collapse.head()
        #fhet_collapse.to_csv(gene_out)

F_heteroclitusMDPP
Number of Trinity "genes" (this is how we summarized expression):
(369971, 10)
Contigs filtered for expression (each row must have 5 counts):
(147578, 10)
Unique Fhet gene names (one name per contig):
22922
Unique Fhet gene names (one name per contig):
2665
F_parvapinis
Number of Trinity "genes" (this is how we summarized expression):
(219865, 9)
Contigs filtered for expression (each row must have 5 counts):
(65341, 9)
Unique Fhet gene names (one name per contig):
20432
Unique Fhet gene names (one name per contig):
1564
L_parva
Number of Trinity "genes" (this is how we summarized expression):
(204418, 10)
Contigs filtered for expression (each row must have 5 counts):
(65308, 10)
Unique Fhet gene names (one name per contig):
22157
Unique Fhet gene names (one name per contig):
1018
F_olivaceous
Number of Trinity "genes" (this is how we summarized expression):
(192082, 9)
Contigs filtered for expression (each row must have 5 counts):
(59830, 9)
Unique Fhet gene names (o

KeyboardInterrupt: 

# Test one species, F. heteroclitus MDPP

In [283]:
gene_out = gene_out_dir + "F_heteroclitusMDPP" + "_gene_counts_annotations_filt.csv"
table = pd.read_csv("/Users/johnsolk/Documents/UCDavis/Whitehead/counts_matrices_July2019/"+"F_heteroclitusMDPP_counts.csv")
#table = pd.read_csv("/Users/johnsolk/Documents/UCDavis/Whitehead/counts_matrices_Sept2018/"+"F_heteroclitusMDPP_counts.csv")
print('Number of Trinity "genes" (this is how we summarized expression):')
print(table.shape)
table = table.rename(columns={'Unnamed: 0': 'Gene'})

Number of Trinity "genes" (this is how we summarized expression):
(369971, 10)


In [284]:
table.head()

Unnamed: 0,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant
0,TRINITY_DN0_c0_g1,2118.367,3823.061,2043.86,1875.973,1741.001,3107.106,1181.237,1615.228,1885.474
1,TRINITY_DN0_c0_g2,4033.616,4634.489,3109.738,3562.385,1954.666,4909.949,1501.673,2716.748,3228.895
2,TRINITY_DN0_c0_g3,519.02,264.704,168.547,425.641,233.355,149.159,120.091,557.423,377.39
3,TRINITY_DN0_c1_g1,921.0,1300.001,853.0,1026.0,1329.0,1190.011,391.0,1101.0,936.0
4,TRINITY_DN0_c10_g1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [285]:
# if countsvalue is >5 in any column, then keep
table_filt = table[(table.iloc[:,1:] > 5).any(1)]
print('Contigs filtered:')
print(table_filt.shape)

Contigs filtered:
(147578, 10)


In [286]:
table_filt.head()

Unnamed: 0,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant
0,TRINITY_DN0_c0_g1,2118.367,3823.061,2043.86,1875.973,1741.001,3107.106,1181.237,1615.228,1885.474
1,TRINITY_DN0_c0_g2,4033.616,4634.489,3109.738,3562.385,1954.666,4909.949,1501.673,2716.748,3228.895
2,TRINITY_DN0_c0_g3,519.02,264.704,168.547,425.641,233.355,149.159,120.091,557.423,377.39
3,TRINITY_DN0_c1_g1,921.0,1300.001,853.0,1026.0,1329.0,1190.011,391.0,1101.0,936.0
5,TRINITY_DN0_c3_g1,150.0,134.0,87.0,186.0,94.0,141.0,108.0,102.0,126.975


In [184]:
name = "/Users/johnsolk/Documents/UCDavis/Whitehead/gff_annotations/"+"F_heteroclitusMDPP"+".trinity_out.Trinity.fasta.dammit.fixed.gff3"
conversion_contig = "/Users/johnsolk/Documents/UCDavis/Whitehead/contig_gene_23June2018/"+"F_heteroclitusMDPP"+"_contig_gene.csv"
conversion_dammit = "/Users/johnsolk/Documents/UCDavis/Whitehead/dammit_conversions/"+"F_heteroclitusMDPP"+".trinity_out.Trinity.fasta.dammit.namemap.csv"

In [185]:
annotations = GFF3Parser(filename=name).read()
annotations = annotations.dropna(subset=['Name'])

In [186]:
annotations.head()

Unnamed: 0,Dbxref,ID,Name,Note,Parent,Target,accuracy,bitscore,database,end,env_coords,phase,score,seqid,source,start,strand,trunc,type
0,"""Rfam:RF00381""",homology:497023,Antizyme_FSE,-,,Antizyme_FSE 1 59 +,,80.8,,395,,,2.6999999999999996e-19,Transcript_0,Infernal,336,-,no,RNA_sequence_secondary_structure
1,,homology:503492,ENSONIP00000006020,,,ENSONIP00000006020 0 192 +,,,OrthoDB,683,,,1.4e-158,Transcript_0,LAST,109,-,,translated_nucleotide_match
2,,homology:683059,sp|Q9YI98|OAZ1_DANRE,,,sp|Q9YI98|OAZ1_DANRE 0 186 +,,,sprot,683,,,3.6e-128,Transcript_0,LAST,109,-,,translated_nucleotide_match
3,,homology:1659499,ENSFHEP00000001247,,,ENSFHEP00000001247 59 179 +,,,Fundulus_heteroclitus.Fundulus_heteroclitus-3....,227,,,4.8e-84,Transcript_0,shmlast.LAST,107,+,,conditional_reciprocal_best_LAST
4,,homology:1444722,Funhe2EKm033591t1,,,Funhe2EKm033591t1 26 148 +,,,kfish2rae5g.pub.aa,152,,,1.1e-77,Transcript_0,shmlast.LAST,30,+,,conditional_reciprocal_best_LAST


In [187]:
# keeps track of how long the annotation is
annotations["length"] = annotations["end"].subtract(annotations["start"], fill_value=0)

In [188]:
#pickonename = annotations.sort_values(by=['seqid', 'score'], ascending=True).query('score < 1e-05').drop_duplicates(subset='seqid')[['seqid', 'Name','Note','database','Dbxref','start','end','length']]
#pickonename = pickonename.dropna(axis=0,how="all")
#print('Number of contigs with annotations (one annotation/contig, sorted by E-value < 1e-05 and picked the lowest):')
#print(pickonename.shape)

In [198]:
conversions_dammit = pd.read_csv(conversion_dammit)
conversions_contig = pd.read_csv(conversion_contig)

In [199]:
conversions_dammit['Name'], conversions_dammit['info'] = conversions_dammit['original'].str.split(' ', 1).str
conversions_dammit = conversions_dammit[['Name','renamed']]
conversions_dammit.columns = ['Name','seqid']
coversions_contig = conversions_contig[['Name','Gene']]

In [200]:
coversions_contig.head()

Unnamed: 0,Name,Gene
0,TRINITY_DN341363_c0_g1_i1,TRINITY_DN341363_c0_g1
1,TRINITY_DN341306_c0_g1_i1,TRINITY_DN341306_c0_g1
2,TRINITY_DN341347_c0_g1_i1,TRINITY_DN341347_c0_g1
3,TRINITY_DN341308_c0_g1_i1,TRINITY_DN341308_c0_g1
4,TRINITY_DN341353_c0_g1_i1,TRINITY_DN341353_c0_g1


In [201]:
coversions_contig.shape

(668484, 2)

In [202]:
conversions_dammit.head()

Unnamed: 0,Name,seqid
0,TRINITY_DN341363_c0_g1_i1,Transcript_0
1,TRINITY_DN341306_c0_g1_i1,Transcript_1
2,TRINITY_DN341347_c0_g1_i1,Transcript_2
3,TRINITY_DN341308_c0_g1_i1,Transcript_3
4,TRINITY_DN341353_c0_g1_i1,Transcript_4


In [203]:
conversions_dammit.shape

(668487, 2)

In [204]:
conversions_contig = pd.merge(conversions_contig,conversions_dammit,on="Name",how='right')

In [210]:
conversions_contig = conversions_contig.drop('Unnamed: 0', 1)
conversions_contig.head()

Unnamed: 0,Name,Gene,seqid
0,TRINITY_DN341363_c0_g1_i1,TRINITY_DN341363_c0_g1,Transcript_0
1,TRINITY_DN341306_c0_g1_i1,TRINITY_DN341306_c0_g1,Transcript_1
2,TRINITY_DN341347_c0_g1_i1,TRINITY_DN341347_c0_g1,Transcript_2
3,TRINITY_DN341308_c0_g1_i1,TRINITY_DN341308_c0_g1,Transcript_3
4,TRINITY_DN341353_c0_g1_i1,TRINITY_DN341353_c0_g1,Transcript_4


In [211]:
conversions_contig.shape

(668484, 3)

In [218]:
table_filt.head()

Unnamed: 0,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant
0,TRINITY_DN0_c0_g1,2118.367,3823.061,2043.86,1875.973,1741.001,3107.106,1181.237,1615.228,1885.474
1,TRINITY_DN0_c0_g2,4033.616,4634.489,3109.738,3562.385,1954.666,4909.949,1501.673,2716.748,3228.895
2,TRINITY_DN0_c0_g3,519.02,264.704,168.547,425.641,233.355,149.159,120.091,557.423,377.39
3,TRINITY_DN0_c1_g1,921.0,1300.001,853.0,1026.0,1329.0,1190.011,391.0,1101.0,936.0
5,TRINITY_DN0_c3_g1,150.0,134.0,87.0,186.0,94.0,141.0,108.0,102.0,126.975


In [219]:
table_filt.shape

(147578, 10)

In [220]:
# merge filtered counts with Trinity contig and gene ID
merged_table = pd.merge(conversions_contig,table_filt,on="Gene",how='right')

In [221]:
merged_table.head()

Unnamed: 0,Name,Gene,seqid,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant
0,TRINITY_DN300070_c0_g1_i1,TRINITY_DN300070_c0_g1,Transcript_657,0.0,0.0,0.0,11.0,0.0,0.0,3.0,0.0,0.0
1,TRINITY_DN300063_c0_g1_i1,TRINITY_DN300063_c0_g1,Transcript_658,0.0,0.0,2.0,8.0,0.0,0.0,11.0,2.0,1.0
2,TRINITY_DN300085_c0_g1_i1,TRINITY_DN300085_c0_g1,Transcript_673,14.87,9.0,11.0,0.0,0.0,0.0,5.0,0.0,0.0
3,TRINITY_DN300054_c0_g1_i1,TRINITY_DN300054_c0_g1,Transcript_682,2.0,3.0,6.0,3.0,0.0,2.0,1.0,2.0,2.0
4,TRINITY_DN300093_c0_g1_i1,TRINITY_DN300093_c0_g1,Transcript_687,0.0,0.0,0.0,6.0,0.0,0.0,2.0,0.0,2.0


In [222]:
merged_table.shape

(150470, 12)

### merge with annotation names

In [287]:
annotations.shape

(2006410, 20)

In [288]:
annotations.head()

Unnamed: 0,Dbxref,ID,Name,Note,Parent,Target,accuracy,bitscore,database,end,env_coords,phase,score,seqid,source,start,strand,trunc,type,length
0,"""Rfam:RF00381""",homology:497023,Antizyme_FSE,-,,Antizyme_FSE 1 59 +,,80.8,,395,,,2.6999999999999996e-19,Transcript_0,Infernal,336,-,no,RNA_sequence_secondary_structure,59
1,,homology:503492,ENSONIP00000006020,,,ENSONIP00000006020 0 192 +,,,OrthoDB,683,,,1.4e-158,Transcript_0,LAST,109,-,,translated_nucleotide_match,574
2,,homology:683059,sp|Q9YI98|OAZ1_DANRE,,,sp|Q9YI98|OAZ1_DANRE 0 186 +,,,sprot,683,,,3.6e-128,Transcript_0,LAST,109,-,,translated_nucleotide_match,574
3,,homology:1659499,ENSFHEP00000001247,,,ENSFHEP00000001247 59 179 +,,,Fundulus_heteroclitus.Fundulus_heteroclitus-3....,227,,,4.8e-84,Transcript_0,shmlast.LAST,107,+,,conditional_reciprocal_best_LAST,120
4,,homology:1444722,Funhe2EKm033591t1,,,Funhe2EKm033591t1 26 148 +,,,kfish2rae5g.pub.aa,152,,,1.1e-77,Transcript_0,shmlast.LAST,30,+,,conditional_reciprocal_best_LAST,122


In [289]:
# any annotation
pickone = annotations.sort_values(by=['seqid','score'],ascending=True).query('score < 1e-05').drop_duplicates(subset='seqid')[['seqid', 'Name','start','end','length']]

In [290]:
pickone.head()

Unnamed: 0,seqid,Name,start,end,length
1,Transcript_0,ENSONIP00000006020,109,683,574
110685,Transcript_10,Funhe2EKm026362t2,709,871,162
11229,Transcript_100,ENSXMAP00000019040,284,1373,1089
1425,Transcript_1000,ENSXETP00000058488,88,217,129
82,Transcript_10000,ENSDARP00000070224,156,3633,3477


In [291]:
pickone.shape

(177922, 5)

In [294]:
# only Ensembl
# many per contig
fhet = annotations.loc[annotations['database'] == 'Fundulus_heteroclitus.Fundulus_heteroclitus-3.0.2.pep.all.fa']
#fhet = annotations[annotations['Name'].str.startswith("ENSFHEP")]
fhet.shape

(152572, 20)

In [295]:
# still multiple Ensembl per seqid contig
fhet.head()

Unnamed: 0,Dbxref,ID,Name,Note,Parent,Target,accuracy,bitscore,database,end,env_coords,phase,score,seqid,source,start,strand,trunc,type,length
3,,homology:1659499,ENSFHEP00000001247,,,ENSFHEP00000001247 59 179 +,,,Fundulus_heteroclitus.Fundulus_heteroclitus-3....,227,,,4.8e-84,Transcript_0,shmlast.LAST,107,+,,conditional_reciprocal_best_LAST,120
5,,homology:1659500,ENSFHEP00000001247,,,ENSFHEP00000001247 0 59 +,,,Fundulus_heteroclitus.Fundulus_heteroclitus-3....,95,,,8.2e-37,Transcript_0,shmlast.LAST,36,+,,conditional_reciprocal_best_LAST,59
18,,homology:1510136,ENSFHEP00000010891,,,ENSFHEP00000010891 0 354 +,,,Fundulus_heteroclitus.Fundulus_heteroclitus-3....,380,,,4.6e-253,Transcript_100001,shmlast.LAST,26,+,,conditional_reciprocal_best_LAST,354
34,,homology:1510135,ENSFHEP00000010891,,,ENSFHEP00000010891 75 354 +,,,Fundulus_heteroclitus.Fundulus_heteroclitus-3....,404,,,4.2000000000000002e-196,Transcript_100002,shmlast.LAST,125,+,,conditional_reciprocal_best_LAST,279
42,,homology:1510134,ENSFHEP00000010891,,,ENSFHEP00000010891 0 79 +,,,Fundulus_heteroclitus.Fundulus_heteroclitus-3....,129,,,7.7e-49,Transcript_100002,shmlast.LAST,50,+,,conditional_reciprocal_best_LAST,79


In [298]:
# pick on Fhet per contig
# sort by length
ens = fhet.sort_values(by=['seqid','length'],ascending=True).drop_duplicates(subset='seqid')[['seqid', 'Name','start','end','length']]

In [303]:
# now only one annotation/seqid
ens.shape

(76704, 5)

In [302]:
ens.head()

Unnamed: 0,seqid,Name,start,end,length
5,Transcript_0,ENSFHEP00000001247,36,95,59
110687,Transcript_10,ENSFHEP00000022696,740,879,139
11231,Transcript_100,ENSFHEP00000028412,94,457,363
100,Transcript_10000,ENSFHEP00000000648,52,1227,1175
18,Transcript_100001,ENSFHEP00000010891,26,380,354


In [301]:
fhet_filtered = fhet.sort_values(by=['seqid','score'],ascending=True).query('score < 1e-05').drop_duplicates(subset='seqid')[['seqid', 'Name','start','end','length']]
print("Unique Fhet Ensembl gene names (one name per contig):")
print(len(fhet_filtered.Name.unique()))

Unique Fhet Ensembl gene names (one name per contig):
23074


In [230]:
fhet_filtered.head()

Unnamed: 0,seqid,Name,start,end,length
3,Transcript_0,ENSFHEP00000001247,107,227,120
110687,Transcript_10,ENSFHEP00000022696,740,879,139
11231,Transcript_100,ENSFHEP00000028412,94,457,363
100,Transcript_10000,ENSFHEP00000000648,52,1227,1175
18,Transcript_100001,ENSFHEP00000010891,26,380,354


In [231]:
fhet_filtered.shape

(76702, 5)

In [232]:
merged_table.head()

Unnamed: 0,Name,Gene,seqid,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant
0,TRINITY_DN300070_c0_g1_i1,TRINITY_DN300070_c0_g1,Transcript_657,0.0,0.0,0.0,11.0,0.0,0.0,3.0,0.0,0.0
1,TRINITY_DN300063_c0_g1_i1,TRINITY_DN300063_c0_g1,Transcript_658,0.0,0.0,2.0,8.0,0.0,0.0,11.0,2.0,1.0
2,TRINITY_DN300085_c0_g1_i1,TRINITY_DN300085_c0_g1,Transcript_673,14.87,9.0,11.0,0.0,0.0,0.0,5.0,0.0,0.0
3,TRINITY_DN300054_c0_g1_i1,TRINITY_DN300054_c0_g1,Transcript_682,2.0,3.0,6.0,3.0,0.0,2.0,1.0,2.0,2.0
4,TRINITY_DN300093_c0_g1_i1,TRINITY_DN300093_c0_g1,Transcript_687,0.0,0.0,0.0,6.0,0.0,0.0,2.0,0.0,2.0


In [233]:
merged_table.shape

(150470, 12)

In [329]:
ens.head()

Unnamed: 0,seqid,Name,start,end,length
5,Transcript_0,ENSFHEP00000001247,36,95,59
110687,Transcript_10,ENSFHEP00000022696,740,879,139
11231,Transcript_100,ENSFHEP00000028412,94,457,363
100,Transcript_10000,ENSFHEP00000000648,52,1227,1175
18,Transcript_100001,ENSFHEP00000010891,26,380,354


In [304]:
table_filt.head()

Unnamed: 0,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant
0,TRINITY_DN0_c0_g1,2118.367,3823.061,2043.86,1875.973,1741.001,3107.106,1181.237,1615.228,1885.474
1,TRINITY_DN0_c0_g2,4033.616,4634.489,3109.738,3562.385,1954.666,4909.949,1501.673,2716.748,3228.895
2,TRINITY_DN0_c0_g3,519.02,264.704,168.547,425.641,233.355,149.159,120.091,557.423,377.39
3,TRINITY_DN0_c1_g1,921.0,1300.001,853.0,1026.0,1329.0,1190.011,391.0,1101.0,936.0
5,TRINITY_DN0_c3_g1,150.0,134.0,87.0,186.0,94.0,141.0,108.0,102.0,126.975


In [305]:
table_filt.shape

(147578, 10)

In [306]:
fhet_merged_table = pd.merge(ens,conversions_contig,on='seqid',how='left')
fhet_merged_table.head()
# gets rid of contigs without any F. het annotation Ensembl

Unnamed: 0,seqid,Name_x,start,end,length,Name_y,Gene
0,Transcript_0,ENSFHEP00000001247,36,95,59,TRINITY_DN341363_c0_g1_i1,TRINITY_DN341363_c0_g1
1,Transcript_10,ENSFHEP00000022696,740,879,139,TRINITY_DN341343_c0_g1_i1,TRINITY_DN341343_c0_g1
2,Transcript_100,ENSFHEP00000028412,94,457,363,TRINITY_DN311101_c0_g1_i1,TRINITY_DN311101_c0_g1
3,Transcript_10000,ENSFHEP00000000648,52,1227,1175,TRINITY_DN308331_c0_g1_i1,TRINITY_DN308331_c0_g1
4,Transcript_100001,ENSFHEP00000010891,26,380,354,TRINITY_DN115512_c0_g1_i1,TRINITY_DN115512_c0_g1


In [307]:
fhet_merged_table.shape

(76704, 7)

In [308]:
# Ensembl
print("Unique Fhet gene names (one name per contig):")
print(len(fhet_merged_table.Name_x.unique()))

Unique Fhet gene names (one name per contig):
23882


In [316]:
fhet_merged_table.head()

Unnamed: 0,seqid,Name_x,start,end,length,Name_y,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant
0,Transcript_100001,ENSFHEP00000010891,26,380,354,TRINITY_DN115512_c0_g1_i1,TRINITY_DN115512_c0_g1,0.0,0.0,1.0,8.0,0.0,0.0,2.0,0.0,0.0
1,Transcript_100019,ENSFHEP00000022304.1 pep primary_assembly:Fund...,264,628,364,TRINITY_DN115592_c0_g1_i1,TRINITY_DN115592_c0_g1,0.0,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0
2,Transcript_100021,ENSFHEP00000022304.1 pep primary_assembly:Fund...,241,425,184,TRINITY_DN115535_c0_g1_i1,TRINITY_DN115535_c0_g1,0.0,0.0,0.0,11.0,0.0,0.0,2.0,0.0,0.0
3,Transcript_100047,ENSFHEP00000000967,62,374,312,TRINITY_DN106787_c0_g1_i1,TRINITY_DN106787_c0_g1,6.0,10.0,6.0,3.0,5.0,9.0,2.0,9.0,14.0
4,Transcript_100062,ENSFHEP00000024506.1 pep primary_assembly:Fund...,117,285,168,TRINITY_DN106759_c0_g1_i1,TRINITY_DN106759_c0_g1,26.0,34.0,9.0,31.0,7.0,23.0,13.0,27.0,19.0


In [315]:
table_filt.head()

Unnamed: 0,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant
0,TRINITY_DN0_c0_g1,2118.367,3823.061,2043.86,1875.973,1741.001,3107.106,1181.237,1615.228,1885.474
1,TRINITY_DN0_c0_g2,4033.616,4634.489,3109.738,3562.385,1954.666,4909.949,1501.673,2716.748,3228.895
2,TRINITY_DN0_c0_g3,519.02,264.704,168.547,425.641,233.355,149.159,120.091,557.423,377.39
3,TRINITY_DN0_c1_g1,921.0,1300.001,853.0,1026.0,1329.0,1190.011,391.0,1101.0,936.0
5,TRINITY_DN0_c3_g1,150.0,134.0,87.0,186.0,94.0,141.0,108.0,102.0,126.975


In [314]:
# filtered based on low expression
table_filt.shape

(147578, 10)

In [312]:
fhet_merged_table = pd.merge(fhet_merged_table,table_filt,on='Gene',how='inner')

In [313]:
fhet_merged_table.shape

(3662, 16)

In [317]:
ens.head()

Unnamed: 0,seqid,Name,start,end,length
5,Transcript_0,ENSFHEP00000001247,36,95,59
110687,Transcript_10,ENSFHEP00000022696,740,879,139
11231,Transcript_100,ENSFHEP00000028412,94,457,363
100,Transcript_10000,ENSFHEP00000000648,52,1227,1175
18,Transcript_100001,ENSFHEP00000010891,26,380,354


In [337]:
fhet.head()

Unnamed: 0,Dbxref,ID,Name,Note,Parent,Target,accuracy,bitscore,database,end,env_coords,phase,score,seqid,source,start,strand,trunc,type,length
3,,homology:1659499,ENSFHEP00000001247,,,ENSFHEP00000001247 59 179 +,,,Fundulus_heteroclitus.Fundulus_heteroclitus-3....,227,,,4.8e-84,Transcript_0,shmlast.LAST,107,+,,conditional_reciprocal_best_LAST,120
5,,homology:1659500,ENSFHEP00000001247,,,ENSFHEP00000001247 0 59 +,,,Fundulus_heteroclitus.Fundulus_heteroclitus-3....,95,,,8.2e-37,Transcript_0,shmlast.LAST,36,+,,conditional_reciprocal_best_LAST,59
18,,homology:1510136,ENSFHEP00000010891,,,ENSFHEP00000010891 0 354 +,,,Fundulus_heteroclitus.Fundulus_heteroclitus-3....,380,,,4.6e-253,Transcript_100001,shmlast.LAST,26,+,,conditional_reciprocal_best_LAST,354
34,,homology:1510135,ENSFHEP00000010891,,,ENSFHEP00000010891 75 354 +,,,Fundulus_heteroclitus.Fundulus_heteroclitus-3....,404,,,4.2000000000000002e-196,Transcript_100002,shmlast.LAST,125,+,,conditional_reciprocal_best_LAST,279
42,,homology:1510134,ENSFHEP00000010891,,,ENSFHEP00000010891 0 79 +,,,Fundulus_heteroclitus.Fundulus_heteroclitus-3....,129,,,7.7e-49,Transcript_100002,shmlast.LAST,50,+,,conditional_reciprocal_best_LAST,79


In [335]:
ens.shape

(76704, 5)

In [338]:
ens_contigs = pd.merge(ens,conversions_contig,on='seqid',how='left')

In [339]:
ens_contigs.head()

Unnamed: 0,seqid,Name_x,start,end,length,Name_y,Gene
0,Transcript_0,ENSFHEP00000001247,36,95,59,TRINITY_DN341363_c0_g1_i1,TRINITY_DN341363_c0_g1
1,Transcript_10,ENSFHEP00000022696,740,879,139,TRINITY_DN341343_c0_g1_i1,TRINITY_DN341343_c0_g1
2,Transcript_100,ENSFHEP00000028412,94,457,363,TRINITY_DN311101_c0_g1_i1,TRINITY_DN311101_c0_g1
3,Transcript_10000,ENSFHEP00000000648,52,1227,1175,TRINITY_DN308331_c0_g1_i1,TRINITY_DN308331_c0_g1
4,Transcript_100001,ENSFHEP00000010891,26,380,354,TRINITY_DN115512_c0_g1_i1,TRINITY_DN115512_c0_g1


In [340]:
ens_contigs.shape

(76704, 7)

In [341]:
table_filt.shape

(147578, 10)

In [342]:
table_filt.head()

Unnamed: 0,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant
0,TRINITY_DN0_c0_g1,2118.367,3823.061,2043.86,1875.973,1741.001,3107.106,1181.237,1615.228,1885.474
1,TRINITY_DN0_c0_g2,4033.616,4634.489,3109.738,3562.385,1954.666,4909.949,1501.673,2716.748,3228.895
2,TRINITY_DN0_c0_g3,519.02,264.704,168.547,425.641,233.355,149.159,120.091,557.423,377.39
3,TRINITY_DN0_c1_g1,921.0,1300.001,853.0,1026.0,1329.0,1190.011,391.0,1101.0,936.0
5,TRINITY_DN0_c3_g1,150.0,134.0,87.0,186.0,94.0,141.0,108.0,102.0,126.975


In [349]:
ens_merge = pd.merge(ens_contigs,table_filt,on='Gene',how='inner')

In [350]:
ens_merge.shape

(3662, 16)

In [351]:
ens_merge.head()

Unnamed: 0,seqid,Name_x,start,end,length,Name_y,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant
0,Transcript_100001,ENSFHEP00000010891,26,380,354,TRINITY_DN115512_c0_g1_i1,TRINITY_DN115512_c0_g1,0.0,0.0,1.0,8.0,0.0,0.0,2.0,0.0,0.0
1,Transcript_100019,ENSFHEP00000022304.1 pep primary_assembly:Fund...,264,628,364,TRINITY_DN115592_c0_g1_i1,TRINITY_DN115592_c0_g1,0.0,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0
2,Transcript_100021,ENSFHEP00000022304.1 pep primary_assembly:Fund...,241,425,184,TRINITY_DN115535_c0_g1_i1,TRINITY_DN115535_c0_g1,0.0,0.0,0.0,11.0,0.0,0.0,2.0,0.0,0.0
3,Transcript_100047,ENSFHEP00000000967,62,374,312,TRINITY_DN106787_c0_g1_i1,TRINITY_DN106787_c0_g1,6.0,10.0,6.0,3.0,5.0,9.0,2.0,9.0,14.0
4,Transcript_100062,ENSFHEP00000024506.1 pep primary_assembly:Fund...,117,285,168,TRINITY_DN106759_c0_g1_i1,TRINITY_DN106759_c0_g1,26.0,34.0,9.0,31.0,7.0,23.0,13.0,27.0,19.0


In [352]:
ens_merge.shape

(3662, 16)

In [353]:
print(len(ens_merge.Name_x.unique()))

2924


In [89]:
fhet_merged_table = fhet_merged_table.rename(columns = {'Name_x':'TrinityContig','Name_y':'Fhet_GeneName'})                                           

In [90]:
fhet_merged_table.head()

Unnamed: 0,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant,TrinityContig,seqid,Fhet_GeneName,start,end,length
0,TRINITY_DN100767_c0_g1,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,1.0,TRINITY_DN100767_c0_g1_i1,Transcript_221533,gi|831525541|ref|XP_012719085.1| PREDICTED: zo...,9,56,47
1,TRINITY_DN10175_c0_g1,1.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,9.0,TRINITY_DN10175_c0_g1_i1,Transcript_605742,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,220,220
2,TRINITY_DN10249_c0_g1,0.0,0.0,0.0,1.0,0.0,0.0,17.0,0.0,0.0,TRINITY_DN10249_c0_g1_i1,Transcript_618557,gi|831577816|ref|XP_012737259.1| PREDICTED: ez...,0,45,45
3,TRINITY_DN102633_c0_g1,0.0,2.0,0.0,9.0,2.0,1.0,5.0,0.0,0.0,TRINITY_DN102633_c0_g1_i1,Transcript_201922,gi|831498824|ref|XP_012709872.1| PREDICTED: co...,30,68,38
4,TRINITY_DN102877_c0_g1,125.0,79.0,78.0,52.0,55.0,105.0,40.0,92.0,72.0,TRINITY_DN102877_c0_g1_i1,Transcript_143501,gi|831491323|ref|XP_012707217.1| PREDICTED: LY...,0,26,26


In [91]:
fhet_merged_table['split1'], fhet_merged_table['split2'],fhet_merged_table['split3'],fhet_merged_table['NCBIproteinID'],fhet_merged_table['NCBIproteinName'] = fhet_merged_table['Fhet_GeneName'].str.split('|', 5).str
fhet_merged_table = fhet_merged_table.drop('split1',1)
fhet_merged_table = fhet_merged_table.drop('split2',1)
fhet_merged_table = fhet_merged_table.drop('split3',1)

In [92]:
fhet_merged_table.head()

Unnamed: 0,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant,TrinityContig,seqid,Fhet_GeneName,start,end,length,NCBIproteinID,NCBIproteinName
0,TRINITY_DN100767_c0_g1,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,1.0,TRINITY_DN100767_c0_g1_i1,Transcript_221533,gi|831525541|ref|XP_012719085.1| PREDICTED: zo...,9,56,47,XP_012719085.1,PREDICTED: zona pellucida sperm-binding prote...
1,TRINITY_DN10175_c0_g1,1.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,9.0,TRINITY_DN10175_c0_g1_i1,Transcript_605742,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,220,220,XP_012706852.1,PREDICTED: titin-like [Fundulus heteroclitus]
2,TRINITY_DN10249_c0_g1,0.0,0.0,0.0,1.0,0.0,0.0,17.0,0.0,0.0,TRINITY_DN10249_c0_g1_i1,Transcript_618557,gi|831577816|ref|XP_012737259.1| PREDICTED: ez...,0,45,45,XP_012737259.1,PREDICTED: ezrin-like [Fundulus heteroclitus]
3,TRINITY_DN102633_c0_g1,0.0,2.0,0.0,9.0,2.0,1.0,5.0,0.0,0.0,TRINITY_DN102633_c0_g1_i1,Transcript_201922,gi|831498824|ref|XP_012709872.1| PREDICTED: co...,30,68,38,XP_012709872.1,PREDICTED: collagen alpha-1(XXVII) chain B-li...
4,TRINITY_DN102877_c0_g1,125.0,79.0,78.0,52.0,55.0,105.0,40.0,92.0,72.0,TRINITY_DN102877_c0_g1_i1,Transcript_143501,gi|831491323|ref|XP_012707217.1| PREDICTED: LY...,0,26,26,XP_012707217.1,PREDICTED: LYR motif-containing protein 4 iso...


In [93]:
fhet_merged_table.shape

(66797, 18)

In [94]:
titin = fhet_merged_table.loc[fhet_merged_table['NCBIproteinID'] == 'XP_012706852.1']
titin.head()

Unnamed: 0,Gene,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant,F_heteroclitusMDPP_transfer_3.quant,TrinityContig,seqid,Fhet_GeneName,start,end,length,NCBIproteinID,NCBIproteinName
1,TRINITY_DN10175_c0_g1,1.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,9.0,TRINITY_DN10175_c0_g1_i1,Transcript_605742,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,220,220,XP_012706852.1,PREDICTED: titin-like [Fundulus heteroclitus]
465,TRINITY_DN146980_c0_g1,2.0,2.0,1.0,3.0,0.0,8.0,3.0,6.0,27.0,TRINITY_DN146980_c0_g1_i1,Transcript_91068,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,497,497,XP_012706852.1,PREDICTED: titin-like [Fundulus heteroclitus]
1016,TRINITY_DN170990_c2_g1,3.0,1.0,2.0,3.0,3.0,1.0,2.0,0.0,7.0,TRINITY_DN170990_c2_g1_i1,Transcript_202201,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,293,293,XP_012706852.1,PREDICTED: titin-like [Fundulus heteroclitus]
1022,TRINITY_DN171266_c0_g2,8.0,0.0,0.0,0.0,1.0,2.0,2.0,2.5,0.0,TRINITY_DN171266_c0_g2_i1,Transcript_175700,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,309,309,XP_012706852.1,PREDICTED: titin-like [Fundulus heteroclitus]
1367,TRINITY_DN179170_c2_g1,4.0,6.0,0.0,2.0,2.0,1.0,2.0,3.0,3.0,TRINITY_DN179170_c2_g1_i1,Transcript_54728,gi|831490281|ref|XP_012706852.1| PREDICTED: ti...,0,218,218,XP_012706852.1,PREDICTED: titin-like [Fundulus heteroclitus]


In [102]:
# sort by length descending, drop duplicate genes
titin_gene = titin.sort_values(by=['Gene','length'],ascending=False).drop_duplicates(subset='Gene')
#titin_gene.head(20)

In [118]:
# now, collapse matrix by protein, sum expression for all fragments for each sample
# colSum[2:10] for unique proteins 'NCBIproteinID'
titin_collapse = titin_gene.groupby(['NCBIproteinID'])[list(titin_gene.columns)[1:9]].agg('sum')
#titin_collapse = titin_gene.groupby(['NCBIproteinID']).sum()
titin_collapse.head()

Unnamed: 0_level_0,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant
NCBIproteinID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
XP_012706852.1,9267.948719,5022.968313,2300.499999,7129.471635,2996.556833,4430.794692,3312.109758,7756.500001


In [122]:
# now do this on everything
fhet = fhet_merged_table.sort_values(by=['Gene','length'],ascending=False).drop_duplicates(subset='Gene')
print(fhet.shape)
fhet_collapse = fhet.groupby(['NCBIproteinID'])[list(fhet.columns)[1:9]].agg('sum')
print(fhet_collapse.shape)
fhet_collapse.head()

(27387, 18)
(18626, 8)


Unnamed: 0_level_0,F_heteroclitusMDPP_BW_1.quant,F_heteroclitusMDPP_BW_2.quant,F_heteroclitusMDPP_BW_3.quant,F_heteroclitusMDPP_FW_1.quant,F_heteroclitusMDPP_FW_2.quant,F_heteroclitusMDPP_FW_3.quant,F_heteroclitusMDPP_transfer_1.quant,F_heteroclitusMDPP_transfer_2.quant
NCBIproteinID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
XP_012704699.1,277.294571,329.0,165.227518,250.0,143.160386,170.0,110.590974,213.904699
XP_012704700.1,373.0,478.0,242.000001,409.0,265.202322,334.0,207.035645,505.69083
XP_012704702.1,664.999999,837.0,476.0,823.0,520.699682,695.999999,304.0,526.0
XP_012704705.1,892.841678,851.0,514.0,727.999999,353.0,727.0,276.0,731.000001
XP_012704706.1,7.0,5.0,3.0,10.0,0.0,1.0,4.0,2.0


In [45]:
print("Unique Fhet annotations, contigs with expression")
print(len(fhet_merged_table.Fhet_GeneName.unique()))
print('Unique NCBI protein ID')
print(len(fhet_merged_table.NCBIproteinID.unique()))
print('Unique Trinity "genes"')
print(len(fhet_merged_table.Gene.unique()))
print('Unique Trinity "transcripts" (contigs) ')
print(len(fhet_merged_table.TrinityContig.unique()))
# fragments
#fhet_merged_table.to_csv(gene_out)

Unique Fhet annotations, contigs with expression
20877
Unique NCBI protein ID
20877
Unique Trinity "genes"
27387
Unique Trinity "transcripts" (contigs) 
66797
