In [2]:
import os
import pandas as pd
# requires dammit env
# source activate dammit
from dammit.fileio.gff3 import GFF3Parser

# Using dammit's `GFF3Parser` function 
1. Digests the gff3 file for each species (output from dammit, downloaded from the farm cluster)
2. Sorts each contig by E-value score
3. Assigns the lowest E-value score for each contig 
4. Separately, for each contig, saves gene names from the NCBI annotated F. heteroclitus genome

Writes 2 separate .csv files:

* (species)_Fhet_genenames.csv
* (species)_onegenenamepertranscript.csv

In [22]:
annotations_dir = "/Users/johnsolk/Documents/UCDavis/Whitehead/gff_annotations/"
annotations = os.listdir(annotations_dir)
print(annotations)

['F_notatus.trinity_out.Trinity.fasta.dammit.gff3', '.DS_Store', 'F_rathbuni.trinity_out.Trinity.fasta.dammit.gff3', 'F_diaphanus.trinity_out.Trinity.fasta.dammit.gff3', 'F_chrysotus.trinity_out.Trinity.fasta.dammit.gff3', 'L_goodei.trinity_out.Trinity.fasta.dammit.gff3', 'F_heteroclitusMDPL.trinity_out.Trinity.fasta.dammit.gff3', 'F_heteroclitusMDPP.trinity_out.Trinity.fasta.dammit.gff3', 'F_similis.trinity_out.Trinity.fasta.dammit.gff3', 'L_parva.trinity_out.Trinity.fasta.dammit.gff3', 'A_xenica.trinity_out.Trinity.fasta.dammit.gff3', 'F_catanatus.trinity_out.Trinity.fasta.dammit.gff3', 'F_zebrinus.trinity_out.Trinity.fasta.dammit.gff3', 'F_parvapinis.trinity_out.Trinity.fasta.dammit.gff3', 'F_grandis.trinity_out.Trinity.fasta.dammit.gff3', 'F_sciadicus.trinity_out.Trinity.fasta.dammit.gff3', 'F_olivaceous.trinity_out.Trinity.fasta.dammit.gff3', 'F_notti.trinity_out.Trinity.fasta.dammit.gff3']


In [148]:
annotation = "A_xenica.trinity_out.Trinity.fasta.dammit.gff3"
species = annotation.split(".")[0]
print(annotation)
print(species)

A_xenica.trinity_out.Trinity.fasta.dammit.gff3
A_xenica


In [149]:
name = annotations_dir + annotation
annotations = GFF3Parser(filename=name).read()

  dtype=dict(self.columns)):


In [150]:
annotations["length"] = annotations["end"].subtract(annotations["start"], fill_value=0)
annotations.head()

Unnamed: 0,Dbxref,ID,Name,Note,Parent,Target,accuracy,bitscore,database,end,env_coords,phase,score,seqid,source,start,strand,trunc,type,length
0,,homology:425062,90820,,,90820 1132 1240 +,,,OrthoDB,366,,,5.5e-47,Transcript_100000,LAST,36,+,,translated_nucleotide_match,330
1,"""Pfam:PF13465.2""",homology:53130,zf-H2C2_2,Zinc-finger double domain,,zf-H2C2_2 1 25 +,0.94,,,198,127 201,,0.0012,Transcript_100001,HMMER,126,,,protein_hmm_match,72
2,"""Pfam:PF13894.2""",homology:53126,zf-C2H2_4,C2H2-type zinc finger,,zf-C2H2_4 2 23 +,0.94,,,234,166 237,,0.004,Transcript_100001,HMMER,168,,,protein_hmm_match,66
3,"""Pfam:PF12171.4""",homology:53128,zf-C2H2_jaz,Zinc-finger double-stranded RNA-binding,,zf-C2H2_jaz 4 23 +,0.93,,,231,172 240,,0.0026,Transcript_100001,HMMER,171,,,protein_hmm_match,60
4,"""Pfam:PF13912.2""",homology:53133,zf-C2H2_6,C2H2-type zinc finger,,zf-C2H2_6 4 24 +,0.96,,,234,169 240,,0.0058,Transcript_100001,HMMER,171,,,protein_hmm_match,63


In [151]:
print('Number of annotations (multiple/contig):',annotations.shape)

Number of annotations (multiple/contig): (1740771, 20)


In [152]:
annotations = annotations.dropna(subset=['Name'])
print(annotations.head())

             Dbxref               ID         Name  \
0               NaN  homology:425062        90820   
1  "Pfam:PF13465.2"   homology:53130    zf-H2C2_2   
2  "Pfam:PF13894.2"   homology:53126    zf-C2H2_4   
3  "Pfam:PF12171.4"   homology:53128  zf-C2H2_jaz   
4  "Pfam:PF13912.2"   homology:53133    zf-C2H2_6   

                                      Note Parent              Target  \
0                                      NaN    NaN   90820 1132 1240 +   
1                Zinc-finger double domain    NaN    zf-H2C2_2 1 25 +   
2                    C2H2-type zinc finger    NaN    zf-C2H2_4 2 23 +   
3  Zinc-finger double-stranded RNA-binding    NaN  zf-C2H2_jaz 4 23 +   
4                    C2H2-type zinc finger    NaN    zf-C2H2_6 4 24 +   

  accuracy bitscore database  end env_coords  phase         score  \
0      NaN      NaN  OrthoDB  366        NaN    NaN  5.500000e-47   
1     0.94      NaN      NaN  198    127 201    NaN  1.200000e-03   
2     0.94      NaN      NaN  234  

In [153]:
print('Number of annotations with a name:',annotations.shape)
# For later, what are the annotations without names? (make a subset df minus the named df)

Number of annotations with a name: (1089393, 20)


In [154]:
all_names = annotations.sort_values(by=['seqid'],ascending=True)[['seqid', 'Name','Note','database','Dbxref','start','end','length']]
pickonename = annotations.sort_values(by=['seqid', 'score'], ascending=True).query('score < 1e-05').drop_duplicates(subset='seqid')[['seqid', 'Name','Note','database','Dbxref','start','end','length']]
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)                                                                                                                               
pickonename.head(1000)                                                                                               
                                                

Unnamed: 0,seqid,Name,Note,database,Dbxref,start,end,length
0,Transcript_100000,90820,,OrthoDB,,36,366,330
12,Transcript_100001,90820,,OrthoDB,,89,770,681
27,Transcript_100005,ENSONIP00000019130,,OrthoDB,,2,1952,1950
71,Transcript_100011,ENSONIP00000011620,,OrthoDB,,203,353,150
78,Transcript_100012,ENSONIP00000011620,,OrthoDB,,236,608,372
79,Transcript_100013,ENSONIP00000011620,,OrthoDB,,123,273,150
80,Transcript_100014,MAT2A_F,-,,"""Rfam:RF02265""",75,130,55
81,Transcript_100016,ENSXMAP00000004158,,OrthoDB,,807,1230,423
85,Transcript_100017,ENSXMAP00000004158,,OrthoDB,,270,1044,774
93,Transcript_100018,ENSXMAP00000004158,,OrthoDB,,807,1237,430


# Based on expression
* We have expression quant per contig
* other methods have filtered based on contig with highest expression
* We will drop contigs with no expression

In [155]:
out_dir = "/Users/johnsolk/Documents/UCDavis/Whitehead/trinity_contig_gene_tables/"
contigs_w_expression = pd.read_csv(out_dir + 'A_xenica_trinity_contig_gene_table_filtered.txt',sep="\t",names = ['contig','gene'])
contigs_w_expression.head()

Unnamed: 0,contig,gene
0,TRINITY_DN2270_c0_g1_i1,TRINITY_DN2270_c0_g1
1,TRINITY_DN2201_c0_g1_i1,TRINITY_DN2201_c0_g1
2,TRINITY_DN2279_c0_g1_i1,TRINITY_DN2279_c0_g1
3,TRINITY_DN2262_c0_g1_i1,TRINITY_DN2262_c0_g1
4,TRINITY_DN2253_c0_g1_i1,TRINITY_DN2253_c0_g1


In [156]:
contigs_w_expression.shape

(207976, 2)

In [202]:
# All the contigs
contigs = "A_xenica_trinity_contig_gene_table.txt"
contigs_table = pd.read_csv(out_dir + contigs,sep="\t",names = ['contig','gene'])
contigs_table.head()

Unnamed: 0,contig,gene
0,TRINITY_DN2202_c0_g1_i1,TRINITY_DN2202_c0_g1
1,TRINITY_DN2270_c0_g1_i1,TRINITY_DN2270_c0_g1
2,TRINITY_DN2201_c0_g1_i1,TRINITY_DN2201_c0_g1
3,TRINITY_DN2222_c0_g1_i1,TRINITY_DN2222_c0_g1
4,TRINITY_DN2291_c0_g1_i1,TRINITY_DN2291_c0_g1


In [203]:
# All the contigs
contigs_table.shape

(362782, 2)

Conversion table between dammit gene ID, e.g. 'Transcript_100001' and Trinity contig ID, e.g. 'TRINITY_DN2270_c0_g1_i1'

In [159]:
conversion_dir = "/Users/johnsolk/Documents/UCDavis/Whitehead/dammit_conversions/"
conversions = os.listdir(conversion_dir)
print(conversions)

['F_diaphanus.trinity_out.Trinity.fasta.dammit.namemap.csv', 'F_chrysotus.trinity_out.Trinity.fasta.dammit.namemap.csv', 'L_goodei.trinity_out.Trinity.fasta.dammit.namemap.csv', 'F_heteroclitusMDPL.trinity_out.Trinity.fasta.dammit.namemap.csv', 'F_grandis.trinity_out.Trinity.fasta.dammit.namemap.csv', 'F_olivaceous.trinity_out.Trinity.fasta.dammit.namemap.csv', 'F_zebrinus.trinity_out.Trinity.fasta.dammit.namemap.csv', 'F_catanatus.trinity_out.Trinity.fasta.dammit.namemap.csv', 'F_sciadicus.trinity_out.Trinity.fasta.dammit.namemap.csv', 'F_heteroclitusMDPP.trinity_out.Trinity.fasta.dammit.namemap.csv', 'F_similis.trinity_out.Trinity.fasta.dammit.namemap.csv', 'A_xenica.trinity_out.Trinity.fasta.dammit.namemap.csv', 'F_parvapinis.trinity_out.Trinity.fasta.dammit.namemap.csv', 'F_notatus.trinity_out.Trinity.fasta.dammit.namemap.csv', 'L_parva.trinity_out.Trinity.fasta.dammit.namemap.csv', 'F_notti.trinity_out.Trinity.fasta.dammit.namemap.csv', 'F_rathbuni.trinity_out.Trinity.fasta.dammit

In [184]:
conversion_file = conversion_dir + 'A_xenica.trinity_out.Trinity.fasta.dammit.namemap.csv'
conversions = pd.read_csv(conversion_file)
conversions['contig'], conversions['info'] = conversions['original'].str.split(' ', 1).str
conversions = conversions[['contig','renamed']]
conversions.head()

Unnamed: 0,contig,renamed
0,TRINITY_DN2202_c0_g1_i1,Transcript_0
1,TRINITY_DN2270_c0_g1_i1,Transcript_1
2,TRINITY_DN2201_c0_g1_i1,Transcript_2
3,TRINITY_DN2222_c0_g1_i1,Transcript_3
4,TRINITY_DN2291_c0_g1_i1,Transcript_4


In [161]:
conversions.shape

(362783, 2)

In [162]:
contigs_w_expression.shape

(207976, 2)

In [163]:
contigs_w_expression_conversion = pd.merge(conversions, contigs_w_expression, on="contig")
contigs_w_expression_conversion.head()

Unnamed: 0,contig,renamed,gene
0,TRINITY_DN2270_c0_g1_i1,Transcript_1,TRINITY_DN2270_c0_g1
1,TRINITY_DN2201_c0_g1_i1,Transcript_2,TRINITY_DN2201_c0_g1
2,TRINITY_DN2279_c0_g1_i1,Transcript_7,TRINITY_DN2279_c0_g1
3,TRINITY_DN2262_c0_g1_i1,Transcript_8,TRINITY_DN2262_c0_g1
4,TRINITY_DN2253_c0_g1_i1,Transcript_9,TRINITY_DN2253_c0_g1


In [164]:
contigs_w_expression_conversion.shape

(207976, 3)

# Pick one gene name per contig

## based on e-value

In [165]:
pickonename = pickonename.dropna(axis=0,how="all")
print('Number of contigs with annotations (one annotation/contig, sorted by E-value < 1e-05 and picked the lowest):')
print(pickonename.shape)

Number of contigs with annotations (one annotation/contig, sorted by E-value < 1e-05 and picked the lowest):
(148980, 8)


In [166]:
pickonename.head(1000)

Unnamed: 0,seqid,Name,Note,database,Dbxref,start,end,length
0,Transcript_100000,90820,,OrthoDB,,36,366,330
12,Transcript_100001,90820,,OrthoDB,,89,770,681
27,Transcript_100005,ENSONIP00000019130,,OrthoDB,,2,1952,1950
71,Transcript_100011,ENSONIP00000011620,,OrthoDB,,203,353,150
78,Transcript_100012,ENSONIP00000011620,,OrthoDB,,236,608,372
79,Transcript_100013,ENSONIP00000011620,,OrthoDB,,123,273,150
80,Transcript_100014,MAT2A_F,-,,"""Rfam:RF02265""",75,130,55
81,Transcript_100016,ENSXMAP00000004158,,OrthoDB,,807,1230,423
85,Transcript_100017,ENSXMAP00000004158,,OrthoDB,,270,1044,774
93,Transcript_100018,ENSXMAP00000004158,,OrthoDB,,807,1237,430


# Number of named contigs with expression

In [167]:
contigs_w_expression_conversion.columns = ['contig','seqid','gene']
contigs_w_expression_conversion.head()

Unnamed: 0,contig,seqid,gene
0,TRINITY_DN2270_c0_g1_i1,Transcript_1,TRINITY_DN2270_c0_g1
1,TRINITY_DN2201_c0_g1_i1,Transcript_2,TRINITY_DN2201_c0_g1
2,TRINITY_DN2279_c0_g1_i1,Transcript_7,TRINITY_DN2279_c0_g1
3,TRINITY_DN2262_c0_g1_i1,Transcript_8,TRINITY_DN2262_c0_g1
4,TRINITY_DN2253_c0_g1_i1,Transcript_9,TRINITY_DN2253_c0_g1


In [168]:
annotations_expression = pd.merge(contigs_w_expression_conversion,pickonename,on='seqid')
annotations_expression.head()

Unnamed: 0,contig,seqid,gene,Name,Note,database,Dbxref,start,end,length
0,TRINITY_DN2279_c0_g1_i1,Transcript_7,TRINITY_DN2279_c0_g1,OABI002927-RA,,OrthoDB,,5,359,354
1,TRINITY_DN2293_c0_g1_i2,Transcript_47,TRINITY_DN2293_c0_g1,ENSDARP00000005795,,OrthoDB,,450,510,60
2,TRINITY_DN2276_c0_g1_i1,Transcript_49,TRINITY_DN2276_c0_g1,gi|831522803|ref|XP_012718119.1| PREDICTED: ch...,,protein.fa,,0,21,21
3,TRINITY_DN2229_c0_g1_i1,Transcript_51,TRINITY_DN2229_c0_g1,ENSXMAP00000000711,,OrthoDB,,196,865,669
4,TRINITY_DN73078_c0_g1_i1,Transcript_63,TRINITY_DN73078_c0_g1,ENSONIP00000009481,,OrthoDB,,432,528,96


In [169]:
annotations_expression.shape

(97671, 10)

In [170]:
print("This is the number of unique gene names:")
print(len(annotations_expression.Name.unique()))

This is the number of unique gene names:
37921


# Fundulus heteroclitus reference model annotations ONLY

In [171]:
fhet = annotations[annotations['Name'].str.startswith("gi")]
fhet.head(50)

Unnamed: 0,Dbxref,ID,Name,Note,Parent,Target,accuracy,bitscore,database,end,env_coords,phase,score,seqid,source,start,strand,trunc,type,length
28,,homology:583173,gi|831543230|ref|XP_012725447.1| PREDICTED: F-...,,,gi|831543230|ref|XP_012725447.1| PREDICTED: F-...,,,protein.fa,650,,,0.0,Transcript_100005,shmlast.LAST,0,+,,conditional_reciprocal_best_LAST,650
86,,homology:583176,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,,,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,,,protein.fa,1086,,,3.2e-16,Transcript_100017,shmlast.LAST,1050,+,,conditional_reciprocal_best_LAST,36
94,,homology:583177,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,,,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,,,protein.fa,957,,,2.9e-16,Transcript_100018,shmlast.LAST,921,+,,conditional_reciprocal_best_LAST,36
100,,homology:583178,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,,,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,,,protein.fa,1034,,,3.1e-16,Transcript_100019,shmlast.LAST,998,+,,conditional_reciprocal_best_LAST,36
109,,homology:583179,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,,,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,,,protein.fa,227,,,4.4999999999999997e-94,Transcript_100020,shmlast.LAST,91,+,,conditional_reciprocal_best_LAST,136
121,,homology:583174,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,,,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,,,protein.fa,487,,,1.2e-279,Transcript_100021,shmlast.LAST,91,+,,conditional_reciprocal_best_LAST,396
130,,homology:583181,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,,,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,,,protein.fa,357,,,3.3e-142,Transcript_100022,shmlast.LAST,144,+,,conditional_reciprocal_best_LAST,213
141,,homology:583180,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,,,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,,,protein.fa,276,,,3.4999999999999997e-131,Transcript_100023,shmlast.LAST,91,+,,conditional_reciprocal_best_LAST,185
166,,homology:583182,gi|831561931|ref|XP_012731869.1| PREDICTED: pu...,,,gi|831561931|ref|XP_012731869.1| PREDICTED: pu...,,,protein.fa,337,,,2.6e-178,Transcript_100025,shmlast.LAST,74,+,,conditional_reciprocal_best_LAST,263
189,,homology:583184,gi|831573610|ref|XP_012735760.1| PREDICTED: sm...,,,gi|831573610|ref|XP_012735760.1| PREDICTED: sm...,,,protein.fa,552,,,4.8e-46,Transcript_100036,shmlast.LAST,471,+,,conditional_reciprocal_best_LAST,81


In [172]:
fhet.shape

(161802, 20)

In [174]:
fhet_names = fhet.sort_values(by=['seqid'], ascending=True)[['seqid', 'Name','start','end','length']]
fhet_names.head()

Unnamed: 0,seqid,Name,start,end,length
28,Transcript_100005,gi|831543230|ref|XP_012725447.1| PREDICTED: F-...,0,650,650
86,Transcript_100017,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,1050,1086,36
94,Transcript_100018,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,921,957,36
100,Transcript_100019,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,998,1034,36
109,Transcript_100020,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,91,227,136


In [175]:
fhet_names.shape

(161802, 5)

In [188]:
print(annotations.loc[annotations['seqid'] == 'Transcript_158687'])

                   Dbxref                          ID  \
338644  "Pfam:PF01630.14"             homology:135966   
338645                NaN             homology:455102   
338646                NaN             homology:645636   
338647                NaN             homology:645641   
338648                NaN             homology:645646   
338652                NaN  Transcript_158687|g.137461   
338653                NaN  Transcript_158687|m.137461   

                                                     Name           Note  \
338644                                     Glyco_hydro_56  Hyaluronidase   
338645                                 ENSXMAP00000017261            NaN   
338646  gi|831516815|ref|XP_012716020.1| PREDICTED: hy...            NaN   
338647  gi|831516818|ref|XP_012716021.1| PREDICTED: hy...            NaN   
338648  gi|831516821|ref|XP_012716022.1| PREDICTED: hy...            NaN   
338652  ORF%20Transcript_158687%7Cg.137461%20Transcrip...            NaN   
338653  ORF

In [176]:
fhet_filtered = fhet.query('score < 1e-05').drop_duplicates(subset='seqid')[['seqid', 'Name','start','end','length']]

In [177]:
fhet_filtered.shape

(73458, 5)

In [197]:
fhet_annotations = pd.merge(annotations_expression,fhet_filtered,on='seqid',how='inner')
fhet_annotations.shape

(52487, 14)

In [198]:
fhet_annotations.head(100)

Unnamed: 0,contig,seqid,gene,Name_x,Note,database,Dbxref,start_x,end_x,length_x,Name_y,start_y,end_y,length_y
0,TRINITY_DN2293_c0_g1_i2,Transcript_47,TRINITY_DN2293_c0_g1,ENSDARP00000005795,,OrthoDB,,450,510,60,gi|831503966|ref|XP_012711679.1| PREDICTED: pi...,2,22,20
1,TRINITY_DN2276_c0_g1_i1,Transcript_49,TRINITY_DN2276_c0_g1,gi|831522803|ref|XP_012718119.1| PREDICTED: ch...,,protein.fa,,0,21,21,gi|831522803|ref|XP_012718119.1| PREDICTED: ch...,0,21,21
2,TRINITY_DN2229_c0_g1_i1,Transcript_51,TRINITY_DN2229_c0_g1,ENSXMAP00000000711,,OrthoDB,,196,865,669,gi|831546950|ref|XP_012726773.1| PREDICTED: re...,65,288,223
3,TRINITY_DN73065_c0_g2_i1,Transcript_114,TRINITY_DN73065_c0_g2,ENSONIP00000022462,,OrthoDB,,57,459,402,gi|831473992|ref|XP_012717323.1| PREDICTED: gl...,23,152,129
4,TRINITY_DN91626_c0_g1_i1,Transcript_122,TRINITY_DN91626_c0_g1,gi|831551567|ref|XP_012728395.1| PREDICTED: re...,,protein.fa,,18,239,221,gi|831551567|ref|XP_012728395.1| PREDICTED: re...,18,239,221
5,TRINITY_DN91626_c3_g1_i1,Transcript_125,TRINITY_DN91626_c3_g1,ENSXMAP00000004000,,OrthoDB,,1,277,276,gi|831551552|ref|XP_012728390.1| PREDICTED: re...,0,92,92
6,TRINITY_DN91626_c4_g1_i1,Transcript_126,TRINITY_DN91626_c4_g1,ENSXMAP00000004000,,OrthoDB,,0,339,339,gi|831551543|ref|XP_012728387.1| PREDICTED: re...,0,113,113
7,TRINITY_DN91612_c1_g1_i1,Transcript_211,TRINITY_DN91612_c1_g1,gi|831567308|ref|XP_012733664.1| PREDICTED: vo...,,protein.fa,,0,76,76,gi|831567308|ref|XP_012733664.1| PREDICTED: vo...,0,76,76
8,TRINITY_DN91612_c2_g1_i1,Transcript_212,TRINITY_DN91612_c2_g1,gi|831567308|ref|XP_012733664.1| PREDICTED: vo...,,protein.fa,,0,84,84,gi|831567308|ref|XP_012733664.1| PREDICTED: vo...,0,84,84
9,TRINITY_DN91640_c0_g1_i1,Transcript_256,TRINITY_DN91640_c0_g1,ENSXMAP00000001308,,OrthoDB,,46,1417,1371,gi|831475890|ref|XP_012708845.1| PREDICTED: 6-...,15,472,457


In [199]:
fhet_annotations = fhet_annotations.drop_duplicates(subset='seqid')
fhet_annotations.shape

(52487, 14)

In [201]:
print(len(fhet_annotations.Name_x.unique()))

24893


In [183]:
new = fhet_annotations.to_csv("/Users/johnsolk/Documents/UCDavis/Whitehead/annotation_files_test/A_xenica_gene_names.csv")

In [205]:
fhet_names['split_1'], fhet_names['split_2'], fhet_names['split_3'],fhet_names['split_4'],fhet_names['split_5'] = fhet_names ['Name'].str.split('|',0).str

In [206]:
fhet_names.head()

Unnamed: 0,seqid,Name,start,end,length,split_1,split_2,split_3,split_4,split_5
28,Transcript_100005,gi|831543230|ref|XP_012725447.1| PREDICTED: F-...,0,650,650,gi,831543230,ref,XP_012725447.1,PREDICTED: F-box/LRR-repeat protein 17 [Fundu...
86,Transcript_100017,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,1050,1086,36,gi,831561925,ref,XP_012731866.1,PREDICTED: S-adenosylmethionine synthase isof...
94,Transcript_100018,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,921,957,36,gi,831561925,ref,XP_012731866.1,PREDICTED: S-adenosylmethionine synthase isof...
100,Transcript_100019,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,998,1034,36,gi,831561925,ref,XP_012731866.1,PREDICTED: S-adenosylmethionine synthase isof...
109,Transcript_100020,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,91,227,136,gi,831561925,ref,XP_012731866.1,PREDICTED: S-adenosylmethionine synthase isof...


In [208]:
fhet_names.columns

Index(['seqid', 'Name', 'start', 'end', 'length', 'split_1', 'split_2',
       'split_3', 'split_4', 'split_5'],
      dtype='object')

In [211]:
fhet_names.columns = ['seqid', 'Name', 'start', 'end', 'length', 'split_1', 'id','split_3', 'protein_id', 'Notes']
fhet_names = fhet_names[['seqid', 'Name', 'start', 'end', 'length', 'id','protein_id', 'Notes']]

In [212]:
fhet_names.head()

Unnamed: 0,seqid,Name,start,end,length,id,protein_id,Notes
28,Transcript_100005,gi|831543230|ref|XP_012725447.1| PREDICTED: F-...,0,650,650,831543230,XP_012725447.1,PREDICTED: F-box/LRR-repeat protein 17 [Fundu...
86,Transcript_100017,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,1050,1086,36,831561925,XP_012731866.1,PREDICTED: S-adenosylmethionine synthase isof...
94,Transcript_100018,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,921,957,36,831561925,XP_012731866.1,PREDICTED: S-adenosylmethionine synthase isof...
100,Transcript_100019,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,998,1034,36,831561925,XP_012731866.1,PREDICTED: S-adenosylmethionine synthase isof...
109,Transcript_100020,gi|831561925|ref|XP_012731866.1| PREDICTED: S-...,91,227,136,831561925,XP_012731866.1,PREDICTED: S-adenosylmethionine synthase isof...


In [213]:
fhet_names = pd.merge(fhet_names,protein_transcript,how="inner",on="protein_id")

NameError: name 'protein_transcript' is not defined