# Manually Curation of non-Aligned Locus Tags

___
## Setup

In [1]:
import sys
sys.path.append('..')

In [2]:
from pymodulon.gene_util import *
from tqdm.notebook import tqdm
import numpy as np
import os
from Bio import SeqIO

In [3]:
org_dir = '/Users/siddharth/PycharmProjects/modulome_saci'
kegg_organism_code = 'sai'
seq_dir = os.path.join(org_dir,'sequence_files')
sacid_seq_dir = os.path.join(org_dir,'Sacid_prokka')

### Get information from GFF file

#### Convert GFF to Pandas DataFrame

In [4]:
annot_list = []
for filename in os.listdir(seq_dir):
    if filename.endswith('.gff3'):
        gff = os.path.join(seq_dir,filename)
        annot_list.append(gff2pandas(gff))
keep_cols = ['refseq','start','end','strand','gene_name','locus_tag','old_locus_tag','gene_product','ncbi_protein']
DF_annot = pd.concat(annot_list)[keep_cols]
DF_annot = DF_annot.drop_duplicates('locus_tag')
DF_annot.set_index('locus_tag',drop=True,inplace=True)

In [5]:
annot_list = []
for filename in os.listdir(sacid_seq_dir):
    if filename.endswith('.gff'):
        gff = os.path.join(sacid_seq_dir,filename)
        annot_list.append(gff2pandas(gff))
keep_cols = ['refseq','start','end','strand','gene_name','locus_tag','old_locus_tag','gene_product','ncbi_protein']
DF_annot_sacid = pd.concat(annot_list)[keep_cols]
DF_annot_sacid = DF_annot_sacid.drop_duplicates('locus_tag')
DF_annot_sacid.set_index('locus_tag',drop=True,inplace=True)

In [6]:
tpm_file = os.path.join(org_dir,'data','log_tpm.csv')
DF_log_tpm = pd.read_csv(tpm_file,index_col=0)

Check that the genes are the same in the expression dataset as in the annotation dataframe.

In [7]:
DF_log_tpm.head()

Unnamed: 0_level_0,ERX1518397,ERX1518398,ERX1518399,ERX3018360,ERX3018361,ERX3018362,ERX3018363,SRX2548838,SRX2548839,SRX2548840,...,SRX5653264,SRX5653265,SRX5653266,SRX5653267,SRX5653268,SRX5653269,SRX6762909,SRX6762910,SRX6762911,SRX6762912
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SACI_RS00005,0.0,0.0,0.0,8.903589,8.430008,8.961871,8.59526,8.448594,8.228731,8.368168,...,6.848788,7.185195,7.376186,7.685027,6.601357,7.652005,8.639968,8.791053,8.445832,8.605442
SACI_RS00010,0.0,0.0,0.0,8.103548,7.884489,8.137422,7.954576,7.012592,8.351356,8.608128,...,6.114308,6.546701,6.703395,7.01319,5.926302,6.883948,8.100742,8.554694,8.134199,8.34369
SACI_RS00015,11.275116,11.282262,9.90887,10.824914,11.150282,10.843054,11.12013,9.559263,6.69793,7.179276,...,10.860233,10.735659,10.796574,10.874813,11.073584,10.869027,10.799551,11.091902,10.519661,10.883387
SACI_RS00020,6.535285,0.0,0.0,4.920237,5.854611,5.760206,5.885891,6.917147,5.140496,3.635002,...,5.309888,5.338677,5.418699,5.533618,5.420223,5.384989,5.386592,5.732974,5.931306,5.798582
SACI_RS00025,7.016261,0.0,0.0,7.904266,8.076969,7.998144,7.867476,7.082449,7.023425,6.744682,...,8.011013,8.092458,8.111934,8.131836,8.261467,8.238378,7.956496,7.930298,7.902762,7.961625


In [8]:
DF_annot.head()

Unnamed: 0_level_0,refseq,start,end,strand,gene_name,old_locus_tag,gene_product,ncbi_protein
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SACI_RS00005,NC_007181.1,101,1261,+,,Saci_0001,AAA family ATPase,WP_011276932.1
SACI_RS00010,NC_007181.1,1294,1629,+,,Saci_0002,hypothetical protein,WP_011276933.1
SACI_RS00015,NC_007181.1,1665,2504,+,,Saci_0003,hypothetical protein,WP_011276934.1
SACI_RS00020,NC_007181.1,2553,3056,-,,Saci_0004,hypothetical protein,WP_015385334.1
SACI_RS00025,NC_007181.1,3049,3768,-,,Saci_0005,hypothetical protein,WP_011276936.1


In [9]:
DF_annot_sacid.head()

Unnamed: 0_level_0,refseq,start,end,strand,gene_name,old_locus_tag,gene_product,ncbi_protein
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Sacid_00002,NC_007181.1,1294.0,1629.0,+,,,hypothetical protein,
Sacid_00003,NC_007181.1,1665.0,2504.0,+,,,hypothetical protein,
Sacid_00004,NC_007181.1,2553.0,3110.0,-,,,hypothetical protein,
Sacid_00005,NC_007181.1,3049.0,3768.0,-,,,hypothetical protein,
Sacid_00006,NC_007181.1,3801.0,4052.0,-,,,hypothetical protein,


###### Annotations differ between 'SACI' and 'Sacid', need to map between them to make a unified DF_annot file

In [10]:
# Initial Merge (Based on refseq, start, end, and strand columns)

DF_fin = DF_annot.reset_index().merge(DF_annot_sacid.reset_index(),
                                      how='inner',
                                      on=['refseq', 'start', 'end', 'strand'])


## Rename columns in new merged DataFrame

rename_dict = {'locus_tag_x': 'locus_tag',
               'locus_tag_y': 'prokka_locus_tag',
               'old_locus_tag_x': 'old_locus_tag',
               'ncbi_protein_x': 'ncbi_protein'}

DF_fin.rename(columns=rename_dict,
              inplace=True)

## Drop columns with only None values
DF_fin.drop(columns=['old_locus_tag_y', 'ncbi_protein_y'],
            inplace=True)

DF_fin['gene_name'] = None

In [11]:
def gene_name_cmp(x, y):
    out = None
    
    if x == None and y != None:
        out = y
    elif x != None and y == None:
        out = x
    
    elif x != None and y != None:
        if x == y:
            out = x
        
        else:
            out = str(x) + ', ' + str(y)
    
    return out


# Merge gene name columns
for idx in tqdm(DF_fin.index):
    DF_fin.loc[idx, 'gene_name'] = gene_name_cmp(DF_fin.loc[idx, 'gene_name_x'], DF_fin.loc[idx, 'gene_name_y'])

DF_fin.drop(columns=['gene_name_x', 'gene_name_y'],
            inplace=True)

DF_fin.set_index('locus_tag', inplace=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1942.0), HTML(value='')))




In [12]:
DF_fin

Unnamed: 0_level_0,refseq,start,end,strand,old_locus_tag,gene_product_x,ncbi_protein,prokka_locus_tag,gene_product_y,gene_name
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SACI_RS00010,NC_007181.1,1294,1629,+,Saci_0002,hypothetical protein,WP_011276933.1,Sacid_00002,hypothetical protein,
SACI_RS00015,NC_007181.1,1665,2504,+,Saci_0003,hypothetical protein,WP_011276934.1,Sacid_00003,hypothetical protein,
SACI_RS00025,NC_007181.1,3049,3768,-,Saci_0005,hypothetical protein,WP_011276936.1,Sacid_00005,hypothetical protein,
SACI_RS00030,NC_007181.1,3801,4052,-,Saci_0006,winged helix-turn-helix transcriptional regulator,WP_011276937.1,Sacid_00006,hypothetical protein,
SACI_RS11965,NC_007181.1,4138,4419,-,Saci_0007,hypothetical protein,WP_061972215.1,Sacid_00007,hypothetical protein,
...,...,...,...,...,...,...,...,...,...,...
SACI_RS11460,NC_007181.1,2217234,2218298,-,Saci_2370,aspartate-semialdehyde dehydrogenase,WP_011279150.1,Sacid_02368,Aspartate-semialdehyde dehydrogenase,"asd, asd_2"
SACI_RS11465,NC_007181.1,2218618,2219355,-,Saci_2371,class I SAM-dependent methyltransferase,WP_011279151.1,Sacid_02369,2-methoxy-6-polyprenyl-1%2C4-benzoquinol methy...,COQ5_5
SACI_RS11470,NC_007181.1,2219468,2220394,+,Saci_2372,ornithine cyclodeaminase family protein,WP_011279152.1,Sacid_02370,Delta(1)-pyrroline-2-carboxylate reductase,
SACI_RS11475,NC_007181.1,2220381,2220989,-,Saci_2373,cob(I)yrinic acid a%2Cc-diamide adenosyltransf...,WP_011279153.1,Sacid_02371,Cobalamin adenosyltransferase,cobO


In [13]:
# Unified DF_annot DataFrame generated and partially filled
DF_annot_union = pd.DataFrame(data=DF_fin, index=DF_annot.index)
DF_annot_union

Unnamed: 0_level_0,refseq,start,end,strand,old_locus_tag,gene_product_x,ncbi_protein,prokka_locus_tag,gene_product_y,gene_name
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SACI_RS00005,,,,,,,,,,
SACI_RS00010,NC_007181.1,1294.0,1629.0,+,Saci_0002,hypothetical protein,WP_011276933.1,Sacid_00002,hypothetical protein,
SACI_RS00015,NC_007181.1,1665.0,2504.0,+,Saci_0003,hypothetical protein,WP_011276934.1,Sacid_00003,hypothetical protein,
SACI_RS00020,,,,,,,,,,
SACI_RS00025,NC_007181.1,3049.0,3768.0,-,Saci_0005,hypothetical protein,WP_011276936.1,Sacid_00005,hypothetical protein,
...,...,...,...,...,...,...,...,...,...,...
SACI_RS11465,NC_007181.1,2218618.0,2219355.0,-,Saci_2371,class I SAM-dependent methyltransferase,WP_011279151.1,Sacid_02369,2-methoxy-6-polyprenyl-1%2C4-benzoquinol methy...,COQ5_5
SACI_RS11470,NC_007181.1,2219468.0,2220394.0,+,Saci_2372,ornithine cyclodeaminase family protein,WP_011279152.1,Sacid_02370,Delta(1)-pyrroline-2-carboxylate reductase,
SACI_RS11475,NC_007181.1,2220381.0,2220989.0,-,Saci_2373,cob(I)yrinic acid a%2Cc-diamide adenosyltransf...,WP_011279153.1,Sacid_02371,Cobalamin adenosyltransferase,cobO
SACI_RS11480,NC_007181.1,2221039.0,2224263.0,-,Saci_2374,S8 family serine peptidase,WP_011279154.1,Sacid_02372,hypothetical protein,


___
## Manual Curation Starts Here

### Merge and Curate along 1 limiting parameter (Start vs End)

#### Merge values by Start only

In [14]:
extra = set(DF_annot.index) - set(DF_fin.index)
DF_extra_start = DF_annot.loc[extra].reset_index().merge(DF_annot_sacid.reset_index(),
                                                   how='inner', on=['refseq', 'start', 'strand'])
DF_extra_start

Unnamed: 0,locus_tag_x,refseq,start,end_x,strand,gene_name_x,old_locus_tag_x,gene_product_x,ncbi_protein_x,locus_tag_y,end_y,gene_name_y,old_locus_tag_y,gene_product_y,ncbi_protein_y
0,SACI_RS01355,NC_007181.1,234430,235188,-,,Saci_0278,VIT1/CCC1 transporter family protein,WP_011277196.1,Sacid_00274,235173.0,,,hypothetical protein,
1,SACI_RS03880,NC_007181.1,649445,650179,-,,Saci_0811,RNA-guided pseudouridylation complex pseudouri...,WP_080504008.1,Sacid_00790,650152.0,truB_1,,tRNA pseudouridine synthase B,
2,SACI_RS10635,NC_007181.1,2037777,2038472,-,,Saci_2200,uroporphyrinogen-III C-methyltransferase,WP_011278988.1,Sacid_02200,2038493.0,cysG_2,,Siroheme synthase,
3,SACI_RS02045,NC_007181.1,358904,359723,+,,,aldolase,,Sacid_00414,359311.0,aroA',,2-amino-3%2C7-dideoxy-D-threo-hept-6-ulosonate...,
4,SACI_RS03785,NC_007181.1,633321,634028,-,,Saci_0791,thioredoxin family protein,WP_011277663.1,Sacid_00771,634052.0,,,hypothetical protein,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,SACI_RS11650,NC_007181.1,1096722,1098830,-,,Saci_1292,fibronectin type III domain-containing protein,WP_015385592.1,Sacid_01268,1098872.0,nanM,,N-acetylneuraminate epimerase,
146,SACI_RS03160,NC_007181.1,525232,526518,-,,Saci_0663,histidine--tRNA ligase,WP_011277549.1,Sacid_00645,526491.0,hisS,,Histidine--tRNA ligase,
147,SACI_RS00690,NC_007181.1,114813,115748,-,,Saci_0145,acyl-CoA thioesterase,WP_011277067.1,Sacid_00140,115757.0,,,hypothetical protein,
148,SACI_RS08665,NC_007181.1,1576764,1577747,-,,Saci_1809,energy-coupling factor transporter transmembra...,WP_011278617.1,Sacid_01778,1577765.0,ecfT,,Energy-coupling factor transporter transmembra...,


#### Merge values by End only

In [15]:
DF_extra_stop = DF_annot.loc[extra].reset_index().merge(DF_annot_sacid.reset_index(),
                                                   how='inner', on=['refseq', 'end', 'strand'])
DF_extra_stop

Unnamed: 0,locus_tag_x,refseq,start_x,end,strand,gene_name_x,old_locus_tag_x,gene_product_x,ncbi_protein_x,locus_tag_y,start_y,gene_name_y,old_locus_tag_y,gene_product_y,ncbi_protein_y
0,SACI_RS09355,NC_007181.1,1750538,1751416,+,,Saci_1939,fumarylacetoacetate hydrolase family protein,WP_011278744.1,Sacid_01924,1750478.0,,,hypothetical protein,
1,SACI_RS02045,NC_007181.1,358904,359723,+,,,aldolase,,Sacid_00415,359304.0,lsrF,,3-hydroxy-5-phosphonooxypentane-2%2C4-dione th...,
2,SACI_RS00735,NC_007181.1,124051,125265,+,,Saci_0154,hypothetical protein,WP_015385373.1,Sacid_00149,124009.0,,,hypothetical protein,
3,SACI_RS11120,NC_007181.1,2142036,2143856,+,,Saci_2298,AMP-binding protein,WP_011279086.1,Sacid_02300,2142033.0,acsA_6,,Acetyl-coenzyme A synthetase,
4,SACI_RS05580,NC_007181.1,987096,987896,+,,Saci_1171,MarR family transcriptional regulator,WP_015385569.1,Sacid_01148,987087.0,,,hypothetical protein,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,SACI_RS08780,NC_007181.1,1596826,1597425,+,,Saci_1831,DUF2250 domain-containing protein,WP_015385681.1,Sacid_01800,1596802.0,,,hypothetical protein,
156,SACI_RS05795,NC_007181.1,1036276,1036800,+,,Saci_1216,DUF2258 domain-containing protein,WP_015385577.1,Sacid_01193,1036327.0,,,hypothetical protein,
157,SACI_RS05305,NC_007181.1,915975,916871,+,,Saci_1113,acyl-CoA/acyl-ACP dehydrogenase,WP_015385555.1,Sacid_01089,915963.0,,,hypothetical protein,
158,SACI_RS00265,NC_007181.1,44258,45793,+,,Saci_0059,phosphoenolpyruvate carboxylase,WP_015385346.1,Sacid_00056,44222.0,ppcA,,Phosphoenolpyruvate carboxylase,


#### Find and Curate Intersection

This intersection is filled with multiple Sacid loci being mapped to one SACI locus, and so will be mapped manually

In [16]:
cmn_genes = set(DF_extra_start.locus_tag_x).intersection(set(DF_extra_stop.locus_tag_x))

DF_extra_start.set_index('locus_tag_x').loc[cmn_genes].head()

Unnamed: 0_level_0,refseq,start,end_x,strand,gene_name_x,old_locus_tag_x,gene_product_x,ncbi_protein_x,locus_tag_y,end_y,gene_name_y,old_locus_tag_y,gene_product_y,ncbi_protein_y
locus_tag_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
SACI_RS03755,NC_007181.1,626583,627977,+,,,FAD-dependent oxidoreductase,,Sacid_00763,627266.0,,,hypothetical protein,
SACI_RS06870,NC_007181.1,1225387,1227549,-,treY,,malto-oligosyltrehalose synthase,,Sacid_01405,1225686.0,,,hypothetical protein,
SACI_RS03760,NC_007181.1,627974,629082,+,,,FAD-binding oxidoreductase,,Sacid_00765,628495.0,,,hypothetical protein,
SACI_RS05475,NC_007181.1,962554,964277,-,,,acyl--CoA ligase,,Sacid_01125,963957.0,acsA_2,,Acetyl-coenzyme A synthetase,
SACI_RS02045,NC_007181.1,358904,359723,+,,,aldolase,,Sacid_00414,359311.0,aroA',,2-amino-3%2C7-dideoxy-D-threo-hept-6-ulosonate...,


In [17]:
DF_extra_stop.set_index('locus_tag_x').loc[cmn_genes].head()

Unnamed: 0_level_0,refseq,start_x,end,strand,gene_name_x,old_locus_tag_x,gene_product_x,ncbi_protein_x,locus_tag_y,start_y,gene_name_y,old_locus_tag_y,gene_product_y,ncbi_protein_y
locus_tag_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
SACI_RS03755,NC_007181.1,626583,627977,+,,,FAD-dependent oxidoreductase,,Sacid_00764,627477.0,,,hypothetical protein,
SACI_RS06870,NC_007181.1,1225387,1227549,-,treY,,malto-oligosyltrehalose synthase,,Sacid_01407,1225945.0,treY_2,,Maltooligosyl trehalose synthase,
SACI_RS03760,NC_007181.1,627974,629082,+,,,FAD-binding oxidoreductase,,Sacid_00766,628567.0,hcnC,,Hydrogen cyanide synthase subunit HcnC,
SACI_RS05475,NC_007181.1,962554,964277,-,,,acyl--CoA ligase,,Sacid_01126,963981.0,,,hypothetical protein,
SACI_RS02045,NC_007181.1,358904,359723,+,,,aldolase,,Sacid_00415,359304.0,lsrF,,3-hydroxy-5-phosphonooxypentane-2%2C4-dione th...,


In [18]:
def gpy_cmp(pgstart, pgstop):
    hp = 'hypothetical protein'
    out = hp
    
    if pgstart == hp and pgstop != hp:
        out = pgstop
    
    elif pgstart != hp and pgstop == hp:
        out = pgstart
    
    elif pgstart != hp and pgstop != hp:        
        if pgstart == pgstop:
            out = pgstart
        
        else:
            out = pgstart + ', '+ pgstop
    
    return out

# gene_name, gene_product_y merge
for gene in cmn_genes:
    pgstart = DF_extra_start.set_index('locus_tag_x').loc[gene, 'gene_product_y']
    pgstop = DF_extra_stop.set_index('locus_tag_x').loc[gene, 'gene_product_y']
    
    start_name = gene_name_cmp(DF_extra_start.set_index('locus_tag_x').loc[gene, 'gene_name_x'],
                               DF_extra_start.set_index('locus_tag_x').loc[gene, 'gene_name_y'])
    
    stop_name = gene_name_cmp(DF_extra_stop.set_index('locus_tag_x').loc[gene, 'gene_name_x'],
                              DF_extra_stop.set_index('locus_tag_x').loc[gene, 'gene_name_y'])
    
    DF_annot_union.loc[gene, 'gene_product_y'] = gpy_cmp(pgstart, pgstop).replace('%2C', ',')
    DF_annot_union.loc[gene, 'gene_name'] = gene_name_cmp(start_name, stop_name)

In [19]:
# refseq, strand, ncbi_protein, old_locus_tag, gene_product_x
DF_annot_union.loc[cmn_genes, 'refseq'] = 'NC_007181.1'

DF_annot_union.loc[cmn_genes, 'old_locus_tag'] = DF_annot.loc[cmn_genes, 'old_locus_tag']

DF_annot_union.loc[cmn_genes, 'strand'] = DF_extra_start.set_index('locus_tag_x').loc[
    cmn_genes, 'strand']

DF_annot_union.loc[cmn_genes, 'ncbi_protein'] = DF_extra_start.set_index('locus_tag_x').loc[
    cmn_genes, 'ncbi_protein_x']

DF_annot_union.loc[cmn_genes, 'gene_product_x'] = DF_extra_start.set_index('locus_tag_x').loc[
    cmn_genes, 'gene_product_x']

# start, end
sstart = DF_extra_start.set_index('locus_tag_x').loc[cmn_genes]
DF_annot_union.loc[sstart.index, 'start'] = sstart.start
DF_annot_union.loc[sstart.index, 'end'] = sstart.end_x


# prokka_locus_tag
no_SACI_RS06870 = cmn_genes - set(['SACI_RS06870'])

prokka1 = DF_extra_start.set_index('locus_tag_x').loc[no_SACI_RS06870, 'locus_tag_y']
prokka2 = DF_extra_stop.set_index('locus_tag_x').loc[no_SACI_RS06870, 'locus_tag_y']

DF_annot_union.loc[no_SACI_RS06870, 'prokka_locus_tag'] = prokka1 + ', '+ prokka2

# Manually curate prokka_locus_tag of SACI_RS06870
DF_annot_union.loc['SACI_RS06870', 'prokka_locus_tag'] = 'Sacid_01405, Sacid_01406, Sacid_01407'

In [20]:
DF_annot_union.loc[cmn_genes]

Unnamed: 0_level_0,refseq,start,end,strand,old_locus_tag,gene_product_x,ncbi_protein,prokka_locus_tag,gene_product_y,gene_name
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SACI_RS03755,NC_007181.1,626583.0,627977.0,+,,FAD-dependent oxidoreductase,,"Sacid_00763, Sacid_00764",hypothetical protein,
SACI_RS06870,NC_007181.1,1225387.0,1227549.0,-,,malto-oligosyltrehalose synthase,,"Sacid_01405, Sacid_01406, Sacid_01407",Maltooligosyl trehalose synthase,"treY, treY, treY_2"
SACI_RS03760,NC_007181.1,627974.0,629082.0,+,,FAD-binding oxidoreductase,,"Sacid_00765, Sacid_00766",Hydrogen cyanide synthase subunit HcnC,hcnC
SACI_RS05475,NC_007181.1,962554.0,964277.0,-,,acyl--CoA ligase,,"Sacid_01125, Sacid_01126",Acetyl-coenzyme A synthetase,acsA_2
SACI_RS02045,NC_007181.1,358904.0,359723.0,+,,aldolase,,"Sacid_00414, Sacid_00415","2-amino-3,7-dideoxy-D-threo-hept-6-ulosonate s...","aroA', lsrF"
SACI_RS09915,NC_007181.1,1868239.0,1868964.0,-,,CRISPR-associated RAMP protein,,"Sacid_02045, Sacid_02046",hypothetical protein,
SACI_RS09730,NC_007181.1,1827072.0,1827638.0,-,,CRISPR-associated protein Cas4,,"Sacid_02005, Sacid_02006",hypothetical protein,cas4
SACI_RS10510,NC_007181.1,2007741.0,2009515.0,+,,S9 family peptidase,,"Sacid_02171, Sacid_02172",hypothetical protein,
SACI_RS11865,NC_007181.1,1611244.0,1612227.0,-,,cyclase family protein,,"Sacid_01812, Sacid_01813",hypothetical protein,
SACI_RS02475,NC_007181.1,423985.0,425845.0,+,,DUF87 domain-containing protein,,"Sacid_00507, Sacid_00508",hypothetical protein,


#### Curate those with only 'Start' or 'End' Matches

In [21]:
DF_extra_start.rename(columns=rename_dict, inplace=True)
DF_extra_start.set_index('locus_tag', inplace=True)

DF_extra_start.drop(index=cmn_genes, inplace=True)
DF_extra_start.drop(columns=['old_locus_tag_y', 'ncbi_protein_y'],
                    inplace=True)


# gene_name merge
for gene in DF_extra_start.index:
    gene_name_x = DF_extra_start.loc[gene, 'gene_name_x']
    gene_name_y = DF_extra_start.loc[gene, 'gene_name_y']
    
    DF_annot_union.loc[gene, 'gene_name'] = gene_name_cmp(gene_name_x, gene_name_y)


# refseq, strand, ncbi_protein, old_locus_tag, gene_product_x, gene_product_y
DF_annot_union.loc[DF_extra_start.index, 'refseq'] = 'NC_007181.1'
DF_annot_union.loc[DF_extra_start.index, 'old_locus_tag'] = DF_annot.loc[DF_extra_start.index, 'old_locus_tag']
DF_annot_union.loc[DF_extra_start.index, 'strand'] = DF_extra_start['strand']
DF_annot_union.loc[DF_extra_start.index, 'ncbi_protein'] = DF_extra_start['ncbi_protein']
DF_annot_union.loc[DF_extra_start.index, 'gene_product_x'] = DF_extra_start['gene_product_x']
DF_annot_union.loc[DF_extra_start.index, 'gene_product_y'] = DF_extra_start['gene_product_y']

# start, end
DF_annot_union.loc[DF_extra_start.index, 'start'] = DF_extra_start['start']
DF_annot_union.loc[DF_extra_start.index, 'end'] = DF_extra_start['end_x']

# prokka_locus_tag
DF_annot_union.loc[DF_extra_start.index, 'prokka_locus_tag'] = DF_extra_start['prokka_locus_tag']

In [22]:
DF_annot_union.loc[DF_extra_start.index]

Unnamed: 0_level_0,refseq,start,end,strand,old_locus_tag,gene_product_x,ncbi_protein,prokka_locus_tag,gene_product_y,gene_name
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SACI_RS01355,NC_007181.1,234430.0,235188.0,-,Saci_0278,VIT1/CCC1 transporter family protein,WP_011277196.1,Sacid_00274,hypothetical protein,
SACI_RS03880,NC_007181.1,649445.0,650179.0,-,Saci_0811,RNA-guided pseudouridylation complex pseudouri...,WP_080504008.1,Sacid_00790,tRNA pseudouridine synthase B,truB_1
SACI_RS10635,NC_007181.1,2037777.0,2038472.0,-,Saci_2200,uroporphyrinogen-III C-methyltransferase,WP_011278988.1,Sacid_02200,Siroheme synthase,cysG_2
SACI_RS03785,NC_007181.1,633321.0,634028.0,-,Saci_0791,thioredoxin family protein,WP_011277663.1,Sacid_00771,hypothetical protein,
SACI_RS06250,NC_007181.1,1113548.0,1113727.0,-,Saci_1307,chromatin protein Cren7,WP_011278147.1,Sacid_01283,hypothetical protein,
...,...,...,...,...,...,...,...,...,...,...
SACI_RS11650,NC_007181.1,1096722.0,1098830.0,-,Saci_1292,fibronectin type III domain-containing protein,WP_015385592.1,Sacid_01268,N-acetylneuraminate epimerase,nanM
SACI_RS03160,NC_007181.1,525232.0,526518.0,-,Saci_0663,histidine--tRNA ligase,WP_011277549.1,Sacid_00645,Histidine--tRNA ligase,hisS
SACI_RS00690,NC_007181.1,114813.0,115748.0,-,Saci_0145,acyl-CoA thioesterase,WP_011277067.1,Sacid_00140,hypothetical protein,
SACI_RS08665,NC_007181.1,1576764.0,1577747.0,-,Saci_1809,energy-coupling factor transporter transmembra...,WP_011278617.1,Sacid_01778,Energy-coupling factor transporter transmembra...,ecfT


In [23]:
DF_extra_stop.rename(columns=rename_dict, inplace=True)
DF_extra_stop.set_index('locus_tag', inplace=True)

DF_extra_stop.drop(index=cmn_genes, inplace=True)
DF_extra_stop.drop(columns=['old_locus_tag_y', 'ncbi_protein_y'],
                   inplace=True)


# gene_name merge
for gene in DF_extra_stop.index:
    gene_name_x = DF_extra_stop.loc[gene, 'gene_name_x']
    gene_name_y = DF_extra_stop.loc[gene, 'gene_name_y']
    
    DF_annot_union.loc[gene, 'gene_name'] = gene_name_cmp(gene_name_x, gene_name_y)


# refseq, strand, ncbi_protein, old_locus_tag, gene_product_x, gene_product_y
DF_annot_union.loc[DF_extra_stop.index, 'refseq'] = 'NC_007181.1'
DF_annot_union.loc[DF_extra_stop.index, 'old_locus_tag'] = DF_annot.loc[DF_extra_stop.index, 'old_locus_tag']
DF_annot_union.loc[DF_extra_stop.index, 'strand'] = DF_extra_stop['strand']
DF_annot_union.loc[DF_extra_stop.index, 'ncbi_protein'] = DF_extra_stop['ncbi_protein']
DF_annot_union.loc[DF_extra_stop.index, 'gene_product_x'] = DF_extra_stop['gene_product_x']
DF_annot_union.loc[DF_extra_stop.index, 'gene_product_y'] = DF_extra_stop['gene_product_y']

# start, end
DF_annot_union.loc[DF_extra_stop.index, 'start'] = DF_extra_stop['start_x']
DF_annot_union.loc[DF_extra_stop.index, 'end'] = DF_extra_stop['end']

# prokka_locus_tag
DF_annot_union.loc[DF_extra_stop.index, 'prokka_locus_tag'] = DF_extra_stop['prokka_locus_tag']

In [24]:
DF_annot_union.loc[DF_extra_stop.index]

Unnamed: 0_level_0,refseq,start,end,strand,old_locus_tag,gene_product_x,ncbi_protein,prokka_locus_tag,gene_product_y,gene_name
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SACI_RS09355,NC_007181.1,1750538.0,1751416.0,+,Saci_1939,fumarylacetoacetate hydrolase family protein,WP_011278744.1,Sacid_01924,hypothetical protein,
SACI_RS00735,NC_007181.1,124051.0,125265.0,+,Saci_0154,hypothetical protein,WP_015385373.1,Sacid_00149,hypothetical protein,
SACI_RS11120,NC_007181.1,2142036.0,2143856.0,+,Saci_2298,AMP-binding protein,WP_011279086.1,Sacid_02300,Acetyl-coenzyme A synthetase,acsA_6
SACI_RS05580,NC_007181.1,987096.0,987896.0,+,Saci_1171,MarR family transcriptional regulator,WP_015385569.1,Sacid_01148,hypothetical protein,
SACI_RS10430,NC_007181.1,1990097.0,1990336.0,+,,hypothetical protein,WP_015385781.1,Sacid_02154,hypothetical protein,
...,...,...,...,...,...,...,...,...,...,...
SACI_RS08780,NC_007181.1,1596826.0,1597425.0,+,Saci_1831,DUF2250 domain-containing protein,WP_015385681.1,Sacid_01800,hypothetical protein,
SACI_RS05795,NC_007181.1,1036276.0,1036800.0,+,Saci_1216,DUF2258 domain-containing protein,WP_015385577.1,Sacid_01193,hypothetical protein,
SACI_RS05305,NC_007181.1,915975.0,916871.0,+,Saci_1113,acyl-CoA/acyl-ACP dehydrogenase,WP_015385555.1,Sacid_01089,hypothetical protein,
SACI_RS00265,NC_007181.1,44258.0,45793.0,+,Saci_0059,phosphoenolpyruvate carboxylase,WP_015385346.1,Sacid_00056,Phosphoenolpyruvate carboxylase,ppcA


#### Curate remainder (non-matching) loci

Remaining 56 non-matching loci do not have intersecting (or even approximate) matches with prokka tags, will be filled out using only ncbi data from DF_annot

In [25]:
remainder = DF_annot_union['refseq'].isna()

# Match through DF_annot (ncbi seq)
DF_annot_union.loc[remainder, 'refseq'] = DF_annot.loc[remainder, 'refseq']
DF_annot_union.loc[remainder, 'start'] = DF_annot.loc[remainder, 'start']
DF_annot_union.loc[remainder, 'end'] = DF_annot.loc[remainder, 'end']
DF_annot_union.loc[remainder, 'strand'] = DF_annot.loc[remainder, 'strand']
DF_annot_union.loc[remainder, 'gene_name'] = DF_annot.loc[remainder, 'gene_name']
DF_annot_union.loc[remainder, 'old_locus_tag'] = DF_annot.loc[remainder, 'old_locus_tag']
DF_annot_union.loc[remainder, 'gene_product_x'] = DF_annot.loc[remainder, 'gene_product']
DF_annot_union.loc[remainder, 'ncbi_protein'] = DF_annot.loc[remainder, 'ncbi_protein']

# No match via prokka, these are left as None
DF_annot_union.loc[remainder, 'prokka_locus_tag'] = None
DF_annot_union.loc[remainder, 'gene_product_y'] = None

In [26]:
DF_annot_union = DF_annot_union[['refseq', 'start', 'end', 'strand',
                                 'gene_name', 'old_locus_tag', 'prokka_locus_tag',
                                 'gene_product_x', 'gene_product_y', 'ncbi_protein']]


for gene in DF_annot_union.index:
    if DF_annot_union['gene_product_x'].isna().loc[gene] == False:
        DF_annot_union.loc[gene, 'gene_product_x'] = DF_annot_union.loc[gene, 'gene_product_x'].replace('%2C', ',')
    
    if DF_annot_union['gene_product_y'].isna().loc[gene] == False:
        DF_annot_union.loc[gene, 'gene_product_y'] = DF_annot_union.loc[gene, 'gene_product_y'].replace('%2C', ',')


DF_annot_union.rename(columns={'gene_product_x': 'gene_product', 'gene_product_y': 'gene_product_prokka'},
                      inplace=True)

DF_annot_union

Unnamed: 0_level_0,refseq,start,end,strand,gene_name,old_locus_tag,prokka_locus_tag,gene_product,gene_product_prokka,ncbi_protein
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SACI_RS00005,NC_007181.1,101.0,1261.0,+,,Saci_0001,,AAA family ATPase,,WP_011276932.1
SACI_RS00010,NC_007181.1,1294.0,1629.0,+,,Saci_0002,Sacid_00002,hypothetical protein,hypothetical protein,WP_011276933.1
SACI_RS00015,NC_007181.1,1665.0,2504.0,+,,Saci_0003,Sacid_00003,hypothetical protein,hypothetical protein,WP_011276934.1
SACI_RS00020,NC_007181.1,2553.0,3056.0,-,,Saci_0004,Sacid_00004,hypothetical protein,hypothetical protein,WP_015385334.1
SACI_RS00025,NC_007181.1,3049.0,3768.0,-,,Saci_0005,Sacid_00005,hypothetical protein,hypothetical protein,WP_011276936.1
...,...,...,...,...,...,...,...,...,...,...
SACI_RS11465,NC_007181.1,2218618.0,2219355.0,-,COQ5_5,Saci_2371,Sacid_02369,class I SAM-dependent methyltransferase,"2-methoxy-6-polyprenyl-1,4-benzoquinol methyla...",WP_011279151.1
SACI_RS11470,NC_007181.1,2219468.0,2220394.0,+,,Saci_2372,Sacid_02370,ornithine cyclodeaminase family protein,Delta(1)-pyrroline-2-carboxylate reductase,WP_011279152.1
SACI_RS11475,NC_007181.1,2220381.0,2220989.0,-,cobO,Saci_2373,Sacid_02371,"cob(I)yrinic acid a,c-diamide adenosyltransferase",Cobalamin adenosyltransferase,WP_011279153.1
SACI_RS11480,NC_007181.1,2221039.0,2224263.0,-,,Saci_2374,Sacid_02372,S8 family serine peptidase,hypothetical protein,WP_011279154.1


In [27]:
DF_annot_union.to_csv('DF_annot_curated.tsv', sep='\t')