In [128]:
from Bio.Seq import Seq
from Bio import SeqIO 
from Bio.SeqRecord import SeqRecord
from Bio import SeqFeature
from Bio.Alphabet import IUPAC
import sys
import Bio
import pandas as pd 
from Bio.SeqFeature import FeatureLocation
import numpy as np
import re

In [129]:
#Code outputs specific features such as index, record id, length, and features
for index, record in enumerate(SeqIO.parse("GenbankFile.gb", "genbank")):
    print("index %i, ID = %s, length %i, with %i features"
          % (index, record.id, len(record.seq), len(record.features)))

index 0, ID = AH000830.2, length 865, with 7 features


In [130]:
#Code displays the record of the genbank file
print (record)

ID: AH000830.2
Name: AH000830
Description: Mycobacterium smegmatis ATCC 607 16S rRNA gene, 5' untranslated region
Number of features: 7
/molecule_type=DNA
/topology=linear
/data_file_division=BCT
/date=25-AUG-2016
/accessions=['AH000830', 'U07954', 'U07955']
/sequence_version=2
/keywords=['']
/source=Mycobacterium smegmatis
/organism=Mycobacterium smegmatis
/taxonomy=['Bacteria', 'Actinobacteria', 'Corynebacteriales', 'Mycobacteriaceae', 'Mycobacterium']
/references=[Reference(title="Mycobacterium smegmatis ATCC strain 607 5' region to 16S rRNA gene", ...), Reference(title='Mycobacterium smegmatis ATCC strain 607 intergenic region between 16S and 23S rRNA genes', ...), Reference(title='Direct Submission', ...)]
/comment=On or before Aug 25, 2016 this sequence version replaced gi:469549,
gi:469550, gi:493020.
Seq('CAAGCAAGCGTGTTGTTTGAGAACTCAATAGTGTGTTTGGTGGTTTTTGTTTGT...CCC', IUPACAmbiguousDNA())


In [131]:
# Reads the genbank file  and extracts data
genome = SeqIO.read('GenbankFile.gb','genbank')

In [132]:
#list genbank file features
genome.features

[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(865), strand=1), type='source'),
 SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(204), strand=1), type='source'),
 SeqFeature(FeatureLocation(ExactPosition(304), ExactPosition(865), strand=1), type='source'),
 SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(865), strand=1), type='misc_feature'),
 SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(204), strand=1), type='misc_feature'),
 SeqFeature(FeatureLocation(ExactPosition(204), ExactPosition(304), strand=1), type='gap'),
 SeqFeature(FeatureLocation(ExactPosition(304), ExactPosition(508), strand=1), type='misc_feature')]

In [133]:
#Strings genome features
feature = str(genome.features)

In [134]:
#Displays features
feature

"[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(865), strand=1), type='source'), SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(204), strand=1), type='source'), SeqFeature(FeatureLocation(ExactPosition(304), ExactPosition(865), strand=1), type='source'), SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(865), strand=1), type='misc_feature'), SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(204), strand=1), type='misc_feature'), SeqFeature(FeatureLocation(ExactPosition(204), ExactPosition(304), strand=1), type='gap'), SeqFeature(FeatureLocation(ExactPosition(304), ExactPosition(508), strand=1), type='misc_feature')]"

In [135]:
#seperates genome features & makes it readable to userS
gene_features = re.findall(r'SeqFeature\SFeatureLocation\SExactPosition\S\d{1,60}\S\S\sExactPosition\S\d{1,60}\S\S\sstrand=\d{1,60}\S\S\stype\S', features)

In [136]:
gene_features 

['SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(4349904), strand=1), type=',
 'SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(1524), strand=1), type=',
 'SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(1524), strand=1), type=',
 'SeqFeature(FeatureLocation(ExactPosition(2051), ExactPosition(3260), strand=1), type=',
 'SeqFeature(FeatureLocation(ExactPosition(2051), ExactPosition(3260), strand=1), type=',
 'SeqFeature(FeatureLocation(ExactPosition(3279), ExactPosition(4437), strand=1), type=',
 'SeqFeature(FeatureLocation(ExactPosition(3279), ExactPosition(4437), strand=1), type=',
 'SeqFeature(FeatureLocation(ExactPosition(4433), ExactPosition(4997), strand=1), type=',
 'SeqFeature(FeatureLocation(ExactPosition(4433), ExactPosition(4997), strand=1), type=',
 'SeqFeature(FeatureLocation(ExactPosition(5122), ExactPosition(7267), strand=1), type=',
 'SeqFeature(FeatureLocation(ExactPosition(5122), ExactPosition(7267), strand=1), type=',
 'SeqFeature(Fea

In [137]:
position_of_gene = str(position_of_gene)

In [138]:
re.findall(r'\d{1,60}', position_of_gene)
list_of_gene = re.findall(r'\d{1,60}', position_of_gene)

In [139]:
#Position of genes extracted
list_of_gene

['0',
 '1524',
 '2051',
 '3260',
 '3279',
 '4437',
 '4433',
 '4997',
 '5122',
 '7267',
 '7301',
 '9818',
 '9913',
 '10828',
 '10886',
 '10960',
 '11111',
 '11184',
 '12467',
 '13016',
 '14087',
 '14876',
 '14912',
 '15611',
 '25624',
 '25707',
 '27575',
 '28346',
 '28342',
 '29176',
 '29224',
 '29587',
 '29701',
 '31135',
 '31172',
 '31490',
 '31497',
 '31803',
 '32040',
 '33138',
 '33207',
 '33537',
 '33565',
 '33778',
 '34278',
 '36594',
 '36590',
 '36854',
 '36850',
 '37246',
 '37242',
 '38931',
 '41287',
 '41896',
 '43545',
 '46455',
 '52814',
 '53228',
 '53220',
 '55689',
 '55685',
 '57368',
 '57399',
 '57963',
 '58181',
 '58472',
 '58575',
 '59070',
 '59111',
 '59366',
 '59398',
 '59857',
 '59885',
 '60407',
 '60385',
 '63010',
 '63189',
 '63882',
 '63898',
 '64957',
 '65541',
 '66684',
 '66912',
 '68352',
 '68609',
 '71549',
 '71615',
 '71855',
 '71847',
 '72249',
 '75327',
 '76239',
 '79512',
 '80229',
 '80659',
 '81709',
 '81711',
 '82554',
 '82556',
 '82703',
 '82782',
 '8401

In [140]:
#start codon extracted
start = list_of_gene[::2]
start

['0',
 '2051',
 '3279',
 '4433',
 '5122',
 '7301',
 '9913',
 '10886',
 '11111',
 '12467',
 '14087',
 '14912',
 '25624',
 '27575',
 '28342',
 '29224',
 '29701',
 '31172',
 '31497',
 '32040',
 '33207',
 '33565',
 '34278',
 '36590',
 '36850',
 '37242',
 '41287',
 '43545',
 '52814',
 '53220',
 '55685',
 '57399',
 '58181',
 '58575',
 '59111',
 '59398',
 '59885',
 '60385',
 '63189',
 '63898',
 '65541',
 '66912',
 '68609',
 '71615',
 '71847',
 '75327',
 '79512',
 '80659',
 '81711',
 '82556',
 '82782',
 '84030',
 '86562',
 '88238',
 '89056',
 '89609',
 '89958',
 '90434',
 '92362',
 '93323',
 '93985',
 '95448',
 '96961',
 '97792',
 '98514',
 '99718',
 '100617',
 '105359',
 '106769',
 '107635',
 '108191',
 '109818',
 '110036',
 '117749',
 '122352',
 '124409',
 '131419',
 '133057',
 '133987',
 '136326',
 '137356',
 '137978',
 '138701',
 '141239',
 '144088',
 '148528',
 '148893',
 '149570',
 '151338',
 '152514',
 '154422',
 '155857',
 '158037',
 '161059',
 '161961',
 '162344',
 '162453',
 '162695'

In [141]:
#stop codon extracted
stop = list_of_gene[1::2]
stop

['1524',
 '3260',
 '4437',
 '4997',
 '7267',
 '9818',
 '10828',
 '10960',
 '11184',
 '13016',
 '14876',
 '15611',
 '25707',
 '28346',
 '29176',
 '29587',
 '31135',
 '31490',
 '31803',
 '33138',
 '33537',
 '33778',
 '36594',
 '36854',
 '37246',
 '38931',
 '41896',
 '46455',
 '53228',
 '55689',
 '57368',
 '57963',
 '58472',
 '59070',
 '59366',
 '59857',
 '60407',
 '63010',
 '63882',
 '64957',
 '66684',
 '68352',
 '71549',
 '71855',
 '72249',
 '76239',
 '80229',
 '81709',
 '82554',
 '82703',
 '84018',
 '85203',
 '87168',
 '89060',
 '89515',
 '89954',
 '90438',
 '92363',
 '93313',
 '93986',
 '95452',
 '96927',
 '97636',
 '98386',
 '99285',
 '100486',
 '102903',
 '106751',
 '107639',
 '108187',
 '109814',
 '110055',
 '117575',
 '119735',
 '123867',
 '125606',
 '132910',
 '133807',
 '136045',
 '137283',
 '137947',
 '138551',
 '139619',
 '142184',
 '145666',
 '148897',
 '149262',
 '151187',
 '152406',
 '154320',
 '155790',
 '156637',
 '158493',
 '161665',
 '162345',
 '162452',
 '162693',
 '16

In [142]:
#Converts to dataframe
df = pd.DataFrame({'Starts': start})
df['Stops'] = stop
df

Unnamed: 0,Starts,Stops
0,0,1524
1,2051,3260
2,3279,4437
3,4433,4997
4,5122,7267
5,7301,9818
6,9913,10828
7,10886,10960
8,11111,11184
9,12467,13016


In [143]:
#IGR extracted
names = []
for x in range(1, len(df)+1):
    names.append('IGR_' + str(x))

In [144]:
#IGR names shown
names

['IGR_1',
 'IGR_2',
 'IGR_3',
 'IGR_4',
 'IGR_5',
 'IGR_6',
 'IGR_7',
 'IGR_8',
 'IGR_9',
 'IGR_10',
 'IGR_11',
 'IGR_12',
 'IGR_13',
 'IGR_14',
 'IGR_15',
 'IGR_16',
 'IGR_17',
 'IGR_18',
 'IGR_19',
 'IGR_20',
 'IGR_21',
 'IGR_22',
 'IGR_23',
 'IGR_24',
 'IGR_25',
 'IGR_26',
 'IGR_27',
 'IGR_28',
 'IGR_29',
 'IGR_30',
 'IGR_31',
 'IGR_32',
 'IGR_33',
 'IGR_34',
 'IGR_35',
 'IGR_36',
 'IGR_37',
 'IGR_38',
 'IGR_39',
 'IGR_40',
 'IGR_41',
 'IGR_42',
 'IGR_43',
 'IGR_44',
 'IGR_45',
 'IGR_46',
 'IGR_47',
 'IGR_48',
 'IGR_49',
 'IGR_50',
 'IGR_51',
 'IGR_52',
 'IGR_53',
 'IGR_54',
 'IGR_55',
 'IGR_56',
 'IGR_57',
 'IGR_58',
 'IGR_59',
 'IGR_60',
 'IGR_61',
 'IGR_62',
 'IGR_63',
 'IGR_64',
 'IGR_65',
 'IGR_66',
 'IGR_67',
 'IGR_68',
 'IGR_69',
 'IGR_70',
 'IGR_71',
 'IGR_72',
 'IGR_73',
 'IGR_74',
 'IGR_75',
 'IGR_76',
 'IGR_77',
 'IGR_78',
 'IGR_79',
 'IGR_80',
 'IGR_81',
 'IGR_82',
 'IGR_83',
 'IGR_84',
 'IGR_85',
 'IGR_86',
 'IGR_87',
 'IGR_88',
 'IGR_89',
 'IGR_90',
 'IGR_91',
 'IGR_92

In [145]:
#add column
df['Names'] = names

In [146]:
#Displays the names column
df

Unnamed: 0,Starts,Stops,Names
0,0,1524,IGR_1
1,2051,3260,IGR_2
2,3279,4437,IGR_3
3,4433,4997,IGR_4
4,5122,7267,IGR_5
5,7301,9818,IGR_6
6,9913,10828,IGR_7
7,10886,10960,IGR_8
8,11111,11184,IGR_9
9,12467,13016,IGR_10


In [147]:
#converts dataframe to csv
df.to_csv('IGRproject.csv')