In [1]:
# imports for later usage
import os#
import plotly#
import plotly.graph_objs as go#
import cobra
import glob
import re  

In [2]:
# Read in model_path names
model_paths = glob.glob('/home/bneubert/Lactobacillus/gap_models/*.xml')


In [3]:
# Remove bad model
print(len(model_paths))
model_paths.remove("/home/bneubert/Lactobacillus/gap_models/1579.60.xml")
# make sure the bad model was removed
print(len(model_paths))

1507
1506


In [5]:
model_x = cobra.io.read_sbml_model(model_paths[1])

In [10]:
model_x.reactions[1].id

'rxn00390_c'

In [11]:
print(seq.match(model_paths[1].replace('/home/bneubert/Lactobacillus/gap_models/','')).group())# genome id to use

1002365.5


In [10]:
seq = re.compile('(\d+)\.\d+')

In [12]:
counter = 0
model_reactions = [] # going to be full set of reactions
for name in model_paths:
    # open up the next model
    model_x = cobra.io.read_sbml_model(name)
    
    # find genome_id for this model
    genome_id = seq.match(name.replace('/home/bneubert/Lactobacillus/gap_models/','')).group()
    
    # cycle through the model's reactions
    for reaction in model_x.reactions:
        temp_dict = {}
        
        # add genome id to dict
        temp_dict['genome_id'] = genome_id
        
        # add reaction id to dict
        temp_dict['reaction_id'] = reaction.id
    
        # add reaction dict to the model reactions list
        model_reactions.append(temp_dict)
        
        
    
    
#     # limiter, remove when done testing
#     if(counter is 10):
#         break
#     counter += 1
    

In [13]:
#model_reactions

In [14]:
full_reactions = [] 
core_reactions = []
full_reaction_sizes = []
core_reactions_sizes = []
model_sizes = []
processed_models = []
temp_core_reactions = []
current_reactions = []
modelDict = {} 
second = True # boolean used to delineate first and second genome's addition, becomes false on third genome
reactionDict = {} # will contain all unique cross-genus families and their associated strains
#accessoryGenome = {}

In [15]:
# NOTE: analysis assumes the genomic features are grouped together by their genomic id since they were read in by feature files


for feature in model_reactions:
    
    
    # if the genome is new add genome to processed genomes
    if(feature['genome_id'] not in processed_models):
        # if we just switched to a new genome then
        # add the new pangenome and core genome sizes to their lists
        # also update core genome
        
        if(len(processed_models) > 0):
            full_reaction_sizes.append(len(full_reactions))
            model_sizes.append(len(current_reactions))
            if not second:
                # new coregenome is simply the previous temp coregenome
                core_reactions = list(temp_core_reactions)
            else:
                # no longer second
                second = False
            core_reactions_sizes.append(len(core_reactions))
            temp_core_reactions = []
            

        # add new genome to processed list
        processed_models.append(feature['genome_id'])
        
        #add new genome as a key in the genome dict
        modelDict[feature['genome_id']] = {}
        
        # reset current features
        current_reactions = []
      
    # if the gene family has not been accounted for in the current genome's features, add it
    if(feature['reaction_id'] not in current_reactions):
        current_reactions.append(feature['reaction_id'])
    
    # if gene family has not been accounted for then add to pangenome
    if(feature['reaction_id'] not in full_reactions):
        full_reactions.append(feature['reaction_id'])
    
    # if first runthrough then the core genes are all genes in the first
    # genome, these are used as a baseline for future comparisons   
    if (len(processed_models) is 1):
        # also make sure this family has not already been accounted for
        if(feature['reaction_id'] not in core_reactions):
            core_reactions.append(feature['reaction_id'])
    
    # if we already have a core genome, the present feature is in the core genome of the past strains, and not already added
    # then include this feature in the next core genome
    elif((feature['reaction_id'] in core_reactions) and (feature['reaction_id'] not in temp_core_reactions)):
        temp_core_reactions.append(feature['reaction_id'])

        
    # update the genome dictionary
    if (feature['reaction_id'] not in modelDict[feature['genome_id']]):
        modelDict[feature['genome_id']][feature['reaction_id']] = 1
    else:
        modelDict[feature['genome_id']][feature['reaction_id']] += 1
        
    # if the cross genus family has not been accounted for then add it to the family dictionary
    if feature['reaction_id'] not in reactionDict:
        reactionDict[feature['reaction_id']] = [feature['genome_id']]
    # if the cross genus family has been accounted for, but the strain it came from has not been accounted for, update the
    # key's list
    elif feature['genome_id'] not in reactionDict[feature['reaction_id']]:
        reactionDict[feature['reaction_id']].append(feature['genome_id'])
        
        
# make sure to add the last genome processed to have final pangenome and coregenome sizes
full_reaction_sizes.append(full_reactions.__len__())
core_reactions = list(temp_core_reactions)
core_reactions_sizes.append(core_reactions.__len__())
model_sizes.append(len(current_reactions))

In [77]:
# total number of reactions
len(full_reactions)

1469

In [17]:
# number of reactions present in all models
len(core_reactions)

38

In [20]:
# average number of reactions in a model
sum(model_sizes)/len(model_sizes)

884

In [1]:
plotly.offline.init_notebook_mode(connected=True)

trace1 = go.Scatter(
    x = list(range(1,1507)),
    y = core_reactions_sizes,
    mode = 'lines',
    name = 'core genome'
)
trace2 = go.Scatter(
    x = list(range(1,1507)),
    y = full_reaction_sizes,
    mode = 'lines',
    name = 'pan genome'
)
trace3 = go.Scatter(
    x = list(range(1,1507)),
    y = model_sizes,
    mode = 'lines',
    name = 'added genome'
)
data = [trace1, trace2,trace3]

layout = dict(title = 'Lactobacillus Reaction Analysis',
              xaxis = dict(title = 'Number of Models Added'),
              yaxis = dict(title = 'Number of Unique Reactions'),
              )

fig = dict(data=data, layout=layout)

plotly.offline.iplot(fig, filename='/home/bneubert/Lactobacillus/Results/reaction-line.html')

NameError: name 'plotly' is not defined