In [1]:
import cobra
import glob
import pickle
import os
import re

In [2]:
# Read in model_path names
model_paths = glob.glob('/home/bneubert/Lactobacillus/models/*.xml')

In [3]:
#model_paths
model_x = cobra.io.read_sbml_model(model_paths[1])

In [4]:
# takes a list of patric_id's(list of lists, first list is by genome, second by feature) and uses the feature file 
# to pull out the pgfam_id
# outputs a dictionary with patric_ids linked to their pgfam_ids
def pgfam_finder(feature_master):
    
    output = {}
    # create blank dictionary of keys for every genomic feature in the input list
    for genome_id in feature_master:
        for id in genome_id:
            output[id] = ''

    # create regular expression to identify the genome id from the patric id of the genomic feature
    test = re.compile('(\d+)\.\d+\.')
    
    # variables used for % complete calculation
    total = len(feature_master) 
    count = 0.0
    prev = -1
    
    # find associated protein families for every genomic feature within the models
    # start cycle through first genome
    for genome_id in feature_master:
        # open feature file via associated genome through regular expression
        temp = pickle.load(open('/home/bneubert/Lactobacillus/features/'+ test.match(genome_id[1]).group()+'feats'))
        # cycle through features in this genome
        for feature_id in genome_id:
            # check every feature in the feature file for this feature to find associated protein family
            for i in range(0,len(temp)):
                
                if(('fig|'+str(feature_id)) == str(temp[i]['patric_id'])):  
                        # if this has a protein family associated with it, add this to the dict
                        if('pgfam_id' in temp[i].keys()):
                            # only ever single family values so can use dict mapping string to string
                            output[str(feature_id)] = str(temp[i]['pgfam_id'])
                        # if there is not an associated protein family, then add a blank string
                        else:
                            output[str(feature_id)] = ''
        count += 1
        # if we have changed by a percentage point then update the status
        if (int(count/total*100) > prev):
            prev = int(count/total*100)
            print("Status: "+str(prev)+"% complete finding protein families")
    
    return output

In [5]:
total_genes = []
counter = 0.0
prev = -1
total = len(model_paths)
# create a list of all the genes within the models
# list of lists, initial list by genome and inner list is full of features within that genome
for model_path in model_paths:
    model_x = cobra.io.read_sbml_model(model_path)
    temp = []
    for gene in model_x.genes:
        temp.append(gene.id)
    total_genes.append(temp)
    counter += 1
    # if the status has changed by a percentage point then update status
    if (int(counter/total*100) > prev):
        prev = int(counter/total*100)
        print("Status: "+str(prev)+"% complete finding genes present in models")
#     use this  to test a smaller portion the models
#     if( counter is 5):
#         break

# run function for genes within the models
total_families = pgfam_finder(total_genes)  

Status: 0% complete finding genes present in models
Status: 1% complete finding genes present in models
Status: 2% complete finding genes present in models
Status: 3% complete finding genes present in models
Status: 4% complete finding genes present in models
Status: 5% complete finding genes present in models
Status: 6% complete finding genes present in models
Status: 7% complete finding genes present in models
Status: 8% complete finding genes present in models
Status: 9% complete finding genes present in models
Status: 10% complete finding genes present in models
Status: 11% complete finding genes present in models
Status: 12% complete finding genes present in models
Status: 13% complete finding genes present in models
Status: 14% complete finding genes present in models
Status: 15% complete finding genes present in models
Status: 16% complete finding genes present in models
Status: 17% complete finding genes present in models
Status: 18% complete finding genes present in models
Sta

Status: 63% complete finding protein families
Status: 64% complete finding protein families
Status: 65% complete finding protein families
Status: 66% complete finding protein families
Status: 67% complete finding protein families
Status: 68% complete finding protein families
Status: 69% complete finding protein families
Status: 70% complete finding protein families
Status: 71% complete finding protein families
Status: 72% complete finding protein families
Status: 73% complete finding protein families
Status: 74% complete finding protein families
Status: 75% complete finding protein families
Status: 76% complete finding protein families
Status: 77% complete finding protein families
Status: 78% complete finding protein families
Status: 79% complete finding protein families
Status: 80% complete finding protein families
Status: 81% complete finding protein families
Status: 82% complete finding protein families
Status: 83% complete finding protein families
Status: 84% complete finding prote

In [6]:
total_families

{'1599.97.peg.1364': 'PGF_01444546',
 '1599.97.peg.1362': 'PGF_00067554',
 '1599.97.peg.1363': 'PGF_06522349',
 '1423784.5.peg.1815': 'PGF_00056900',
 '1423784.5.peg.1816': 'PGF_06985891',
 '1599.97.peg.1368': 'PGF_06735060',
 '1423784.5.peg.1811': 'PGF_00011979',
 '1382301.3.peg.2182': 'PGF_02776739',
 '1423784.4.peg.2313': 'PGF_00020735',
 '559301.5.peg.350': 'PGF_00960048',
 '559301.5.peg.351': 'PGF_04655384',
 '337330.9.peg.1564': 'PGF_05366548',
 '1606.59.peg.1061': 'PGF_00402003',
 '1606.59.peg.1067': 'PGF_00020735',
 '1423730.4.peg.331': 'PGF_06162930',
 '1423730.4.peg.330': 'PGF_03295331',
 '1606.59.peg.1068': 'PGF_00064393',
 '1423730.4.peg.332': 'PGF_00033359',
 '1598.343.peg.1428': 'PGF_00064610',
 '1926284.3.peg.1027': 'PGF_00402268',
 '1926284.3.peg.1022': 'PGF_05406020',
 '1590.468.peg.1386': 'PGF_01599021',
 '1590.468.peg.1384': 'PGF_00048627',
 '1624.88.peg.1594': 'PGF_04947330',
 '872326.3.peg.881': 'PGF_02797402',
 '1598.343.peg.1427': 'PGF_00064610',
 '1624.88.peg.15

In [7]:
# download the output total families
file_name = "/home/bneubert/model.features"
pickle.dump(total_families, open(file_name, "wb"))

In [8]:
pickle.load(open(file_name, "rb"))

{'1599.97.peg.1364': 'PGF_01444546',
 '1599.97.peg.1362': 'PGF_00067554',
 '1599.97.peg.1363': 'PGF_06522349',
 '1423784.5.peg.1815': 'PGF_00056900',
 '1423784.5.peg.1816': 'PGF_06985891',
 '1599.97.peg.1368': 'PGF_06735060',
 '1423784.5.peg.1811': 'PGF_00011979',
 '1590.601.peg.800': 'PGF_00772015',
 '1423784.4.peg.2313': 'PGF_00020735',
 '559301.5.peg.350': 'PGF_00960048',
 '559301.5.peg.351': 'PGF_04655384',
 '1624.68.peg.971': 'PGF_00008874',
 '1606.59.peg.1061': 'PGF_00402003',
 '1606.59.peg.1067': 'PGF_00020735',
 '1423730.4.peg.331': 'PGF_06162930',
 '1423730.4.peg.330': 'PGF_03295331',
 '1606.59.peg.1068': 'PGF_00064393',
 '1423730.4.peg.332': 'PGF_00033359',
 '1382301.3.peg.2183': 'PGF_02011760',
 '1598.343.peg.1428': 'PGF_00064610',
 '1926284.3.peg.1027': 'PGF_00402268',
 '1926284.3.peg.1022': 'PGF_05406020',
 '1590.468.peg.1386': 'PGF_01599021',
 '1590.468.peg.1384': 'PGF_00048627',
 '1624.88.peg.1594': 'PGF_04947330',
 '872326.3.peg.881': 'PGF_02797402',
 '1598.343.peg.1427