In [1]:
# imports for later usage
import pickle
import os
import plotly
import plotly.graph_objs as go
import random

In [3]:
feature_files = os.listdir('/home/bneubert/Lactobacillus/features')

In [5]:
feature_list = []
counter = 0
for file in feature_files:
    # load in the feature file as a pickle object, this will be a list
    temp = pickle.load(open('/home/bneubert/Lactobacillus/features/'+file))
    # add every feature to the total feature list for later use
    for i in range(0, len(temp)):
        # exclude features that do not have a pgfam_id
        if('pgfam_id' in temp[i].keys()):
            feature_list.append(temp[i])
    # limiter, will be removed to process full data set so as to not take too much time
    counter += 1
    if(counter is 100):
        break


# Analysis

In [6]:
pangenome = [] 
coregenome = []
pangenome_sizes = []
coregenome_sizes = []
genome_sizes = []
processed_genomes = []
temp_coregenome = []
genomeDict = {} 
first = True # boolean used to delineate first genome's addition
familyDict = {} # will contain all unique cross-genus families and their associated strains
accessoryGenome = {}

In [7]:
count = 0
for feature in feature_list:
    count += 1
    # if the genome is new add genome to processed genomes
    if(feature['genome_id'] not in processed_genomes):
        # if we just switched to a new genome then
        # add the new pangenome and core genome sizes to their lists
        # also update core genome
        
        if(processed_genomes.__len__() > 0):
            pangenome_sizes.append(pangenome.__len__())
            genome_sizes.append(count-1)
            count = 1
            if not first:
                # new coregenome is simply the previous temp coregenome
                coregenome = list(temp_coregenome)
            coregenome_sizes.append(coregenome.__len__())
            temp_coregenome = []
            # no longer first
            first = False

        # add new genome to processed list
        processed_genomes.append(feature['genome_id'])
        
        #add new genome as a key in the genome dict
        genomeDict[feature['genome_id']] = {}
        
    # if gene family has not been accounted for then add to pangenome
    if(feature['pgfam_id'] not in pangenome):
        pangenome.append(feature['pgfam_id'])
        
    # if first runthrough then the core genes are all genes in the first
    # genome, these are used as a baseline for future comparisons
    # also make sure this family has not already been accounted for
    if first and feature['pgfam_id'] not in coregenome:
        coregenome.append(feature['pgfam_id'])
    # if we already have a core genome, the present feature is in the core genome of the past strains, and not already added
    # then include this feature in the next core genome
    elif((feature['pgfam_id'] in coregenome) and (feature['pgfam_id'] not in temp_coregenome)):
        temp_coregenome.append(feature['pgfam_id'])

    if feature['pgfam_id'] not in genomeDict[feature['genome_id']]:
        genomeDict[feature['genome_id']][feature['pgfam_id']] = 1
    else:
        genomeDict[feature['genome_id']][feature['pgfam_id']] += 1
        
     # if the cross genus family has not been accounted for then add it to the genome dictionary
    if feature['pgfam_id'] not in familyDict:
        familyDict[feature['pgfam_id']] = [feature['genome_id']]
    # if the cross genus family has been accounted for, but the strain it came from has not been accounted for, update the
    # key's list
    elif feature['genome_id'] not in familyDict[feature['pgfam_id']]:
        familyDict[feature['pgfam_id']].append(feature['genome_id'])
        
        
# make sure to add the last genome processed to have final pangenome and coregenome sizes
pangenome_sizes.append(pangenome.__len__())
coregenome = list(temp_coregenome)
coregenome_sizes.append(coregenome.__len__())
genome_sizes.append(count)

In [48]:
len(pangenome)

31582

In [None]:
#genomeDict

In [49]:
#familyDict

In [11]:
for family,genome in familyDict.items():
    # if this cross-genus family is not apart of the coregenome and the cross-genus family is not unique to one strain
    # add this cross-genus family and associated strains to the accessoryGenome
    if  genome.__len__() < processed_genomes.__len__() and genome.__len__()>1:
        accessoryGenome[family] = genome

In [12]:
new_heat = []
for i in range(0, accessoryGenome.keys().__len__()):
    temp = []
    for j in range(0,processed_genomes.__len__()):
        temp.append(0)
    new_heat.append(temp)

In [13]:
accessoryGenomeList = list(accessoryGenome.keys())

In [59]:
#random.shuffle(processed_genomes)

In [14]:
num = 0

for strain in processed_genomes:
    for family, count in genomeDict[strain].items():
        if(family in accessoryGenomeList):
            new_heat[accessoryGenomeList.index(family)][num] = 1
            
    num+=1

In [50]:
#new_heat

In [16]:
# number of genomes processed
print(processed_genomes.__len__())
# size of core genome
print(coregenome.__len__())

100
251


In [17]:
plotly.offline.init_notebook_mode(connected=True)



trace1 = go.Scatter(
    x = list(range(1,218)),
    y = coregenome_sizes,
    mode = 'lines',
    name = 'core genome'
)
trace2 = go.Scatter(
    x = list(range(1,218)),
    y = pangenome_sizes,
    mode = 'lines',
    name = 'pan genome'
)
trace3 = go.Scatter(
    x = list(range(1,218)),
    y = genome_sizes,
    mode = 'lines',
    name = 'added genome'
)
data = [trace1, trace2,trace3]

layout = dict(title = 'Lactobacillus Analysis',
              xaxis = dict(title = 'Number of Strains Added'),
              yaxis = dict(title = 'Number of Unique Gene Families'),
              )

fig = dict(data=data, layout=layout)

plotly.offline.iplot(fig, filename='pangenome-line')

In [18]:
# placeholder processed genome array
x = []
count = 1
for i in range(0,processed_genomes.__len__()):
    x.append(count)
    count +=1
    
# placeholder accessory genome list
y = []
count = 1
for i in range(0,accessoryGenomeList.__len__()):
    y.append(count)
    count +=1

In [19]:
trace = go.Heatmap(z=new_heat, x=x, y=y )

data=[trace]
            
# Set layout
layout = go.Layout(
    title= 'Lactobacillus Accessory Genome',
    xaxis=dict(
        title='Lactobacillus Strain',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Accessory Gene Family',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename='lactobacillus-heatmap.html')

'file:///home/bneubert/code/lactobacillus-heatmap.html'

In [20]:
summary_heat = []
for i in range(0,len(accessoryGenomeList)):
    summary_heat.append([0])

In [21]:
accessoryGenomeList

[u'',
 u'PGF_07853908',
 u'PGF_03983202',
 u'PGF_02010654',
 u'PGF_02010655',
 u'PGF_00701237',
 u'PGF_02010659',
 u'PGF_02025614',
 u'PGF_00354837',
 u'PGF_00009169',
 u'PGF_02020882',
 u'PGF_02008562',
 u'PGF_02008561',
 u'PGF_06860984',
 u'PGF_02025619',
 u'PGF_02008564',
 u'PGF_10392558',
 u'PGF_02999965',
 u'PGF_08098596',
 u'PGF_01027157',
 u'PGF_03114390',
 u'PGF_03114396',
 u'PGF_06375348',
 u'PGF_01933855',
 u'PGF_03500742',
 u'PGF_02016272',
 u'PGF_00976587',
 u'PGF_00075770',
 u'PGF_00007935',
 u'PGF_00301865',
 u'PGF_03115963',
 u'PGF_02024747',
 u'PGF_03052038',
 u'PGF_02030767',
 u'PGF_02030764',
 u'PGF_00764110',
 u'PGF_02030762',
 u'PGF_10415011',
 u'PGF_02030760',
 u'PGF_02030761',
 u'PGF_00726863',
 u'PGF_06839556',
 u'PGF_02030768',
 u'PGF_02011385',
 u'PGF_02023354',
 u'PGF_02020081',
 u'PGF_02016173',
 u'PGF_02016172',
 u'PGF_00381824',
 u'PGF_02020438',
 u'PGF_00378260',
 u'PGF_02026874',
 u'PGF_00917208',
 u'PGF_00149990',
 u'PGF_02020585',
 u'PGF_02016179',
 u'P

In [22]:
for strain in processed_genomes:
    for family, count in genomeDict[strain].items():
        if(family in accessoryGenomeList):
            # heat of protein family at corresponding index based upon accessory genome list
            summary_heat[accessoryGenomeList.index(family)][0] += 1

In [23]:
trace = go.Heatmap(z=summary_heat, x=[], y=y )

data=[trace]
            
# Set layout
layout = go.Layout(
    title= 'Lactobacillus Accessory Genome',
    xaxis=dict(
        title='Lactobacillus Strain',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Accessory Gene Family',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename='lactobacillus-summary-heatmap.html')

'file:///home/bneubert/code/lactobacillus-summary-heatmap.html'

In [108]:
len(summary_heat)
len(y)

17505

In [109]:
count = 0
for i in range(0,len(summary_heat)):
    count += summary_heat[i][0]
count

150872

In [40]:
#takes a list of patric_id's and uses the feature file to pull out the pgfam_id
def pgfam_finder(patric_ids):
    feature_files = os.listdir('/home/bneubert/Lactobacillus/features')
    output = {}
    for id in patric_ids:
        output[id] = ''
        count = 0
        for file in feature_files:
            temp = pickle.load(open('/home/bneubert/Lactobacillus/features/'+file))
            for i in range(0,len(temp)):
                if(id == temp[i]['patric_id']):                
                    if('pgfam_id' in temp[i].keys()):
                        output[id] = temp[i]['pgfam_id']
                    else:
                        output[id] = '0'
                
            count += 1
            if (count is 10):
                break
    return output
    

In [47]:
pgfam_finder([u'fig|1001583.3.peg.1295',u'fig|1001583.3.peg.588',u'fig|1001583.3.rna.28'])

HERE
HERE
HERE


{u'fig|1001583.3.peg.1295': u'PGF_06511905',
 u'fig|1001583.3.peg.588': u'PGF_03708253',
 u'fig|1001583.3.rna.28': '0'}

In [26]:
test = pickle.load(open('/home/bneubert/Lactobacillus/features/1001583.3.feats'))


In [34]:
marker = test[200]['patric_id']

In [46]:
test[5]

{u'accession': u'AP012167',
 u'alt_locus_tag': u'VBILacBre187236_r028',
 u'annotation': u'PATRIC',
 u'date_inserted': u'2014-10-21T00:33:29.554Z',
 u'date_modified': u'2014-10-22T09:01:13.537Z',
 u'document_type': u'genome_feature',
 u'end': 925404,
 u'feature_id': u'PATRIC.1001583.3.AP012167.tRNA.925332.925404.fwd',
 u'feature_type': u'tRNA',
 u'gene_id': 0,
 u'genome_id': u'1001583.3',
 u'genome_name': u'Lactobacillus brevis KB290',
 u'gi': 0,
 u'location': u'925332..925404',
 u'na_length': 73,
 u'na_sequence': u'ggaggattagctcagttgggagagcgtctgccttacaagcagagggtcacaggttcgagccctgtatcctcca',
 u'owner': u'PATRIC',
 u'p2_feature_id': 124906975,
 u'patric_id': u'fig|1001583.3.rna.28',
 u'pos_group': u'AP012167:925404:+',
 u'product': u'tRNA-Val-TAC',
 u'public': True,
 u'segments': [u'925332..925404'],
 u'sequence_id': u'AP012167',
 u'start': 925332,
 u'strand': u'+',
 u'taxon_id': 1001583}