In [1]:
# imports for later usage
import pickle
import os
import plotly
import plotly.graph_objs as go
import random

In [2]:
feature_files = os.listdir('/home/bneubert/Lactobacillus/features')

In [34]:
feature_list = []
counter = 0
for file in feature_files:
    # load in the feature file as a pickle object, this will be a list
    temp = pickle.load(open('/home/bneubert/Lactobacillus/features/'+file))
    # add every feature to the total feature list for later use
    for i in range(0, len(temp)):
        # exclude features that do not have a pgfam_id
        if('pgfam_id' in temp[i].keys()):
            feature_list.append(temp[i])
    # limiter, will be removed to process full data set so as to not take too much time
    counter += 1
    if(counter is 200):
        break


In [35]:
pangenome = [] 
coregenome = []
pangenome_sizes = []
coregenome_sizes = []
genome_sizes = []
processed_genomes = []
temp_coregenome = []
current_features = []
genomeDict = {} 
second = True # boolean used to delineate first and second genome's addition, becomes false on third genome
familyDict = {} # will contain all unique cross-genus families and their associated strains
accessoryGenome = {}

In [36]:
# NOTE: analysis assumes the genomic features are grouped together by their genomic id since they were read in by feature files


for feature in feature_list:
    
    
    # if the genome is new add genome to processed genomes
    if(feature['genome_id'] not in processed_genomes):
        # if we just switched to a new genome then
        # add the new pangenome and core genome sizes to their lists
        # also update core genome
        
        if(len(processed_genomes) > 0):
            pangenome_sizes.append(len(pangenome))
            genome_sizes.append(len(current_features))
            if not second:
                # new coregenome is simply the previous temp coregenome
                coregenome = list(temp_coregenome)
            else:
                # no longer second
                second = False
            coregenome_sizes.append(len(coregenome))
            temp_coregenome = []
            

        # add new genome to processed list
        processed_genomes.append(feature['genome_id'])
        
        #add new genome as a key in the genome dict
        genomeDict[feature['genome_id']] = {}
        
        # reset current features
        current_features = []
      
    # if the gene family has not been accounted for in the current genome's features, add it
    if(feature['pgfam_id'] not in current_features):
        current_features.append(feature['pgfam_id'])
    
    # if gene family has not been accounted for then add to pangenome
    if(feature['pgfam_id'] not in pangenome):
        pangenome.append(feature['pgfam_id'])
    
    # if first runthrough then the core genes are all genes in the first
    # genome, these are used as a baseline for future comparisons   
    if (len(processed_genomes) is 1):
        # also make sure this family has not already been accounted for
        if(feature['pgfam_id'] not in coregenome):
            coregenome.append(feature['pgfam_id'])
    
    # if we already have a core genome, the present feature is in the core genome of the past strains, and not already added
    # then include this feature in the next core genome
    elif((feature['pgfam_id'] in coregenome) and (feature['pgfam_id'] not in temp_coregenome)):
        temp_coregenome.append(feature['pgfam_id'])

        
    # update the genome dictionary
    if (feature['pgfam_id'] not in genomeDict[feature['genome_id']]):
        genomeDict[feature['genome_id']][feature['pgfam_id']] = 1
    else:
        genomeDict[feature['genome_id']][feature['pgfam_id']] += 1
        
    # if the cross genus family has not been accounted for then add it to the family dictionary
    if feature['pgfam_id'] not in familyDict:
        familyDict[feature['pgfam_id']] = [feature['genome_id']]
    # if the cross genus family has been accounted for, but the strain it came from has not been accounted for, update the
    # key's list
    elif feature['genome_id'] not in familyDict[feature['pgfam_id']]:
        familyDict[feature['pgfam_id']].append(feature['genome_id'])
        
        
# make sure to add the last genome processed to have final pangenome and coregenome sizes
pangenome_sizes.append(pangenome.__len__())
coregenome = list(temp_coregenome)
coregenome_sizes.append(coregenome.__len__())
genome_sizes.append(len(current_features))

In [38]:
len(pangenome)

47546

In [39]:
#genomeDict

In [40]:
#familyDict

In [41]:
for family,genome in familyDict.items():
    # if this cross-genus family is not apart of the coregenome and the cross-genus family is not unique to one strain
    # add this cross-genus family and associated strains to the accessoryGenome
    if((len(genome) < len(processed_genomes)) and (len(genome)>1)):
        accessoryGenome[family] = genome

In [42]:
full_heat = []
for i in range(0, accessoryGenome.keys().__len__()):
    temp = []
    for j in range(0,processed_genomes.__len__()):
        temp.append(0)
    full_heat.append(temp)

In [43]:
accessoryGenomeList = list(accessoryGenome.keys())

In [45]:
num = 0

for strain in processed_genomes:
    for family, count in genomeDict[strain].items():
        # if strain has the family and the family is in the accessory genome then the index becomes 1 in the heat indicating presence
        if(family in accessoryGenomeList):
            # family indices based upon accessory genome list, strain indices based upon processed_genomes
            full_heat[accessoryGenomeList.index(family)][num] = 1
            
    num += 1

In [46]:
#full_heat

In [47]:
# Summary information

print("Number of genomes processed: " + str(len(processed_genomes)))
print("Number of unique protein families in the core genome: "+ str(len(coregenome)))
print("Number of unique protein families in the pan genome: "+ str(len(pangenome)))
print("Number of unique protein families in the accessory genome: "+ str(len(accessoryGenomeList)))
print("Average number of unique protein families in each processed genome: "+ str(sum(genome_sizes)/len(genome_sizes)))

Number of genomes processed: 200
Number of unique protein families in the core genome: 87
Number of unique protein families in the pan genome: 47546
Number of unique protein families in the accessory genome: 27633
Average number of unique protein families in each processed genome: 2032


In [48]:
plotly.offline.init_notebook_mode(connected=True)

trace1 = go.Scatter(
    x = list(range(1,218)),
    y = coregenome_sizes,
    mode = 'lines',
    name = 'core genome'
)
trace2 = go.Scatter(
    x = list(range(1,218)),
    y = pangenome_sizes,
    mode = 'lines',
    name = 'pan genome'
)
trace3 = go.Scatter(
    x = list(range(1,218)),
    y = genome_sizes,
    mode = 'lines',
    name = 'added genome'
)
data = [trace1, trace2,trace3]

layout = dict(title = 'Lactobacillus Analysis',
              xaxis = dict(title = 'Number of Strains Added'),
              yaxis = dict(title = 'Number of Unique Gene Families'),
              )

fig = dict(data=data, layout=layout)

plotly.offline.iplot(fig, filename='pangenome-line')

In [49]:
# placeholder processed genome array
x = []
count = 0
for i in range(0,processed_genomes.__len__()):
    x.append(count)
    count +=1
    
# placeholder accessory genome list
y = []
count = 0
for i in range(0,accessoryGenomeList.__len__()):
    y.append(count)
    count +=1

In [50]:
trace = go.Heatmap(z=full_heat, x=x, y=y )

data=[trace]
            
# Set layout
layout = go.Layout(
    title= 'Lactobacillus Accessory Genome',
    xaxis=dict(
        title='Lactobacillus Strain',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Accessory Gene Family',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename='lactobacillus-heatmap.html')

'file:///home/bneubert/Lactobacillus/Code/lactobacillus-heatmap.html'

In [51]:
summary_heat = []
for i in range(0,len(accessoryGenomeList)):
    summary_heat.append([0])

In [52]:
# accessoryGenomeList

In [53]:
for strain in processed_genomes:
    for family, count in genomeDict[strain].items():
        if(family in accessoryGenomeList):
            # heat of protein family at corresponding index based upon accessory genome list
            summary_heat[accessoryGenomeList.index(family)][0] += 1

In [54]:
trace = go.Heatmap(z=summary_heat, x=[], y=y )

data=[trace]
            
# Set layout
layout = go.Layout(
    title= 'Lactobacillus Accessory Genome',
    xaxis=dict(
        title='Lactobacillus Strain',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Accessory Gene Family',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename='lactobacillus-summary-heatmap.html')

'file:///home/bneubert/Lactobacillus/Code/lactobacillus-summary-heatmap.html'

In [None]:
# convert model gene pickle objects to same form as the feature files so the same analysis can be run
file_name = "/home/bneubert/model.features"
model_genes = pickle.load(open(file_name, "rb"))