In [1]:
import os
# NOTE: this is a relative path, so it resolves against the current working
# directory. Re-running this cell in the same kernel will raise
# FileNotFoundError unless IBM_pickled_data/ contains a directory of the same name.
os.chdir('IBM_pickled_data/') # change directory
import pickle
import csv
from pprint import pprint
# Make sure "ClusterMapData.py" is in your working directory and that "typing" is also installed
from ClusterMapData import ClusterMapData
# Background reading for the linkage arrays and cluster maps loaded below:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
# https://seaborn.pydata.org/generated/seaborn.clustermap.html

# Pick which genera to analyze

In [3]:
# Genera to analyze; each name is used to build the '<dir>/<genus>.pickle' paths read below.
genera = ['escherichia', 'shigella', 'listeria', 'salmonella']

# also includes campylobacter, but we aren't using this data

# Extract data from pivot vs neighbor domains
Notice that we have to specify which folder the data is in. In the examples below, we are loading data from the "domain/" directory.

The "domain/" directory contains pickle and png files from the analysis of pivot domain architecture vs neighbor domain architecture. We likely won't be using this much for our primary analysis.

In [4]:
# Banner: one print call, one line per argument.
print(
    'Pivot Domain vs. Neighbor Domain',
    'Example for escherichia dataset.',
    'Below are the data objects we in the pickled data for each genus.',
    '--------------------------------',
    '',
    sep='\n',
)

# Attributes previewed (first entry only) for each loaded object, in print order.
preview_fields = (
    'row_labels',
    'row_linkage',
    'col_labels',
    'col_linkage',
    'col_colors',
    'col_color_indices',
)

# Only the first genus is loaded here, as a worked example.
for genus in genera[:1]:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f'domain/{genus}.pickle', 'rb') as pickle_file:
        cluster_map_data = pickle.load(pickle_file)

    # The file is already closed; everything below works on the in-memory object.
    print(f'genus: {genus}')
    print(f'shape={cluster_map_data.data.shape}')
    for field in preview_fields:
        print(f'{field}={getattr(cluster_map_data, field)[:1]}')
    print('')

Pivot Domain vs. Neighbor Domain
Example for escherichia dataset.
Below are the data objects we in the pickled data for each genus.
--------------------------------

genus: escherichia
shape=(899, 3700)
row_labels=['000e97918ef5156f91e7b45faccb8a34']
row_linkage=[[2.83000000e+02 6.05000000e+02 6.44261629e-04 2.00000000e+00]]
col_labels=['702149cb9bb9153481a1eac12614ce4c:D']
col_linkage=[[  1. 301.   0.   2.]]
col_colors=['#2980B9']
col_color_indices=[1]



In [5]:
# Banner: one print call, one line per argument.
print(
    'Pivot Domain vs. Neighbor Domain',
    "Example for escherichia dataset",
    '--------------------------------',
    '',
    sep='\n',
)

# Report only the matrix shape for the first genus.
for genus in genera[:1]:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f'domain/{genus}.pickle', 'rb') as pickle_file:
        cluster_map_data = pickle.load(pickle_file)
    print(f'shape={cluster_map_data.data.shape}')
        

Pivot Domain vs. Neighbor Domain
Example for escherichia dataset
--------------------------------

shape=(899, 3700)


# Extract data from genome vs domain
Now, let's get data from the "genome/" directory.

This contains pickle and png files from the analysis of genome vs domain architecture.

In [3]:
# Banner: one print call, one line per argument.
print('Genome vs. Domain', '-----------------', '', sep='\n')

# Attributes previewed (first entry only) for each loaded object, in print order.
preview_fields = (
    'row_labels',
    'row_linkage',
    'col_labels',
    'col_linkage',
    'col_colors',
    'col_color_indices',
)

# Load every genus from the "genome/" directory and show a one-entry preview of each attribute.
for genus in genera:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f'genome/{genus}.pickle', 'rb') as pickle_file:
        cluster_map_data = pickle.load(pickle_file)

    # The file is already closed; everything below works on the in-memory object.
    print(f'genus: {genus}')
    print(f'shape={cluster_map_data.data.shape}')
    for field in preview_fields:
        print(f'{field}={getattr(cluster_map_data, field)[:1]}')
    print('')


Genome vs. Domain
-----------------

genus: escherichia
shape=(11339, 4599)
row_labels=['DRR015927']
row_linkage=[[3.933e+03 4.780e+03 0.000e+00 2.000e+00]]
col_labels=['fdaf9740209c89757d7863d311697df7:P']
col_linkage=[[7.82000000e+02 7.83000000e+02 3.33671063e-01 2.00000000e+00]]
col_colors=['#2980B9']
col_color_indices=[2]

genus: shigella
shape=(5660, 3882)
row_labels=['DRR015915']
row_linkage=[[3.345e+03 3.846e+03 0.000e+00 2.000e+00]]
col_labels=['fdaf9740209c89757d7863d311697df7:P']
col_linkage=[[ 795. 1613.    0.    2.]]
col_colors=['#2980B9']
col_color_indices=[2]

genus: listeria
shape=(7968, 1876)
row_labels=['DRR015836']
row_linkage=[[8.170e+02 2.012e+03 0.000e+00 2.000e+00]]
col_labels=['fdaf9740209c89757d7863d311697df7:P']
col_linkage=[[ 351. 1506.    0.    2.]]
col_colors=['#2980B9']
col_color_indices=[2]

genus: salmonella
shape=(39777, 3919)
row_labels=['DRR021403']
row_linkage=[[9.4980e+03 1.4264e+04 0.0000e+00 2.0000e+00]]
col_labels=['fdaf9740209c89757d7863d311697df

# Extract the genome IDs

In [6]:
# Banner: one print call, one line per argument.
print('Genome vs. Domain', '-----------------', '', sep='\n')

# Maps each genus name to the row labels of its genome-vs-domain data.
# (The previews above suggest these labels are run accessions such as 'DRR015927' -- TODO confirm.)
genera_runs = {}

for genus in genera:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f'genome/{genus}.pickle', 'rb') as pickle_file:
        cluster_map_data = pickle.load(pickle_file)
    print(f'genus: {genus}')
    genera_runs[genus] = cluster_map_data.row_labels


Genome vs. Domain
-----------------

genus: escherichia
genus: shigella
genus: listeria
genus: salmonella


## Write out a file of genome IDs for each genus

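Note: `%%capture cap` only assigns `cap` after the cell has finished running, so a cell that reads `cap.stdout` in its own body sees the output of the previous run. Any cell written that way has to be run twice (and fails with a `NameError` on a fresh kernel).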

In [15]:
# Write the escherichia genome IDs to a text file, one ID per line.
# The file is opened once and each ID is written straight to it, so the cell
# gives the right result on a single run from a fresh kernel and does not
# depend on output captured by a previously executed cell.
with open('high_quality_escherichia_genomes.txt', 'w') as f:
    for SRR in genera_runs['escherichia']:
        print(SRR, file=f)


In [56]:
# Write the shigella genome IDs to a text file, one ID per line.
# The file is opened once and each ID is written straight to it, so the cell
# gives the right result on a single run from a fresh kernel and does not
# depend on output captured by a previously executed cell.
with open('high_quality_shigella_genomes.txt', 'w') as f:
    for SRR in genera_runs['shigella']:
        print(SRR, file=f)


In [61]:
# Write the listeria genome IDs to a text file, one ID per line.
# The file is opened once and each ID is written straight to it, so the cell
# gives the right result on a single run from a fresh kernel and does not
# depend on output captured by a previously executed cell.
with open('high_quality_listeria_genomes.txt', 'w') as f:
    for SRR in genera_runs['listeria']:
        print(SRR, file=f)


In [63]:
# Write the salmonella genome IDs to a text file, one ID per line.
# The file is opened once and each ID is written straight to it, so the cell
# gives the right result on a single run from a fresh kernel and does not
# depend on output captured by a previously executed cell.
with open('high_quality_salmonella_genomes.txt', 'w') as f:
    for SRR in genera_runs['salmonella']:
        print(SRR, file=f)


In [9]:
# Exploratory check: list the attributes of the salmonella row-label container.
# The listing below shows the built-in list methods (append, extend, sort, ...).
dir(genera_runs['salmonella'])

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']