In [None]:
# Install with the %pip magic so the package lands in the environment of the
# running kernel, which is not guaranteed for whichever `pip3` is on PATH.
%pip install networkx

In [None]:
import site
import sys

# Make user-site packages importable. `site.USER_SITE` may still be None when
# `site.getusersitepackages()` has not been called, so ask for the path
# explicitly; skip the append when the entry is already present so that
# re-running this cell does not keep growing `sys.path`.
_user_site = site.getusersitepackages()
if _user_site not in sys.path:
    sys.path.append(_user_site)

In [None]:
import os
import logging

from multiprocessing import Pool, cpu_count
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from cosifer.combiners import COMBINERS, RECOMMENDED_COMBINER
from cosifer.inferencers import INFERENCERS, RECOMMENDED_INFERENCERS
from cosifer.pipelines.pipeline_cli import run as cli_run, run_combiner, get_interaction_tables
from cosifer.pipelines.pipeline_gui import run as gui_run

# Only let log records of level WARNING and above through the matplotlib logger.
matplotlib_logger = logging.getLogger('matplotlib')
matplotlib_logger.setLevel(logging.WARNING)

# Absolute path of the input matrix, resolved against the current working directory.
FILEPATH = os.path.abspath('data_matrix.csv')

# Application of cosifer pipeline with dummy data

In [None]:
# to download the dummy data file
# from [Box](https://ibm.box.com/s/x6fvci11k6x7napo1wgt74mgs5jvfy9x):
# !mkdir -p /data/demo
# !curl -L https://ibm.box.com/shared/static/x6fvci11k6x7napo1wgt74mgs5jvfy9x.csv -o /data/demo/data_matrix.csv
# !ls -l /data/demo
# !head -n 3 /data/demo/data_matrix.csv
# FILEPATH = '/data/demo/data_matrix.csv'

In [None]:
# Load the data matrix into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=FILEPATH)
df.head(n=5)

## Run a pipeline
We can use either the function behind the command line interface or the one behind the
[web service](http://www.ibm.biz/cosifer-aas).

In [None]:
# Show the docstrings of both pipeline entry points, gui first and then cli.
for pipeline_entry_point in (gui_run, cli_run):
    print(pipeline_entry_point.__doc__)

The main **difference** is that the gui version works in memory: it doesn't read
an input file but accepts a `pd.DataFrame`, and it doesn't save the individual
networks to disk, only the final combined one.
(The gui version also does not offer repeating the run for multiple feature
subsets via `gmt_filepath`.)

The gui version is therefore a bit easier to inspect and to get an idea of what is
happening, but later we will use functions from the cli pipeline to run
things in parallel.

They have in **common** that they sequentially
- run different network inference methods in series
- run a combiner

In [None]:
# The gui pipeline writes the final combined network to this path. Make sure
# the target directory exists first: the `mkdir` in the download cell is
# commented out, so on a fresh machine it may be missing.
gui_results_filepath = '/data/demo/gui_default_inference.csv.gz'
os.makedirs(os.path.dirname(gui_results_filepath), exist_ok=True)
gui_run(
    df, gui_results_filepath,
    # methods=INFERENCERS.keys()  # run all methods
)

This ran some default inference methods and the default combiner

In [None]:
# the recommended inferencers next to the names of every available one
(RECOMMENDED_INFERENCERS, list(INFERENCERS))

In [None]:
# the recommended combiner next to the names of every available one
(RECOMMENDED_COMBINER, list(COMBINERS))

... and resulted in a compressed csv.

In [None]:
# Decompress the gui pipeline result to stdout and show only its first lines.
!gzip -cd /data/demo/gui_default_inference.csv.gz | head


In [None]:
# Read the compressed table written by the gui pipeline; the first row is the
# header and the first column the index.
combined_network = pd.read_csv(
    '/data/demo/gui_default_inference.csv.gz',
    compression='gzip', header=0, index_col=0,
)
# Rank the rows from highest to lowest intensity.
inference_results = combined_network.sort_values(by='intensity', ascending=False)
# number of rows in the combined table
len(inference_results)

In [None]:
# The table is sorted by descending intensity, so its first 50 rows are the
# strongest edges; build a graph from them, keeping intensity as edge attribute.
top_edges = inference_results[:50]
G = nx.from_pandas_edgelist(
    top_edges, source='e1', target='e2', edge_attr='intensity'
)
# spring layout with the node names drawn on top
nx.draw_spring(G, with_labels=True)


## Not running a pipeline
This allows more fine-grained control, for example using only the consensus
methods on your own interaction networks.


Here we show how to run different inference methods in parallel.
Saving the networks of individual inference methods allows running multiple
combiners without recomputing them.

In [None]:
# Directory holding one `<method name>.csv.gz` network per inference method.
inference_directory = '/data/demo/parallel/inference/'
# Output location handed to the combiner.
combiner_directory = '/data/demo/parallel/combiner/'
# Module-level alias that `run_single_method` reads as its input data.
data = df


def run_single_method(name, inferencer, output_directory=None, input_data=None):
    """Run one inference method unless its result is already stored.

    Args:
        name (str): method name, used as the stem of the output file.
        inferencer: object exposing a ``filepath`` attribute, ``load()`` and
            ``infer_network(data)`` methods and a ``trained`` flag
            (presumably a cosifer ``NetworkInferencer`` instance).
        output_directory (str, optional): directory receiving
            ``<name>.csv.gz``. Defaults to the module-level
            ``inference_directory``.
        input_data (optional): data handed to ``infer_network``. Defaults to
            the module-level ``data``.
    """
    if output_directory is None:
        output_directory = inference_directory
    if input_data is None:
        input_data = data
    # Join instead of formatting with '/': the configured directory already
    # ends with a separator, which would otherwise be doubled in the path.
    output_filepath = os.path.join(output_directory, '{}.csv.gz'.format(name))
    if os.path.exists(output_filepath):
        print(
            'inference already run and stored in {}'.
            format(output_filepath)
        )
        return
    # The directory may not exist yet on a fresh run; exist_ok keeps this safe
    # when several worker processes reach this point at the same time.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)
    inferencer.filepath = output_filepath
    inferencer.load()  # will log again that file is not there yet
    inferencer.infer_network(input_data)
    # NOTE: allow retraining on new data
    inferencer.trained = False


# Run every available inferencer; replace with a smaller name -> instance
# mapping to run only a subset.
selected_methods = INFERENCERS

Note that the values in this `INFERENCERS` dict are (stateful) instances with
default parameters.

Here you could create your own dict with different parameters, or
even instances of your own implementations inheriting from
`cosifer.network_inferencer.NetworkInferencer`.

In [None]:
if __name__ == "__main__":
    # Make sure both output locations exist before anything is written to them.
    for directory in (inference_directory, combiner_directory):
        os.makedirs(directory, exist_ok=True)

    # Run the inference methods at the same time, one (name, inferencer) pair
    # per task; Pool() without arguments sizes itself from the CPU count.
    print(f'we are running {cpu_count()} processes in parallel')
    with Pool() as pool:
        pool.starmap(run_single_method, selected_methods.items())
        # in series, this compares to
        # list(map(run_single_method, selected_methods.keys(), selected_methods.values()))
        # list(map(run_single_method, *zip(*selected_methods.items())))

    # find and read interaction tables
    tables = get_interaction_tables(inference_directory)
    # run combination
    run_combiner(RECOMMENDED_COMBINER, tables, combiner_directory)


In [None]:
# List the contents of the inference and combiner output directories.
!ls $inference_directory
!ls $combiner_directory

Done!