## Loading packages, constants, and functions

In [1]:
import functions as f
import constants as c
import classes as cl

import pandas as pd
import os, joblib, time

import netwulf as wulf
import json

import warnings

print(f"Current directories in {c.cwd}\n {os.listdir()}")

long = c.cwd + "\\raw_data\\long_alzheimers.tsv"
short = c.cwd + "\\raw_data\\short_alzheimers.tsv"



Current directories in /Users/asbjornhansen/GitRep/02807_project_Group13-3\
 ['joblib_vars', 'functions.py', 'modularity.ipynb', 'Notebook.ipynb', 'constants.py', 'runclass.ipynb', 'raw_data', '__pycache__', 'CustomPool_logs', 'README.md', '.gitignore', 'OldCode_saveforReport.ipynb', 'enrichment.ipynb', 'classes.py', '.git', 'miserables.json']


## Loading Datasets

#### Creating id_prot and saving
id_prot is a dictionary meant to point a id <integer> to a protein name <string> with 19 str characters \
id_prot is saved using joblib.dump() in dir "joblib_vars", and can be reloaded as a dict object using joblib.load(path)

In [None]:
chunksize = 10
info_df = pd.read_csv("https://stringdb-downloads.org/download/protein.info.v12.0/9606.protein.info.v12.0.txt.gz", compression='gzip', sep="\t")
info_df = info_df[["#string_protein_id"]].values
id_prot = dict()
for c, e in enumerate(info_df):
    id_prot[c] = e[0]
print(id_prot)
joblib.dump(id_prot, "./joblib_vars/id_prot.joblib")

#### Modifying raw datasets
modified datasets are stored in "/mod_data/" for easy retrieval

In [None]:
"""
An example of an interaction dataset loaded
"""
alz_int_df = pd.read_csv("./raw_data/long_alzheimers.tsv", sep="\t")
alz_int_df.columns = ["Prot1_ShortName", "Prot2_ShortName", "protein1", "protein2", "col1", "col2", "col3", "col4", "col5", "col6", "col7", "col8", "col9"]
alz_int_df

In [None]:
HS_int_df = pd.read_csv("https://stringdb-downloads.org/download/protein.links.detailed.v12.0/9606.protein.links.detailed.v12.0.txt.gz", compression="gzip", sep=" ")
HS_int_df = HS_int_df[HS_int_df["experimental"] > 0] #removing interactions with no experimental relevance 
#joblib.dump(HS_int_df, "./joblib_vars/HS_int_df") #FILE TOO LARGE 
HS_int_df


Creating HS_int_simple, containing protein1 and protein2 names converted to int using id_prot. \
If all interactions are weight=1, only the names (ids) of the proteins are needed when constructing the interaction network

In [1]:
id_prot = joblib.load("./joblib_vars/id_prot.joblib")

#Swapping key value in dict
id_swap = dict()
for k, v in id_prot.items():
    id_swap[str(v)] = k
print(id_swap)

#exporting modified HS_int
HS_int_simple = HS_int_df.map(lambda x: id_swap[x] if x in id_swap else x).reset_index()
HS_int_simple = HS_int_simple[["protein1", "protein2"]]
joblib.dump(HS_int_simple, "./joblib_vars/HS_int_simple.joblib")


NameError: name 'joblib' is not defined

## Running the class

### Setting the color of the progress bar

In [2]:
%%html
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

### Running the class

In [3]:
warnings.filterwarnings("ignore", category=UserWarning, message=r".*urllib3.*NotOpenSSLWarning.*")

a = cl.interaction_network(testdataset = True, threshold=0.995)
a.create_encoding_dict()
a.load_data()
a.cluster()

Creating encoding table
Fetching data
Cropping data
Given threshold: 0.995 filtered out combined_score values: 998.0 and below


Parsing data:   0%|          | 0/13761 [00:00<?, ?it/s]

Data loaded in var .vertices, with size: 0.004216mb
NEW ITERATION --- Clusters left: 509 --- Cluster length: 2808 --- Finished clusters: 0 --- Latest cluster length: 0 --- last edge severed: 
-----Density calculation-----
-----Evaluate edge removal-----
-----Finding edge to remove-----
-----Mapping-----


Child processes completed::   0%|          | 0/2808 [00:00<?, ?it/s]



-----Reducing-----
NEW ITERATION --- Clusters left: 509 --- Cluster length: 2808 --- Finished clusters: 0 --- Latest cluster length: 0 --- last edge severed: 10624-2
-----Density calculation-----
-----Evaluate edge removal-----
-----Finding edge to remove-----
-----Mapping-----


Child processes completed::   0%|          | 0/2808 [00:00<?, ?it/s]



-----Reducing-----
-----Modularity-----
-----Finalized split-----
NEW ITERATION --- Clusters left: 510 --- Cluster length: 2 --- Finished clusters: 0 --- Latest cluster length: 0 --- last edge severed: 2-7655
-----Density calculation-----
-----GSEA-----
NEW ITERATION --- Clusters left: 509 --- Cluster length: 5 --- Finished clusters: 1 --- Latest cluster length: 2 --- last edge severed: 2-7655
-----Density calculation-----
-----GSEA-----
NEW ITERATION --- Clusters left: 508 --- Cluster length: 2 --- Finished clusters: 2 --- Latest cluster length: 2 --- last edge severed: 2-7655
-----Density calculation-----
-----GSEA-----
NEW ITERATION --- Clusters left: 507 --- Cluster length: 13 --- Finished clusters: 3 --- Latest cluster length: 2 --- last edge severed: 2-7655
-----Density calculation-----
-----Evaluate edge removal-----
-----Finding edge to remove-----
-----Mapping-----


Child processes completed::   0%|          | 0/13 [00:00<?, ?it/s]



-----Reducing-----
NEW ITERATION --- Clusters left: 507 --- Cluster length: 13 --- Finished clusters: 3 --- Latest cluster length: 2 --- last edge severed: 25-5096
-----Density calculation-----
-----Evaluate edge removal-----
-----Finding edge to remove-----
-----Mapping-----


Child processes completed::   0%|          | 0/13 [00:00<?, ?it/s]



-----Reducing-----
NEW ITERATION --- Clusters left: 507 --- Cluster length: 13 --- Finished clusters: 3 --- Latest cluster length: 2 --- last edge severed: 14427-25
-----Density calculation-----
-----Evaluate edge removal-----
-----Finding edge to remove-----
-----Mapping-----


Child processes completed::   0%|          | 0/13 [00:00<?, ?it/s]



-----Reducing-----
NEW ITERATION --- Clusters left: 507 --- Cluster length: 13 --- Finished clusters: 3 --- Latest cluster length: 2 --- last edge severed: 16402-25
-----Density calculation-----
-----Evaluate edge removal-----
-----Finding edge to remove-----
-----Mapping-----


Child processes completed::   0%|          | 0/13 [00:00<?, ?it/s]



-----Reducing-----
NEW ITERATION --- Clusters left: 507 --- Cluster length: 13 --- Finished clusters: 3 --- Latest cluster length: 2 --- last edge severed: 17602-25
-----Density calculation-----
-----Evaluate edge removal-----
-----Finding edge to remove-----
-----Mapping-----


Child processes completed::   0%|          | 0/13 [00:00<?, ?it/s]



-----Reducing-----
-----Modularity-----
-----Finalized split-----
NEW ITERATION --- Clusters left: 508 --- Cluster length: 2 --- Finished clusters: 3 --- Latest cluster length: 2 --- last edge severed: 18106-25
-----Density calculation-----
-----GSEA-----
NEW ITERATION --- Clusters left: 507 --- Cluster length: 2 --- Finished clusters: 4 --- Latest cluster length: 2 --- last edge severed: 18106-25
-----Density calculation-----
-----GSEA-----
NEW ITERATION --- Clusters left: 506 --- Cluster length: 2 --- Finished clusters: 5 --- Latest cluster length: 2 --- last edge severed: 18106-25
-----Density calculation-----
-----GSEA-----


IndexError: single positional indexer is out-of-bounds

## Graphing

In [20]:
### Test data from netwulf ###
with open("./miserables.json", "r") as jsonfile:
    allstring = ""
    for line in jsonfile:
        allstring += line.rstrip()
data = json.loads(allstring)
print(data)
nx.visualize(data)

{'nodes': [{'id': 'Myriel', 'size': 1}, {'id': 'Napoleon', 'size': 1}, {'id': 'Mlle.Baptistine', 'size': 1}, {'id': 'Mme.Magloire', 'size': 1}, {'id': 'CountessdeLo', 'size': 1}, {'id': 'Geborand', 'size': 1}, {'id': 'Champtercier', 'size': 1}, {'id': 'Cravatte', 'size': 1}, {'id': 'Count', 'size': 1}, {'id': 'OldMan', 'size': 1}, {'id': 'Labarre', 'size': 2}, {'id': 'Valjean', 'size': 2}, {'id': 'Marguerite', 'size': 3}, {'id': 'Mme.deR', 'size': 2}, {'id': 'Isabeau', 'size': 2}, {'id': 'Gervais', 'size': 2}, {'id': 'Tholomyes', 'size': 3}, {'id': 'Listolier', 'size': 3}, {'id': 'Fameuil', 'size': 3}, {'id': 'Blacheville', 'size': 3}, {'id': 'Favourite', 'size': 3}, {'id': 'Dahlia', 'size': 3}, {'id': 'Zephine', 'size': 3}, {'id': 'Fantine', 'size': 3}, {'id': 'Mme.Thenardier', 'size': 4}, {'id': 'Thenardier', 'size': 4}, {'id': 'Cosette', 'size': 5}, {'id': 'Javert', 'size': 4}, {'id': 'Fauchelevent', 'size': 0}, {'id': 'Bamatabois', 'size': 2}, {'id': 'Perpetue', 'size': 3}, {'id': 

(None, None)

In [2]:
### Our data format ###
# clusters = [ [Enrichment label, {"prot1": {"prot2" : edge_score} } ] , [Enrichment label, {"prot1": {"prot2" : edge_score} } ] ...]
test_data = [["enrich_label1", {"prot1": {"prot2" : 2}}], ["enrich_label2", {"prot2": {"prot3" : 2}}], ["enrich_label3", {"prot3": {"prot1" : 2}}], ["enrich_labe14", {"prot4": {"prot2" : 2}}]]

network = f.construct_graph(test_data, debug_mode=True)
print(network)
wulf.visualize(network)

##### Splitting label and cluster dict #####
['enrich_label1', {'prot1': {'prot2': 2}}]
['enrich_label2', {'prot2': {'prot3': 2}}]
['enrich_label3', {'prot3': {'prot1': 2}}]
['enrich_labe14', {'prot4': {'prot2': 2}}]
##### Adding nodes to graph #####
--- Node collected: prot1 ---
--- Node collected: prot2 ---
--- Node collected: prot3 ---
--- Node collected: prot4 ---
##### Adding edges to graph #####
--- Edge added between: ('prot1', 'prot2') ---
--- Edge added between: ('prot2', 'prot3') ---
--- Edge added between: ('prot3', 'prot1') ---
--- Edge added between: ('prot4', 'prot2') ---
Graph named 'PPI_GraphNetwork' with 4 nodes and 4 edges


(None, None)

In [8]:
print(list(network.nodes(data=True)))
list(network.nodes(data=True))[:3]

[('prot1', {}), ('prot3', {}), ('prot2', {}), ('prot4', {})]


[('prot1', {}), ('prot3', {}), ('prot2', {})]

In [11]:
for k, v in network.nodes(data=True):
    print(k, v)
    v["group"] = v["block"]; del v["block"]
    print(k, v)

prot1 {}


KeyError: 'block'

In [2]:
test_list = []
print(len(test_list))

0
