## Loading packages, constants, and functions

In [1]:
import functions as f
import constants as c
import classes as cl

import pandas as pd
import os, joblib, time

from netwulf import visualize
print(f"Current directories in {c.cwd}\n {os.listdir()}")

long = c.cwd + "\\raw_data\\long_alzheimers.tsv"
short = c.cwd + "\\raw_data\\short_alzheimers.tsv"

Current directories in /Users/s243564/Computational Tools/02807_project_Group13-1\
 ['joblib_vars', 'functions.py', 'test_data', 'Notebook.ipynb', 'constants.py', 'raw_data', '__pycache__', 'OldCode.ipynb', 'README.md', '.gitignore', 'comp_tls', 'classes.py', '.git']


## Loading Datasets

#### Creating id_prot and saving
id_prot is a dictionary meant to point a id <integer> to a protein name <string> with 19 str characters \
id_prot is saved using joblib.dump() in dir "joblib_vars", and can be reloaded as a dict object using joblib.load(path)

In [2]:
chunksize = 10
info_df = pd.read_csv("https://stringdb-downloads.org/download/protein.info.v12.0/9606.protein.info.v12.0.txt.gz", compression='gzip', sep="\t")
info_df = info_df[["#string_protein_id"]].values
id_prot = dict()
for c, e in enumerate(info_df):
    id_prot[c] = e[0]
print(id_prot)
joblib.dump(id_prot, "./joblib_vars/id_prot.joblib")

{0: '9606.ENSP00000000233', 1: '9606.ENSP00000000412', 2: '9606.ENSP00000001008', 3: '9606.ENSP00000001146', 4: '9606.ENSP00000002125', 5: '9606.ENSP00000002165', 6: '9606.ENSP00000002596', 7: '9606.ENSP00000002829', 8: '9606.ENSP00000003084', 9: '9606.ENSP00000003100', 10: '9606.ENSP00000003302', 11: '9606.ENSP00000004531', 12: '9606.ENSP00000004982', 13: '9606.ENSP00000005178', 14: '9606.ENSP00000005226', 15: '9606.ENSP00000005257', 16: '9606.ENSP00000005260', 17: '9606.ENSP00000005284', 18: '9606.ENSP00000005286', 19: '9606.ENSP00000005340', 20: '9606.ENSP00000005386', 21: '9606.ENSP00000005587', 22: '9606.ENSP00000005995', 23: '9606.ENSP00000006015', 24: '9606.ENSP00000006053', 25: '9606.ENSP00000006275', 26: '9606.ENSP00000006526', 27: '9606.ENSP00000006658', 28: '9606.ENSP00000006724', 29: '9606.ENSP00000006777', 30: '9606.ENSP00000007390', 31: '9606.ENSP00000007414', 32: '9606.ENSP00000007699', 33: '9606.ENSP00000007735', 34: '9606.ENSP00000008391', 35: '9606.ENSP00000008527', 3

['./joblib_vars/id_prot.joblib']

#### Modifying raw datasets
modified datasets are stored in "/mod_data/" for easy retrieval

In [3]:
"""
An example of an interaction dataset loaded
"""
alz_int_df = pd.read_csv("./raw_data/long_alzheimers.tsv", sep="\t")
alz_int_df.columns = ["Prot1_ShortName", "Prot2_ShortName", "protein1", "protein2", "col1", "col2", "col3", "col4", "col5", "col6", "col7", "col8", "col9"]
alz_int_df

Unnamed: 0,Prot1_ShortName,Prot2_ShortName,protein1,protein2,col1,col2,col3,col4,col5,col6,col7,col8,col9
0,A2M,APOA1,9606.ENSP00000323929,9606.ENSP00000236850,0.0,0.0,0.0,0.0,0.111,0.000,0.75,0.994,0.998
1,A2M,LRP1,9606.ENSP00000323929,9606.ENSP00000243077,0.0,0.0,0.0,0.0,0.132,0.504,0.00,0.997,0.998
2,A2M,APOE,9606.ENSP00000323929,9606.ENSP00000252486,0.0,0.0,0.0,0.0,0.270,0.292,0.00,0.957,0.975
3,A2M,SORL1,9606.ENSP00000323929,9606.ENSP00000260197,0.0,0.0,0.0,0.0,0.000,0.066,0.00,0.574,0.585
4,A2M,ADAM10,9606.ENSP00000323929,9606.ENSP00000260408,0.0,0.0,0.0,0.0,0.000,0.000,0.00,0.498,0.497
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1354,TOMM40,VDAC1,9606.ENSP00000410339,9606.ENSP00000378487,0.0,0.0,0.0,0.0,0.063,0.428,0.75,0.749,0.962
1355,TREM2,UNC5C,9606.ENSP00000362205,9606.ENSP00000406022,0.0,0.0,0.0,0.0,0.000,0.000,0.00,0.495,0.495
1356,TREM2,ZCWPW1,9606.ENSP00000362205,9606.ENSP00000381109,0.0,0.0,0.0,0.0,0.000,0.000,0.00,0.514,0.514
1357,VDAC1,VPS35,9606.ENSP00000378487,9606.ENSP00000299138,0.0,0.0,0.0,0.0,0.065,0.292,0.00,0.260,0.467


In [4]:
HS_int_df = pd.read_csv("https://stringdb-downloads.org/download/protein.links.detailed.v12.0/9606.protein.links.detailed.v12.0.txt.gz", compression="gzip", sep=" ")
HS_int_df = HS_int_df[HS_int_df["experimental"] > 0] #removing interactions with no experimental relevance 
#joblib.dump(HS_int_df, "./joblib_vars/HS_int_df") #FILE TOO LARGE 
HS_int_df


Unnamed: 0,protein1,protein2,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score
0,9606.ENSP00000000233,9606.ENSP00000356607,0,0,0,45,134,0,81,173
1,9606.ENSP00000000233,9606.ENSP00000427567,0,0,0,0,128,0,70,154
2,9606.ENSP00000000233,9606.ENSP00000253413,0,0,0,118,49,0,69,151
3,9606.ENSP00000000233,9606.ENSP00000493357,0,0,0,56,53,0,457,471
4,9606.ENSP00000000233,9606.ENSP00000324127,0,0,0,0,46,0,197,201
...,...,...,...,...,...,...,...,...,...,...
13715395,9606.ENSP00000501317,9606.ENSP00000361930,0,0,0,0,238,0,0,237
13715399,9606.ENSP00000501317,9606.ENSP00000475489,0,0,0,60,99,0,126,195
13715400,9606.ENSP00000501317,9606.ENSP00000370447,0,0,0,55,111,0,79,158
13715402,9606.ENSP00000501317,9606.ENSP00000402092,0,0,0,0,67,0,146,169


Creating HS_int_simple, containing protein1 and protein2 names converted to int using id_prot. \
If all interactions are weight=1, only the names (ids) of the proteins are needed when constructing the interaction network

In [5]:
id_prot = joblib.load("./joblib_vars/id_prot.joblib")

#Swapping key value in dict
id_swap = dict()
for k, v in id_prot.items():
    id_swap[str(v)] = k
print(id_swap)

#exporting modified HS_int
HS_int_simple = HS_int_df.map(lambda x: id_swap[x] if x in id_swap else x).reset_index()
HS_int_simple = HS_int_simple[["protein1", "protein2"]]
joblib.dump(HS_int_simple, "./joblib_vars/HS_int_simple.joblib")


{'9606.ENSP00000000233': 0, '9606.ENSP00000000412': 1, '9606.ENSP00000001008': 2, '9606.ENSP00000001146': 3, '9606.ENSP00000002125': 4, '9606.ENSP00000002165': 5, '9606.ENSP00000002596': 6, '9606.ENSP00000002829': 7, '9606.ENSP00000003084': 8, '9606.ENSP00000003100': 9, '9606.ENSP00000003302': 10, '9606.ENSP00000004531': 11, '9606.ENSP00000004982': 12, '9606.ENSP00000005178': 13, '9606.ENSP00000005226': 14, '9606.ENSP00000005257': 15, '9606.ENSP00000005260': 16, '9606.ENSP00000005284': 17, '9606.ENSP00000005286': 18, '9606.ENSP00000005340': 19, '9606.ENSP00000005386': 20, '9606.ENSP00000005587': 21, '9606.ENSP00000005995': 22, '9606.ENSP00000006015': 23, '9606.ENSP00000006053': 24, '9606.ENSP00000006275': 25, '9606.ENSP00000006526': 26, '9606.ENSP00000006658': 27, '9606.ENSP00000006724': 28, '9606.ENSP00000006777': 29, '9606.ENSP00000007390': 30, '9606.ENSP00000007414': 31, '9606.ENSP00000007699': 32, '9606.ENSP00000007735': 33, '9606.ENSP00000008391': 34, '9606.ENSP00000008527': 35, '

['./joblib_vars/HS_int_simple.joblib']

## Running the class

### Setting the color of the progress bar

In [6]:
%%html
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

### Running the class

In [7]:
a = cl.interaction_network()
a.create_encoding_dict()
a.load_data()

Creating encoding table
Fetching data
Cropping data


Parsing data:   0%|          | 0/5847852 [00:00<?, ?it/s]

Data loaded in var .vertices, with size: 0.589912mb


In [8]:
#Example code for the format of a.vertices
for root in a.vertices.keys():
    print("root", "neighbor", "norm_score")
    for neighbor in a.vertices[root].keys():
        print(root, neighbor, a.vertices[root][neighbor])
    break

root neighbor norm_score
0 9827 0.027090694935217905
0 16039 0.004711425206124852
0 1687 0.001177856301531213
0 19094 0.37809187279151946
0 6750 0.06007067137809187
0 6819 0.0353356890459364
0 6515 0.03651354534746761
0 12375 0.5229681978798587
0 18110 0.004711425206124852
0 14217 0.012956419316843345
0 17284 0.18021201413427562
0 13487 0.02120141342756184
0 1034 0.07891637220259129
0 1203 0.005889281507656066
0 6241 0.232037691401649
0 18639 0.05535924617196702
0 18036 0.20259128386336867
0 12056 0.06595995288574794
0 19153 0.09069493521790342
0 13684 0.0035335689045936395
0 3009 0.011778563015312132
0 2344 0.016489988221436984
0 11439 0.028268551236749116
0 6809 0.06007067137809187
0 5730 0.5865724381625441
0 25 0.004711425206124852
0 12198 0.06595995288574794
0 2929 0.160188457008245
0 7251 0.01884570082449941
0 10236 0.016489988221436984
0 11152 0.45111896348645464
0 756 0.06007067137809187
0 9128 0.023557126030624265
0 16848 0.0706713780918728
0 2742 0.0636042402826855
0 18354 0.0

In [9]:
a.construct_graph(a.vertices)
#visualize(a.graph_network) #DANGER: large visualization - Returns html graph, aka. opens up visualization in browser.

In [10]:
# Example usage of shortest_path
vertexes = {"A": ["B", "C"],
            "B": ["A", "C", "D"],
            "C": ["A", "B"],
            "D": ["B", "E"],
            "E": ["D"]}

# Make sure you reference the correct function name
for start_end in (paths := f.shortest_path("A", vertexes)):
    print(start_end, paths[start_end], sep = ":   ")

A->B:   A_B
A->C:   A_C
A->D:   A_B_D
A->E:   A_B_D_E


In [None]:
# Example usage of check_connection
vertexes = {"A": ["B", "C"],
            "B": ["A", "C", "D"],
            "C": ["A", "B"],
            "D": ["B", "E"],
            "E": ["D"],
            "F": ["G"],
            "G": ["F"]}

is_connected, connected_vertices = check_connection(vertexes)
print("The network is fully connected: {}".format(is_connected))
for i, group in enumerate(connected_vertices):
    print("Grouped vertices {}:".format(i+1))
    print(group)

The network is fully connected: False
Grouped vertices 1:
['A', 'B', 'C', 'D', 'E']
Grouped vertices 2:
['F', 'G']
