## Loading packages, constants, and functions

In [1]:
import functions as f
import constants as c
import classes as cl

import pandas as pd
import os, joblib, time

from netwulf import visualize
print(f"Current directories in {c.cwd}\n {os.listdir()}")

long = c.cwd + "\\raw_data\\long_alzheimers.tsv"
short = c.cwd + "\\raw_data\\short_alzheimers.tsv"

Current directories in c:\Users\tobia\Vs_code\02807 Computational Tools for Data Science\02807_project_Group13\
 ['.git', '.gitignore', 'classes.py', 'constants.py', 'functions.py', 'joblib_vars', 'Notebook.ipynb', 'OldCode.ipynb', 'raw_data', 'README.md', '__pycache__']


## Loading Datasets

#### Creating id_prot and saving
id_prot is a dictionary meant to point a id <integer> to a protein name <string> with 19 str characters \
id_prot is saved using joblib.dump() in dir "joblib_vars", and can be reloaded as a dict object using joblib.load(path)

In [None]:
chunksize = 10
info_df = pd.read_csv("https://stringdb-downloads.org/download/protein.info.v12.0/9606.protein.info.v12.0.txt.gz", compression='gzip', sep="\t")
info_df = info_df[["#string_protein_id"]].values
id_prot = dict()
for c, e in enumerate(info_df):
    id_prot[c] = e[0]
print(id_prot)
joblib.dump(id_prot, "./joblib_vars/id_prot.joblib")

#### Modifying raw datasets
modified datasets are stored in "/mod_data/" for easy retrieval

In [None]:
"""
An example of an interaction dataset loaded
"""
alz_int_df = pd.read_csv("./raw_data/long_alzheimers.tsv", sep="\t")
alz_int_df.columns = ["Prot1_ShortName", "Prot2_ShortName", "protein1", "protein2", "col1", "col2", "col3", "col4", "col5", "col6", "col7", "col8", "col9"]
alz_int_df

In [None]:
HS_int_df = pd.read_csv("https://stringdb-downloads.org/download/protein.links.detailed.v12.0/9606.protein.links.detailed.v12.0.txt.gz", compression="gzip", sep=" ")
HS_int_df = HS_int_df[HS_int_df["experimental"] > 0] #removing interactions with no experimental relevance 
#joblib.dump(HS_int_df, "./joblib_vars/HS_int_df") #FILE TOO LARGE 
HS_int_df


Creating HS_int_simple, containing protein1 and protein2 names converted to int using id_prot. \
If all interactions are weight=1, only the names (ids) of the proteins are needed when constructing the interaction network

In [None]:
id_prot = joblib.load("./joblib_vars/id_prot.joblib")

#Swapping key value in dict
id_swap = dict()
for k, v in id_prot.items():
    id_swap[str(v)] = k
print(id_swap)

#exporting modified HS_int
HS_int_simple = HS_int_df.map(lambda x: id_swap[x] if x in id_swap else x).reset_index()
HS_int_simple = HS_int_simple[["protein1", "protein2"]]
joblib.dump(HS_int_simple, "./joblib_vars/HS_int_simple.joblib")


## Running the class

### Setting the color of the progress bar

In [2]:
%%html
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

### Running the class

In [6]:
a = cl.interaction_network()
a.create_encoding_dict(testDataset=False)
a.load_data(testDataset=False)

Creating encoding table
Fetching data
Cropping data


Parsing data:   0%|          | 0/5847852 [00:00<?, ?it/s]

Data loaded in var .vertices, with size: 0.58992mb


In [None]:
#Example code for the format of a.vertices
for root in a.vertices.keys():
    print("root", "neighbor", "norm_score")
    for neighbor in a.vertices[root].keys():
        print(root, neighbor, a.vertices[root][neighbor])
    break

In [None]:
a.construct_graph(a.vertices)
#visualize(a.graph_network) #DANGER: large visualization - Returns html graph, aka. opens up visualization in browser.

In [None]:
# Example usage of shortest_path
vertexes = {"A": ["B", "C"],
            "B": ["A", "C", "D"],
            "C": ["A", "B"],
            "D": ["B", "E"],
            "E": ["D"]}

# Make sure you reference the correct function name
#for start_end in (paths := f.shortest_path("A", vertexes)):
#    print(start_end, paths[start_end], sep = ":   ")

In [7]:
f.shortest_path(1, a.vertices, timed=True)

Time: 0ns


{'1->1414': '1_1414',
 '1->7280': '1_7280',
 '1->9082': '1_9082',
 '1->13148': '1_13148',
 '1->15992': '1_15992',
 '1->5476': '1_5476',
 '1->340': '1_340',
 '1->191': '1_191',
 '1->11991': '1_11991',
 '1->4406': '1_4406',
 '1->1843': '1_1843',
 '1->16482': '1_16482',
 '1->15610': '1_15610',
 '1->7647': '1_7647',
 '1->4565': '1_4565',
 '1->5642': '1_5642',
 '1->1931': '1_1931',
 '1->14504': '1_14504',
 '1->6248': '1_6248',
 '1->3792': '1_3792',
 '1->4247': '1_4247',
 '1->8498': '1_8498',
 '1->12394': '1_12394',
 '1->2459': '1_2459',
 '1->18665': '1_18665',
 '1->5887': '1_5887',
 '1->14911': '1_14911',
 '1->9393': '1_9393',
 '1->19318': '1_19318',
 '1->12375': '1_12375',
 '1->15588': '1_15588',
 '1->9881': '1_9881',
 '1->16315': '1_16315',
 '1->5585': '1_5585',
 '1->9883': '1_9883',
 '1->18706': '1_18706',
 '1->17673': '1_17673',
 '1->624': '1_624',
 '1->2162': '1_2162',
 '1->2602': '1_1414_2602',
 '1->7169': '1_1414_7169',
 '1->5722': '1_1414_5722',
 '1->8600': '1_1414_8600',
 '1->14374

In [None]:
print(a.vertices.keys())
print(len(a.vertices))