# Super organised modular notebook


## Imports and installation

You can install the packages needed using the following command `pip install -r requirements.txt`  where the file contains the following:

<div class="alert alert-block alert-info">

aiohttp==3.9.5\
asyncio==3.4.3\
igraph\
requests\
py4cytoscape==1.9.0\
pandas\
tqdm==4.66.2\
numpy
</div>

Please note that Cytoscape has to be installed and open on your machine for visualisation to work.

In [179]:
# General packages
import pandas as pd         
import os
from tqdm import tqdm       
import pickle
import numpy as np
from datetime import datetime, timedelta

# define format, and set parameters for time, TODO: rm?  
date_format = "%Y_%m_%d_%H_%M" 
current_date = datetime.now().strftime(date_format)
last_week_date = (datetime.now() - timedelta(days=7)).strftime(date_format)



# Project specific packages
import aiohttp              # Used for aggregating requests into single session
import asyncio              # -"-
import nest_asyncio         # For jupyter asyncio compatibility 
nest_asyncio.apply()        # Automatically takes into account how jupyter handles running event loops

#import jsonpath_ng.ext      # More efficient json processing TODO: look into if actually computationally more efficient 
import igraph               # Used to create te citationa graph 
import requests             # For single API requests 

# Visualisation 
import py4cytoscape as p4c  # Cytoscape for visualisation of the citaton graph, 
                            # note that Cytoscape has to be installed and open on your machine for visualisation to work

Check that the packages are installed and functional 

In [180]:
print("igraph version:", igraph.__version__)
try: 
    dir(p4c)
    p4c.cytoscape_ping() 
    print("Cytoscape version:",p4c.cytoscape_version_info()['cytoscapeVersion'])    
except:
    print ("Make sure to have Cytoscape installed and open (or don't, if you don't care about the visualisation)!")


igraph version: 0.11.4
You are connected to Cytoscape!
Cytoscape version: 3.10.2


## Downloading data 

In [181]:
# format: 
"""

    Function description 
    
    Parameters
    ----------
    name : type
        description
"""

'\n\n    Function description \n    \n    Parameters\n    ----------\n    name : type\n        description\n'

In [182]:
async def fetch_biotools_page(session, url):
    """ 
    Sync the bio.tools (page) requests so they are all made in a single session 

    Parameters
    ----------
    session : aiohttp.client.ClientSession object
        session object for package aiohttp
    url : str
        url for request
    """
    
    async with session.get(url) as response:
        return await response.json()

In [183]:
async def get_biotools_metadata(topicID="topic_0121"):  # TODO: I removed format. Check if there is any reason to have it 
                                                        # TODO: should add parameter for optional forced retrieval - even if csv file, still recreate it 
                                                        # TODO: Currently no timing - add tracker
    """
    Fetches metadata about tools from bio.tools, belonging to a given topicID and returns as a dataframe.
    If a CSV file already exists load the dataframe from it. 

    Parameters
    ----------
    topicID : str TODO: make this a int instead? why am I writing topic? 
        The ID to which the tools belongs to, ex. "Proteomics" or "DNA" as defined by 
        EDAM ontology (visualisation: https://edamontology.github.io/edam-browser/#topic_0003)

    
    """

    date_format = "%Y%m%d"

    # Define the CSV filename
    csv_filename = f'biotools_metadata_{topicID}_{datetime.now().strftime(date_format)}.csv' 

    # Check if the file exists and if it's older than a week
    if os.path.isfile(csv_filename):
        file_date = datetime.strptime(csv_filename.split('_')[-1].split('.')[0], date_format)
        if file_date < datetime.now() - timedelta(days=7):
            print("Old datafile. Updating...")
        else:
            print("Bio.tools data loaded from existing CSV file.")
            df = pd.read_csv(csv_filename)
            return df
    else:
        print("No existing bio.tools CSV file. Downloading data.") 
    
    # TODO: should filepath/name be allowed to be configurable?
    # then the following could be a separate function called by this one, or is this very inefficient?
    # TODO: should place files created in a folder named for each run

    all_tool_data = [] # TODO: predefine the length, means one more request 

    # start at page 1 
    page = 1 

    # requests are made during single session
    async with aiohttp.ClientSession() as session: 
        while page:

            # send request for tools on the page, await further requests 
            biotools_url = f'https://bio.tools/api/t?topicID=%22{topicID}%22&format=json&page={page}'
            biotool_data = await fetch_biotools_page(session, biotools_url)
            

            # TODO: Do I need to check? what happens if no response for page == 1? Maybe try/except instead
            # Checking if there are any tools, if 
            if 'list' in biotool_data: 
                biotools_lst = biotool_data['list']


                for tool in biotools_lst:
                    name = tool.get('name') 
                    publication = tool.get('publication')
                    topic = tool.get('topic')
                    # TODO: decide whether we even want this information since it would only be used for visualisation and possibly teh multitopic graph 

                    # TODO: if no pmid, use doi converter 
                    # TODO: Download primary (using jasonpatg-ng), 
                    if name and publication and publication[0].get('pmid') and topic and topic[0].get('term'): 
                        all_tool_data.append({ #predefine, since max length == nr_tools, then need to define this earlier
                            'name': name,
                            'pmid': str(publication[0]['pmid']), # making sure they are all strings
                            'topic': topic[0]['term']
                        })

                page = biotool_data.get('next')
                if page: # else page will be None and loop will stop 
                    page = page.split('=')[-1] # only want the page number 
            else: 
                print(f'Error while fetching tool names from page {page}')
                break

    # Convert list of dictionaries to dataframe
    df = pd.DataFrame(all_tool_data)
    # Save dataframe to file
    df.to_csv(csv_filename, index=False)

    # If there were any pages, check how many tools were retrieved and how many tools had pmids
    if biotool_data: 
        nr_tools = int(biotool_data['count']) 
        print(f'Found {len(all_tool_data)} out of a total of {nr_tools} tools with PMIDS.')

    return df

In [184]:
def europepmc(article_id, format='JSON', source='MED', page=1, page_size=1000):   # TODO: replace own wrapper with recommendation? https://github.com/ML4LitS/CAPITAL/tree/main
                                                                                # TODO: call output="idlist" immidiately? then we have no metadata but we dont use that anyways!
    """ 
    Downloads pmids for the articles citing the given article_id, returns list of citation pmids (PubMed IDs)
        
    Parameters
    ----------
    article_id : str # TODO: int? 
        pmid, PubMed ID, for a given article.
    source: str
        source ID as given by the EuropePMC API documentation: https://europepmc.org/Help#contentsources 

    page, int, default == 1
        determines where to start looking TODO: remove this, why would you not start at 1? 

    pagesize, int, default 1000 max 1000
        determines number of results per page
    
    """ 

    # create a url with the given requirements according to the EuropePMC API synthax and query the API
    base_url = f'https://www.ebi.ac.uk/europepmc/webservices/rest/{source}/{article_id}/citations?page={page}&pageSize={page_size}&format={format}'
    result = requests.get(base_url)

    # Return all citations, given the query was accepted
    # TODO: jsonpath-ng
    if result.ok:
        return result.json()['citationList']['citation']
    else:
        print('Something went wrong') # TODO: better error message. Try/except? 


## Graph generation

The function reate_citation_network can be called to create or load the cocitation network

In [185]:
#TODO: all of the descriptions - Where do I put default value? 

def create_citation_network(topicID="topic_0121", testSize=None, randomSeed=42, loadData=True, filePath='', saveFiles=True): # TODO: I just threw  code into this function- improve
    
    """
    Creates a citation network given a topic and returns a graph and the tools included in the graph

        
    Parameters
    ----------
    topicID : str, default "topic_0121" (proteomics) TODO: int? 
        The ID to which the tools belongs to, ex. "Proteomics" or "DNA" as defined by 
        EDAM ontology (visualisation: https://edamontology.github.io/edam-browser/#topic_0003)

    testSize : int or None, default None
        Determines the number of tools included in the citation graph.

    randomSeed : int, default 42
        Specifies what seed is used to randomly pick tools in a test run. 
    
    loadData : Boolean, default True
        Determines if already generated graph is loaded or if it is recreated.
    
    filePath : str  TODO: add filepath 
        Path to already generated graph

    saveFiles : Boolean, default True
        Determines if newly generated graph is saved. 

    """
    # Retrieve the data 
    # run the asynchronous function for single session requests 
    result = asyncio.run(get_biotools_metadata(topicID=topicID)) 
    pmids = result['pmid'].tolist() # should I use numpy for all my lists? 

    # Randomly picks out a subset of the pmids
    if testSize:   
        print(f"Creating test-cocitation network of size {testSize}. Random seed is {randomSeed}.")
        np.random.seed(randomSeed)
        pmids = np.random.choice(pmids,testSize)

    
    # Edge creation 
    # Load previously created data or recreate it
    if loadData: # TODO: pickle mayebe is not the way to go in future? 
        
        if os.path.isfile('edges{testSize}.pkl') and os.path.isfile('graph{testSize}.pkl') and os.path.isfile('included_tools{testSize}.pkl'): # should give option to specify these names
            print("Loading data")
            with open(f'edges{testSize}.pkl', 'rb') as f:
                unq_edges = pickle.load(f) # should be unique ones right 
            with open(f'graph{testSize}.pkl', 'rb') as f:
                G = pickle.load(f) 
            with open(f'included_tools{testSize}.pkl', 'rb') as f:
                included_tools = pickle.load(f) 
        else:
            print(f"Files not found. Please check that 'edges{testSize}.pkl', 'graph{testSize}.pkl' and 'included_tools{testSize}.pkl' are in your current directory and run again. Or set loadData = False, to create the files. ")
            return 
   
    else:
        # edge creation using europepmc
        print("Downloading citation data from Europepmc.")
        
        # this is to create a list of the tools that actually had citations, otherwise they are not included in the graph. 
        included_tools = []  # TODO: is there a smarter way of generating the included_tools list? 
        edges = []

        # Get citations for each tool, and generate edges between them. 
        for pmid in tqdm(pmids, desc="Processing PMIDs"): 
            pmid = str(pmid) # EuropePMC requires str            
    
            citations = europepmc(pmid, page_size=1000)
            for citation in citations:
                edges.append((pmid, str(citation['id']))) # TODO: this is the wring way around? shoudl be citation to pmid, no? 
                if pmid not in included_tools:
                    included_tools.append(pmid) 
        
        print("Creating citation graph using igraph.")
        
        # Finding unique edges by converting list to a set (because tuples are hashable) and back to list.
        # TODO: maybe not super efficient? 
        unq_edges =  list(set(edges)) 
        print(f"{len(unq_edges)} unique out of {len(edges)} edges total!")

        # Creating a directed graph with unique edges
        G = igraph.Graph.TupleList(unq_edges, directed=True)

        # TODO: its starting to get messy man
        # Removing disconnected vertices (that are not tools) that do not have information value for the (current) metric
        print("Removing citations with degree less or equal to 1 (Non co-citations).")
        vertices_to_remove = [v.index for v in G.vs if v.degree() <= 1 and v['name'] not in included_tools] 
        G.delete_vertices(vertices_to_remove)
        vertices_to_remove = [v.index for v in G.vs if v.degree() == 0 ] # second run to remove the copletely detatched ones after first run sicne they wont give info anyways. 
        G.delete_vertices(vertices_to_remove) # This will remove isolated tools as well 

        # Updating included_tools to only contain lists that are in the graph  
        included_tools = [tool for tool in included_tools if tool in G.vs['name']] # TODO: since I am doing this anyways I could do it with pmids directly and not generate the included_tools list earier?


        # Saving edges, graph and tools included in the graph 
        if saveFiles:
            print(f"Saving data to 'edges{testSize}.pkl', 'graph{testSize}.pkl' and 'included_tools{testSize}.pkl'.") # sould make these filenames dynamic
            # and save them 
            #Do this nicer later? 
            with open(f'edges{testSize}.pkl', 'wb') as f:
                pickle.dump(unq_edges, f)

            with open(f'graph{testSize}.pkl', 'wb') as f:
                pickle.dump(G, f)

            with open(f'included_tools{testSize}.pkl', 'wb') as f:
                pickle.dump(included_tools, f)    

    # returns a graph and the pmids of the tools included in the graph (tools connected by cocitations)
    return G, included_tools 


## The metric

Create the cocitation network specifying if you want to load existing data, run on a smaller test set etc.  

In [186]:
G, included_tools = create_citation_network(testSize=100, loadData=False)


Bio.tools data loaded from existing CSV file.
Creating test-cocitation network of size 100. Random seed is 42.
Downloading citation data from Europepmc.


Processing PMIDs: 100%|██████████| 100/100 [00:30<00:00,  3.27it/s]

Creating citation graph using igraph.
5651 unique out of 6703 edges total!
Removing citations with degree less or equal to 1 (Non co-citations).
Saving data to 'edges100.pkl', 'graph100.pkl' and 'included_tools100.pkl'.





### Download workflow data


<div class="alert alert-block alert-warning">
<b>OBS:</b> This is not yet implemented. Currently drawing random tools to simulate workflows
</div>

In [187]:
# TODO: download workflows

# TODO: improve randomisation to have sequential networks

# number of edges in the workflow
num_pairs = 3

workflow_pairs = []
while len(workflow_pairs) < num_pairs:
    article1 = np.random.choice(included_tools)
    article2 = np.random.choice(included_tools)
    if article1 != article2:  # Ensure article1 and article2 are different
        workflow_pairs.append((article1, article2))

workflow_tools = np.unique([element for tuple in workflow_pairs for element in tuple])
print( "Tools in pseudo WF:", workflow_tools)
# Print the generated pairs
print("Generated workflow pairs (WF edges):")
for pair in workflow_pairs:
    print(pair)

Tools in pseudo WF: ['12403597' '23051804' '26335203' '26510693' '34395100' '36414245']
Generated workflow pairs (WF edges):
('26510693', '12403597')
('36414245', '23051804')
('34395100', '26335203')


### Calculate metric

<div class="alert alert-block alert-warning">
<b>OBS:</b> This is a simple placeholder implementation of a metric
</div>

In [188]:
def comet(graph, workflows): # cocitation metric 

    """
    Calculates the cocitation  metric for a given workflow and a given cocitation graph
    
    Parameters
    ----------
    graph : igraph.Graph
        Graph generated by igraph
    workflow, list of tuples with pairs of strings TODO: how do I write this? does it matter I wont have this format later anyways? 
        List of tuples of strings corresponding to the edges in the workflow. 

    """
    # List to  collect pairwise scores
    score_list = [] # TODO: can predefine the list length, does not matter this is temporary?

    for pair in workflows:
        cocite_score = 0
        neighbors_of_first = set(graph.neighbors(pair[0]))
        neighbors_of_second = set(graph.neighbors(pair[1]))

        # Count number of common neighbours
        common_neighbors = neighbors_of_first.intersection(neighbors_of_second)
        cocite_score = len(common_neighbors)
        score_list.append(cocite_score)

    # Then sum the scores or perform any other desired calculation
    # now normalising by WF length
    # maybe call this one "support", since that is basically what we have. 

    return sum(score_list)/len(score_list), score_list


In [189]:
metric_score, raw_results = comet(G, workflow_pairs)

print(raw_results)

print(f"Total cocitation score for workflow {workflow_pairs} is {metric_score}.")


[1, 0, 0]
Total cocitation score for workflow [('26510693', '12403597'), ('36414245', '23051804'), ('34395100', '26335203')] is 0.3333333333333333.


### Evaluate results 

## Visualisation


Using cytoscape to visualise the network (Alma add colouring of the workflow too)

In [190]:

import time # TODO: i switched to datetime, fix this later 
t = time.localtime()
current_time = time.strftime("%Y_%m_%d_%H_%M", t)


p4c.create_network_from_igraph(G, f"Cocitations_Size{len(included_tools)}_{current_time}")


print("Styling graph")
p4c.set_node_shape_default("ELLIPSE")
p4c.set_node_width_default(30)
p4c.set_node_height_default(30)
p4c.set_node_border_color_default("#000000")  # Black color in hexadecimal
p4c.set_node_border_width_default(1)
p4c.set_node_color_bypass(included_tools, "#FF0000")  # Red color in hexadecimal
p4c.set_node_size_bypass(included_tools, 100)  # remember it cannot be a np.array, just a list argh 

# colour the tools in the workflow 
p4c.set_node_color_bypass(list(workflow_tools), "#2F739A")  # they should be gradually coloured later maybe, sequentially



Applying default style...
Applying preferred layout
Styling graph
style_name not specified, so updating "default" style.


style_name not specified, so updating "default" style.
style_name not specified, so updating "default" style.
style_name not specified, so updating "default" style.
style_name not specified, so updating "default" style.
style_name not specified, so updating "default" style.
style_name not specified, so updating "default" style.


''