In [None]:
import config
import time

import pandas as pd
import matplotlib.pyplot as plt
import dataframe_image as dfi
import numpy as np

from neo4j import GraphDatabase

In [None]:
# Open the Neo4j driver; URI and credentials are read from the `config` module.
driver = GraphDatabase.driver(
    config.uri,
    auth=(config.user, config.password),
    encrypted=False,
)

# Check whether the connection works
def check_connectivity(driver) -> bool:
    """
    Check whether the database connection works by running a trivial query.

    :param driver: Object whose ``session()`` context manager is used to run
                   ``RETURN 1 as result``.
    :return: True if the query yields a single record with ``result == 1``;
             False otherwise, including when any exception is raised.
    """
    try:
        with driver.session() as session:
            record = session.run("RETURN 1 as result").single()
            if record and record["result"] == 1:
                return True
            print('False')
            return False
    except Exception as e:
        # Report the cause instead of discarding it, so a failed connection
        # can be diagnosed; the return value is False exactly as before.
        print(f"Connectivity check failed: {e!r}")
        return False


# Report the outcome of the connectivity check in the cell output.
print(
    "Verbindung erfolgreich hergestellt."
    if check_connectivity(driver)
    else "Fehler bei der Verbindungsherstellung."
)

# Graph Database Algorithm

## Projection

In [None]:
def drop_projection() -> None:
    """
    Drop the 'gene_protein_graph' projection from the GDS graph catalog.

    Uses the module-level ``driver``. The query result is consumed before the
    confirmation is printed, so a failing call raises here instead of after
    "Projection dropped." has already been shown.
    """
    with driver.session() as session:
        drop_projection_query = "CALL gds.graph.drop('gene_protein_graph')"

        # session.run() fetches its result lazily; consume() forces the call
        # to complete (and surface any server-side error) before the success
        # message is printed.
        session.run(drop_projection_query).consume()
        print("Projection dropped.")

def create_projection() -> None:
    """
    Create a projection of the graph for the gene and protein nodes.

    Projects the node labels 'gene' and 'protein' together with the
    relationship types 'interaction' and 'connection' (both UNDIRECTED) into
    the GDS catalog under the name 'gene_protein_graph'. Uses the module-level
    ``driver``. The query result is consumed before the confirmation is
    printed, so a failing projection raises before any success message.
    """
    with driver.session() as session:
        projection_query = """
        CALL gds.graph.project(
            'gene_protein_graph',
            {
                gene: {
                    label: 'gene'
                },
                protein: {
                    label: 'protein'
                }
            },
            {
                interaction: {
                    type: 'interaction',
                    orientation: 'UNDIRECTED'
                },
                connection: {
                    type: 'connection',
                    orientation: 'UNDIRECTED'
                }
            }
        )
        """

        # session.run() fetches its result lazily; consume() forces the
        # projection to finish (and surface any server-side error) before the
        # success message is printed.
        session.run(projection_query).consume()
        print("Graph projection created.")

# The drop is left disabled; enable it when re-running this cell so the
# previous 'gene_protein_graph' projection is removed before projecting again.
# NOTE(review): create_projection() presumably fails if a graph with that name
# already exists in the GDS catalog — confirm.
# drop_projection()
create_projection()

## Helper Functions

In [None]:
def drop_protein_nodes(df) -> pd.DataFrame:
    """
    Drop the protein nodes from the dataframe.

    A row counts as a protein node when its "Gene_ID" contains the substring
    "ENSP". Rows whose "Gene_ID" is missing are kept.

    :param df: The dataframe with the protein nodes; needs a "Gene_ID" column.
    :return: A new dataframe (copy) without the protein rows. The original
             index labels are preserved and ``df`` itself is not modified.
    """
    # na=False: a missing ID would otherwise put NaN into the mask and make the
    # `~` inversion fail. regex=False: "ENSP" is a literal substring.
    is_protein = df["Gene_ID"].str.contains("ENSP", na=False, regex=False)
    # Return an independent copy so callers can assign columns on the result
    # without pandas chained-assignment warnings.
    return df[~is_protein].copy()

## PageRank

In [None]:
def run_pagerank() -> pd.DataFrame:
    """
    Run PageRank on the 'gene_protein_graph' projection and return the gene rows.

    Streams the scores with ``gds.pageRank.stream`` through the module-level
    ``driver``, removes the protein nodes with ``drop_protein_nodes`` and
    rounds the score to three decimals.

    :return: Dataframe with the columns Gene_ID, Gene_Name, score, Δ_TPM and
             Δ_TPM_relevant, in the order returned by the query
             (``ORDER BY score DESC``).
    """
    with driver.session() as session:
        query = """
        CALL gds.pageRank.stream('gene_protein_graph')
        YIELD nodeId, score
        RETURN gds.util.asNode(nodeId).id AS Gene_ID,
               gds.util.asNode(nodeId).gene_name AS Gene_Name,
               score,
               gds.util.asNode(nodeId).Δ_TPM AS Δ_TPM,
               gds.util.asNode(nodeId).Δ_TPM_relevant AS Δ_TPM_relevant
        ORDER BY score DESC
        """

        result = session.run(query)

        # Materialise all records while the session is still open.
        df_results = pd.DataFrame([dict(record) for record in result], columns=result.keys())

    df_genes = drop_protein_nodes(df_results)

    # assign() builds a new frame instead of writing into the filtered slice,
    # which avoids pandas' SettingWithCopyWarning; the column order is kept.
    return df_genes.assign(score=df_genes["score"].round(3))

# Time the complete PageRank run (query plus dataframe construction).
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() differences can.
start = time.perf_counter()

df_pagerank = run_pagerank()

end = time.perf_counter()
print(f"Time taken: {round(end - start, 0)} seconds.")

In [None]:
# Preview the first 100 rows of the PageRank result. run_pagerank() orders the
# rows by descending score and removes the protein nodes, so these are the
# highest-ranked gene nodes.
print("Only genes nodes:")
df_pagerank.head(100)

In [None]:
# Distribution of the PageRank scores over all gene nodes.
fig, ax = plt.subplots()
ax.hist(df_pagerank["score"], bins=100)
ax.set_xlabel("Pagerank Score")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of Pagerank Scores for genes")

# Write the figure to disk before showing it.
fig.savefig('../tex/figures/04_hist_pagerank.png')
plt.show()

### Relevant Genes

In [None]:
print("Only genes with cancer active nodes:")
# Select the rows flagged as relevant with a boolean mask. The earlier
# where(...).dropna() first blanked every non-matching row to NaN, which
# upcasts column dtypes (e.g. the boolean flag column) as a side effect.
is_relevant = df_pagerank["Δ_TPM_relevant"] == True
# dropna() is kept so the selected rows are the same as before: relevant genes
# with any missing attribute are still excluded.
df_pagerank_relevant = df_pagerank[is_relevant].dropna()

# save the first 10 rows of the dataframe as an image
dfi.export(df_pagerank_relevant.reset_index(drop=True).head(10), '../tex/figures/03_03_df_pagerank_relevant.png')

df_pagerank_relevant.head(10)

In [None]:
plt.hist(df_pagerank_relevant["score"], bins=100)
plt.xlabel("Pagerank Score")
plt.ylabel("Frequency")
plt.title("Histogram of Pagerank Scores for relevant genes")

# Highlight the score range covered by the first 10 rows (the top 10 genes,
# since the PageRank query orders by descending score).
top10_scores = df_pagerank_relevant["score"].head(10)
# Deliberately not named `min` / `max`: assigning those names at notebook level
# shadows the built-ins for every later cell in the kernel session.
top10_min = top10_scores.min()
top10_max = top10_scores.max()
plt.axvspan(top10_min, top10_max, color='r', alpha=0.2)

plt.savefig('../tex/figures/04_hist_pagerank_relevant.png')
plt.show()

In [None]:
# Summary statistics of the PageRank scores of the relevant genes
relevant_scores = df_pagerank_relevant["score"]
percentile_labels = [
    "25th Percentile",
    "50th Percentile (Median)",
    "75th Percentile",
    "90th Percentile",
    "95th Percentile",
    "99th Percentile",
]

mean_score = relevant_scores.mean()
median_score = relevant_scores.median()
percentiles = np.percentile(relevant_scores, [25, 50, 75, 90, 95, 99])
max_score = relevant_scores.max()

# Print the results
print(f"Mean of Pagerank Scores: {mean_score:.2f}")
print(f"Median of Pagerank Scores: {median_score:.2f}")
for label, percentile_value in zip(percentile_labels, percentiles):
    print(f"{label}: {percentile_value:.2f}")
print(f"Maximum Pagerank Score: {max_score:.2f}")

In [None]:
# lowest Pagerank score - gene with a single protein
# NOTE(review): 0.151 is hard-coded (presumably read off an earlier run) —
# confirm it is still the minimum whenever the graph or PageRank settings change.
lowest_score = 0.151
# Compare with a tolerance instead of `==`: exact float equality only matched
# because run_pagerank() rounds the scores to three decimals.
lowest = df_pagerank_relevant[np.isclose(df_pagerank_relevant["score"], lowest_score)]
print(f"From the {df_pagerank_relevant.shape[0]} relevant genes, {lowest.shape[0]} have a minimum pagerank score of {lowest_score}.")

lowest

In [None]:
# Load the gene node table and derive the Δ TPM limits used by the plots below.
df_nodes_genes = pd.read_csv('../processed_data/nodes_genes.csv')
relevant_genes = df_nodes_genes.loc[df_nodes_genes['Δ TPM relevant'] == True]

all_delta = df_nodes_genes['Δ TPM']
relevant_delta = relevant_genes['Δ TPM']

# Relevant Δ TPM closest to zero on the negative and on the positive side.
min_significant = relevant_delta[relevant_delta < 0].max()
max_significant = relevant_delta[relevant_delta > 0].min()

# Overall Δ TPM range across all genes.
min_delta = all_delta.min()
max_delta = all_delta.max()

### Top 10 Genes

In [None]:
counts, bins, patches = plt.hist(df_nodes_genes['Δ TPM'], bins=100)
plt.title('Histogram of Δ TPM')
plt.xlabel('Δ TPM Values')
plt.ylabel('Frequency')

# highlight the relevant genes
plt.axvspan(min_delta, min_significant, color='r', alpha=0.2)
plt.axvspan(max_significant, max_delta, color='r', alpha=0.2)

# highlight the bins that contain the first 10 relevant genes (top 10 by PageRank score)
highlight_values = df_pagerank_relevant["Δ_TPM"].head(10)
highlight_array = highlight_values.to_numpy(dtype=float)
# Keep only values inside the histogram range; the comparison also drops NaN.
highlight_array = highlight_array[(highlight_array >= bins[0]) & (highlight_array <= bins[-1])]
# searchsorted(..., side="right") - 1 maps a value to its half-open bin
# [bins[i], bins[i + 1]). The clip puts a value equal to the last edge into the
# final bin, which the histogram treats as closed on the right; the earlier
# `bins[i] <= value < bins[i + 1]` scan never matched that maximum value.
bin_indices = np.clip(np.searchsorted(bins, highlight_array, side="right") - 1, 0, len(patches) - 1)
for bin_index in np.unique(bin_indices):
    patches[bin_index].set_facecolor('red')


plt.savefig('../tex/figures/04_delta_tpm_relevant.png')
plt.show()

### Known Cancer Genes

In [None]:
# Known cancer genes to look up in the PageRank results:
# EGFR, KRAS, MET, LKB1 (listed as STK11), BRAF, PIK3CA, ALK, RET, ROS1
cancer_known_genes = [
    "ALK", "BRAF", "EGFR", "KRAS", "MET", "PIK3CA", "RET", "ROS1", "STK11",
]

# Filter by gene name, then order alphabetically with a fresh 0..n-1 index.
is_known_gene = df_pagerank["Gene_Name"].isin(cancer_known_genes)
df_known_genes = df_pagerank.loc[is_known_gene]
df_known_genes = df_known_genes.sort_values("Gene_Name")
df_known_genes = df_known_genes.reset_index(drop=True)

# save the table as an image
dfi.export(df_known_genes, '../tex/figures/05_01_df_known_genes.png')

df_known_genes

In [None]:
counts, bins, patches = plt.hist(df_pagerank_relevant["score"], bins=100)
plt.xlabel("Pagerank Score")
plt.ylabel("Frequency")
plt.title("Histogram of Pagerank Scores for relevant genes")

# Highlight the score range covered by the first 10 rows (the top 10 genes,
# since the PageRank query orders by descending score).
top10_scores = df_pagerank_relevant["score"].head(10)
# Deliberately not named `min` / `max`: assigning those names at notebook level
# shadows the built-ins for every later cell in the kernel session.
top10_min = top10_scores.min()
top10_max = top10_scores.max()
plt.axvspan(top10_min, top10_max, color='r', alpha=0.2)

# highlight the bins that contain the scores of the known genes
highlight_values = df_known_genes["score"]
highlight_array = highlight_values.to_numpy(dtype=float)
# Keep only scores inside the histogram range; the comparison also drops NaN.
highlight_array = highlight_array[(highlight_array >= bins[0]) & (highlight_array <= bins[-1])]
# searchsorted(..., side="right") - 1 maps a value to its half-open bin
# [bins[i], bins[i + 1]). The clip puts a value equal to the last edge into the
# final bin, which the histogram treats as closed on the right; the earlier
# `bins[i] <= value < bins[i + 1]` scan never matched that maximum value.
bin_indices = np.clip(np.searchsorted(bins, highlight_array, side="right") - 1, 0, len(patches) - 1)
for bin_index in np.unique(bin_indices):
    patches[bin_index].set_facecolor('red')


plt.savefig('../tex/figures/05_01_pagerank_known_genes.png')
plt.show()

In [None]:
counts, bins, patches = plt.hist(df_nodes_genes['Δ TPM'], bins=100)
plt.title('Histogram of Δ TPM')
plt.xlabel('Δ TPM Values')
plt.ylabel('Frequency')

# highlight the relevant genes
plt.axvspan(min_delta, min_significant, color='r', alpha=0.2)
plt.axvspan(max_significant, max_delta, color='r', alpha=0.2)

# highlight the bins that contain the Δ TPM values of the known genes
highlight_values = df_known_genes["Δ_TPM"]
highlight_array = highlight_values.to_numpy(dtype=float)
# Keep only values inside the histogram range; the comparison also drops NaN.
highlight_array = highlight_array[(highlight_array >= bins[0]) & (highlight_array <= bins[-1])]
# searchsorted(..., side="right") - 1 maps a value to its half-open bin
# [bins[i], bins[i + 1]). The clip puts a value equal to the last edge into the
# final bin, which the histogram treats as closed on the right; the earlier
# `bins[i] <= value < bins[i + 1]` scan never matched that maximum value.
bin_indices = np.clip(np.searchsorted(bins, highlight_array, side="right") - 1, 0, len(patches) - 1)
for bin_index in np.unique(bin_indices):
    patches[bin_index].set_facecolor('red')


plt.savefig('../tex/figures/05_01_delta_tpm_relevant.png')
plt.show()