In [None]:
import gzip
import os
import shutil
import urllib.request

import dataframe_image as dfi
import matplotlib.pyplot as plt
import pandas as pd

## Import data

In [None]:
# Gene node table; its 'Gene ID' column is used further down to restrict the
# protein-gene edges to known genes.
df_nodes_genes = pd.read_csv(
    filepath_or_buffer='../processed_data/nodes_genes.csv',
)

# Edges: Protein-Gene Connection
File from: https://www.ensembl.org/biomart/martview/

Ensembl Genes 112 > Human Genes (GRCh38.p14)

Each protein-gene connection carries the following information:
* Gene ID
* Protein ID

In [None]:
# Load the tab-separated BioMart export and switch to the shorter column names
# used in the rest of this notebook.
biomart_columns = {
    'Gene stable ID': 'Gene ID',
    'Protein stable ID': 'Protein ID',
}
df_edges_protein_gene = pd.read_csv(
    '../import_data/ENSEMBLE/biomart_gene_protein.txt',
    sep='\t',
).rename(columns=biomart_columns)

print(f"There are {len(df_edges_protein_gene)} rows in the imported dataset.")

In [None]:
# Discard rows with any missing value, i.e. edges lacking a gene or a protein.
complete_edges = df_edges_protein_gene.dropna()

# Keep only edges whose gene is present in the gene node table.
known_gene_mask = complete_edges['Gene ID'].isin(df_nodes_genes['Gene ID'])
df_edges_protein_gene = complete_edges[known_gene_mask]

print(f"There are {len(df_edges_protein_gene)} rows in the filtered dataset.")
df_edges_protein_gene.head(10)

# Edges: Protein-Protein Interaction
Loaded from the STRING database.

Each protein-protein interaction carries the following information:
* left Protein ID
* right Protein ID

In [None]:
# Download the STRING v12.0 "full links" table for taxon 9606 once and unpack
# it next to the archive. Each step is skipped when its output file already
# exists, so re-running the notebook does not hit the network again.
url_string = "https://stringdb-downloads.org/download/protein.links.full.v12.0/9606.protein.links.full.v12.0.txt.gz"

zip_file_name = "../import_data/STRING/protein.links.full.v12.0.txt.gz"

file_name = "../import_data/STRING/protein.links.full.v12.0.txt"

os.makedirs("../import_data/STRING", exist_ok=True)

if not os.path.exists(zip_file_name):
    # Download under a temporary name and rename afterwards: an interrupted
    # transfer must not leave a truncated archive that the existence check
    # above would treat as complete on the next run.
    partial_zip_name = zip_file_name + ".part"
    urllib.request.urlretrieve(url_string, partial_zip_name)
    os.replace(partial_zip_name, zip_file_name)

if not os.path.exists(file_name):
    # Same temporary-name approach for the unpacked file.
    partial_file_name = file_name + ".part"
    with gzip.open(zip_file_name, 'rb') as f_in, open(partial_file_name, 'wb') as f_out:
        # Stream in chunks instead of holding the whole decompressed table in memory.
        shutil.copyfileobj(f_in, f_out)
    os.replace(partial_file_name, file_name)

In [None]:
# Read only the first two columns of the space-separated STRING table.
df_edges_protein = pd.read_csv(file_name, sep=' ', usecols=[0, 1])

print('Length of STRING Links Dataset: ', len(df_edges_protein))

df_edges_protein = df_edges_protein.rename(
    columns={'protein1': 'left Protein ID', 'protein2': 'right Protein ID'}
)

# Keep the second dot-separated component of every identifier.
# NOTE(review): presumably this strips a leading taxon prefix such as "9606."
# so the IDs match the Ensembl protein IDs — confirm against the file.
for side in ('left Protein ID', 'right Protein ID'):
    df_edges_protein[side] = df_edges_protein[side].map(
        lambda protein_id: protein_id.split('.')[1]
    )

print(f"There are {len(df_edges_protein)} rows in the dataset.")
df_edges_protein.head()

### Filter for Proteins from Nodes
Filtering the edges to proteins from the node list would keep only direct connections between proteins that are each connected to a gene:

→ Gene - Protein - Protein - Gene


However, we also want to include proteins that are not themselves connected to a gene:

→ Gene - Protein - $Protein$ - Protein - Gene


In [None]:
# Disabled filter, kept for reference only: the code is wrapped in a string
# literal, so it is never executed.
# NOTE(review): it still refers to the columns 'protein1'/'protein2' (renamed to
# 'left Protein ID'/'right Protein ID' when the STRING table is loaded) and to
# df_nodes_protein, which is only built further down — update both before
# re-enabling.
"""df_edges_protein = df_edges_protein[df_edges_protein['protein1'].isin(df_nodes_protein)]
df_edges_protein = df_edges_protein[df_edges_protein['protein2'].isin(df_nodes_protein)]

df_edges_protein"""

In [None]:
# Number of STRING edges per protein. The table is grouped by the left protein
# ID, so each row of `edge_per_gene` describes a protein, not a gene; the
# variable name is kept unchanged so any later reference keeps working.
edge_per_gene = (
    df_edges_protein
    .groupby('left Protein ID')
    .count()
    .sort_values('right Protein ID', ascending=False)
)

# Histogram of the per-protein edge counts, labelled so the figure can be read
# on its own.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(edge_per_gene['right Protein ID'], bins=100)
ax.set_xlabel('Number of Edges')
ax.set_ylabel('Number of Proteins')
ax.set_title('Edges per Protein in the STRING Dataset')

plt.show()

# Nodes: Protein
Each protein node carries the following information:
* Protein ID


In [None]:
# Unique IDs of all proteins that appear in a protein-gene edge.
df_nodes_protein1 = (
    df_edges_protein_gene['Protein ID']
    .drop_duplicates()
    .reset_index(drop=True)
)

df_nodes_protein1

In [None]:
# Unique IDs of all proteins that occur on either side of a protein-protein edge.
both_sides = [df_edges_protein['left Protein ID'], df_edges_protein['right Protein ID']]
df_nodes_protein2 = (
    pd.concat(both_sides, ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
    .rename('Protein ID')  # give the stacked column the same name as df_nodes_protein1
)

df_nodes_protein2

In [None]:
# Union of the gene-linked and the interaction-linked protein IDs.
# The index is not reset after de-duplication, so it keeps gaps where
# duplicates were removed.
all_protein_ids = pd.concat([df_nodes_protein1, df_nodes_protein2], ignore_index=True)
df_nodes_protein = all_protein_ids.drop_duplicates()

df_nodes_protein

# Export Data

In [None]:
# Write each result table to its CSV file, omitting the row index.
csv_exports = {
    '../processed_data/nodes_protein.csv': df_nodes_protein,
    '../processed_data/edges_protein.csv': df_edges_protein,
    '../processed_data/edges_protein_gene.csv': df_edges_protein_gene,
}
for csv_path, table in csv_exports.items():
    table.to_csv(csv_path, index=False)

In [None]:
# Summary of the exported tables, one line per table.
print(
    f"There are {len(df_nodes_protein)} Protein Nodes",
    f"There are {len(df_edges_protein)} Protein-Protein Edges",
    f"There are {len(df_edges_protein_gene)} Protein-Gene Edges",
    sep="\n",
)

In [None]:
# Re-indexed (0..n-1) versions of the three tables for display and figure export.
df_edges_protein_plot, df_edges_protein_gene_plot, df_nodes_protein_plot = (
    table.reset_index(drop=True)
    for table in (df_edges_protein, df_edges_protein_gene, df_nodes_protein)
)

df_nodes_protein_plot
# Shown inline only: per the original note, the export library cannot render a
# single column.

In [None]:
# Render the first five rows of each edge table as PNG images.
# Create the target folder first (same pattern as the STRING import folder) so
# the export does not fail when the directory does not exist yet.
os.makedirs('../tex/figures', exist_ok=True)

dfi.export(df_edges_protein_plot.head(5), '../tex/figures/03_02_protein_edges.png')
dfi.export(df_edges_protein_gene_plot.head(5), '../tex/figures/03_02_gene_protein_edges.png')