In [None]:
import dataframe_image as dfi
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import zscore

# Gene Nodes

Assemble the information for the gene nodes and save it as a CSV file.

Gene Nodes have the following information:
- Gene ID
- Gene Name
- TPM value for lung cancer
- TPM value for healthy lung tissue
- TPM difference between cancer and healthy tissue

In [None]:
# Healthy-tissue TPM per gene (GTEx); only the two columns used below are read.
df_gtex = pd.read_csv('../processed_data/GTEX_healthy_mean.csv', usecols=['Gene ID', 'healthy TPM'])
# Cancer TPM per gene (CMP); all columns of the file are kept.
df_cmp = pd.read_csv('../processed_data/CMP_cancer_mean.csv')

# Tab-separated gene -> protein mapping; the columns 'Gene stable ID' and
# 'Protein stable ID' are used further down.
df_protein_gene = pd.read_csv('../import_data/ENSEMBLE/biomart_gene_protein.txt', sep='\t')

## List of genes that have a gene-protein connection

In [None]:
# Discard mapping rows that carry no protein ID, then collect the distinct
# gene IDs that remain.
df_protein_gene = df_protein_gene.dropna(subset=['Protein stable ID'])
protein_genes = df_protein_gene['Gene stable ID'].unique()

len(protein_genes)

## Merge CMP (cancer) and GTEx (healthy) dataset

In [None]:
# Work on copies so the frames loaded from disk stay untouched.
df_nodes_cancer = df_cmp.copy()
df_nodes_healthy = df_gtex.copy()

# The inner join on 'Gene ID' keeps only genes present in both datasets; the
# chained .loc then filters out those without gene-protein connection.
df_nodes_genes = (
    pd.merge(df_nodes_cancer, df_nodes_healthy, on='Gene ID', how='inner')
    .loc[lambda merged: merged['Gene ID'].isin(protein_genes)]
)

print(f"There are {df_nodes_genes.shape[0]} rows in the merged dataset. All have a healthy and a cancer TPM value.")

df_nodes_genes

## Normalize TPM Values
TPM values are log-transformed and then min-max normalized to the range [0, 1] so that they are easier to compare.

In [None]:
def log_norm(column, min_tpm, max_tpm):
    """Logarithmic scaling followed by min-max normalization.

    Applies ``log1p`` to ``column`` and to both bounds, then rescales so that
    ``min_tpm`` maps to 0 and ``max_tpm`` maps to 1.

    Parameters
    ----------
    column : array-like or scalar
        Values to normalize.
    min_tpm, max_tpm : scalar
        Lower and upper bound of the value range, given on the original
        (non-log) scale.

    Returns
    -------
    Same type as ``np.log1p(column)``: the normalized values. Inputs inside
    ``[min_tpm, max_tpm]`` land in ``[0, 1]``.
    """
    lower_log = np.log1p(min_tpm)
    upper_log = np.log1p(max_tpm)

    shifted = np.log1p(column) - lower_log
    return shifted / (upper_log - lower_log)

In [None]:
# Report the value range of both raw TPM columns, rounded to 3 decimals.
healthy_tpm = df_nodes_genes['healthy TPM']
cancerous_tpm = df_nodes_genes['cancerous TPM']

# Min
print(f"Min healthy TPM:\t{healthy_tpm.min().round(3)}")
print(f"Min cancerous TPM:\t{cancerous_tpm.min().round(3)}\n")

# Max
print(f"Max healthy TPM:\t{healthy_tpm.max().round(3)}")
print(f"Max cancerous TPM:\t{cancerous_tpm.max().round(3)}")

In [None]:
# Both columns share one min/max so that the normalized healthy and cancerous
# values end up on the same scale.
healthy_tpm = df_nodes_genes['healthy TPM']
cancerous_tpm = df_nodes_genes['cancerous TPM']

min_tpm = min(healthy_tpm.min(), cancerous_tpm.min())
max_tpm = max(healthy_tpm.max(), cancerous_tpm.max())

# perform log normalization
df_nodes_genes['norm healthy TPM'] = log_norm(healthy_tpm, min_tpm, max_tpm)
df_nodes_genes['norm cancerous TPM'] = log_norm(cancerous_tpm, min_tpm, max_tpm)

In [None]:
# Distribution of the normalized healthy TPM values, drawn on the current axes.
ax = plt.gca()
ax.hist(df_nodes_genes['norm healthy TPM'], bins=100)
ax.set_title('Histogram of normalized healthy TPM values')
ax.set_xlabel('normalized TPM values')
ax.set_ylabel('Frequency')

# Save before show(): the figure is written to disk while it is still current.
plt.savefig('../tex/figures/03_02_normalized_gtex_tpm.png')
plt.show()

In [None]:
# Distribution of the normalized cancerous TPM values, drawn on the current axes.
ax = plt.gca()
ax.hist(df_nodes_genes['norm cancerous TPM'], bins=100)
ax.set_title('Histogram of normalized cancerous TPM values')
ax.set_xlabel('normalized TPM values')
ax.set_ylabel('Frequency')

# Save before show(): the figure is written to disk while it is still current.
plt.savefig('../tex/figures/03_02_normalized_cmp_tpm.png')
plt.show()

In [None]:
# Drop the raw TPM columns. Reassigning (instead of inplace=True) leaves the
# frame object that earlier cells displayed untouched.
df_nodes_genes = df_nodes_genes.drop(columns=['healthy TPM', 'cancerous TPM'])
df_nodes_genes

## Calculate `Δ TPM`

`Δ TPM` is the difference between the normalized cancerous TPM and the normalized healthy TPM (cancerous − healthy).

In [None]:
# Δ TPM = normalized cancerous TPM minus normalized healthy TPM.
delta_tpm = df_nodes_genes['norm cancerous TPM'].sub(df_nodes_genes['norm healthy TPM'])
df_nodes_genes['Δ TPM'] = delta_tpm

# Strictly positive deltas are an 'increase'; everything else is 'decrease'.
# NOTE(review): a delta of exactly 0 (or NaN) is therefore labelled 'decrease' -
# confirm this is intended.
df_nodes_genes['Δ type'] = np.where(delta_tpm.gt(0), 'increase', 'decrease')

df_nodes_genes

In [None]:
# Distribution of the Δ TPM values, drawn on the current axes.
ax = plt.gca()
ax.hist(df_nodes_genes['Δ TPM'], bins=100)
ax.set_title('Histogram of Δ TPM')
ax.set_xlabel('Δ TPM Values')
ax.set_ylabel('Frequency')

# Save before show(): the figure is written to disk while it is still current.
plt.savefig('../tex/figures/03_02_delta_tpm.png')
plt.show()

In [None]:
# Range of the Δ TPM values
delta_tpm = df_nodes_genes['Δ TPM']
min_delta = delta_tpm.min()
max_delta = delta_tpm.max()

print("Minimum Δ TPM value: {:.3f}".format(min_delta))
print("Maximum Δ TPM value: {:.3f}".format(max_delta))

# Share of each Δ type as a fraction of all rows, rounded to 2 decimals
print("\n")
delta_type_share = df_nodes_genes['Δ type'].value_counts(normalize=True)
print(delta_type_share.round(2))

## Calculate `Δ TPM relevant` with z score
Add a column that indicates if the change in the gene activity is relevant between cancer and healthy tissue.


The z-score of each Δ TPM value is calculated and used to decide whether the change in gene activity is relevant.
The z-score states how many standard deviations a Δ TPM value lies away from the mean of all Δ TPM values.


In [None]:
# 1.96 is the two-sided 5% cutoff of a standard normal distribution, i.e. the
# most extreme 2.5% in each tail.
z_threshold = 1.96

df_nodes_genes['z score'] = zscore(df_nodes_genes['Δ TPM'])

# Flag on the unrounded z score; rounding happens afterwards and only affects
# the stored/displayed value.
is_relevant = df_nodes_genes['z score'].abs().gt(z_threshold)
df_nodes_genes['Δ TPM relevant'] = is_relevant
df_nodes_genes['z score'] = df_nodes_genes['z score'].round(3)

df_nodes_genes

In [None]:
print("There are {} genes that have a significant change in gene activity.".format(df_nodes_genes['Δ TPM relevant'].sum()))

# Among the genes flagged as relevant, find the Δ TPM values closest to zero on
# either side: the largest negative one and the smallest positive one.
relevant_genes = df_nodes_genes[df_nodes_genes['Δ TPM relevant']]
relevant_delta = relevant_genes['Δ TPM']

min_significant = relevant_delta[relevant_delta < 0].max()
max_significant = relevant_delta[relevant_delta > 0].min()

print(f"Minimum negative Δ TPM value for a significant change is: {min_significant:.3f}")
print(f"Maximum positive Δ TPM value for a significant change is: {max_significant:.3f}")


## Save

In [None]:
# Write the gene-node table to disk; the DataFrame index is not written.
nodes_genes_csv = '../processed_data/nodes_genes.csv'
df_nodes_genes.to_csv(nodes_genes_csv, index=False)

In [None]:
# Report how many gene nodes the final table contains.
n_gene_nodes = len(df_nodes_genes)
print(f"There are {n_gene_nodes} rows as gene nodes.")

In [None]:
# Render the first five gene nodes as an image. reset_index already returns a
# new frame with a fresh 0..n-1 index, so no separate copy is needed.
df_nodes_genes_plot = df_nodes_genes.reset_index(drop=True)

dfi.export(df_nodes_genes_plot.head(5), '../tex/figures/03_02_gene_nodes.png')