In [None]:
import pandas as pd
import dataframe_image as dfi
import zipfile as zf

import os
import urllib.request

# Data from Cell Model Passports
Link: https://cellmodelpassports.sanger.ac.uk/downloads

## Model information for cancer type
Loaded from Model Annotation → list of all annotated models.

Needed to filter the expression data (`all data`) for lung cancer.

In [None]:
# Download the Cell Model Passports inputs once and cache them locally.

url_model_info = "https://cog.sanger.ac.uk/cmp/download/model_list_20241120.csv"
# NOTE(review): the local file name carries the date 20240110 while the URL points
# at the 20241120 file — confirm which release is intended before renaming the cache.
model_info_name = "../import_data/CMP/model_list_20240110.csv"

url_cmp_file = "https://cog.sanger.ac.uk/cmp/download/rnaseq_all_20220624.zip"
cmp_zip_file_name = "../import_data/CMP/rnaseq_all_20220624.zip"
cmp_file_name = "../import_data/CMP/rnaseq_all_data_20220624.csv"

os.makedirs("../import_data/CMP", exist_ok=True)


def _download_if_missing(url, target):
    """Fetch `url` into `target` unless `target` already exists.

    The payload is first written to `target + ".part"` and only renamed once the
    transfer has finished, so an interrupted download cannot leave a truncated
    file behind that the existence check would later accept as complete.
    """
    if os.path.exists(target):
        return
    partial = target + ".part"
    try:
        urllib.request.urlretrieve(url, partial)
        os.replace(partial, target)
    finally:
        # Only present here if the transfer or the rename failed.
        if os.path.exists(partial):
            os.remove(partial)


_download_if_missing(url_model_info, model_info_name)
_download_if_missing(url_cmp_file, cmp_zip_file_name)

if not os.path.exists(cmp_file_name):
    # The context manager closes the archive even if extraction raises.
    with zf.ZipFile(cmp_zip_file_name) as zip_file:
        zip_file.extractall("../import_data/CMP")


In [None]:
# Load only the annotation columns needed to identify lung cancer models.
df_model_info = pd.read_csv(
    model_info_name,
    delimiter=',',
    usecols=['model_id', 'tissue', 'cancer_type', 'tissue_status', 'cancer_type_detail'],
)

# Keep the models whose cancer type is one of the lung cancer types.
# A boolean mask is used instead of `.where(...).dropna()`: the latter also discards
# lung cancer models that merely lack a value in another annotation column
# (e.g. `cancer_type_detail`), silently shrinking the list of model ids built from
# this frame. Only a missing `model_id` makes a row unusable, so only that is dropped.
lung_cancer = ['Small Cell Lung Carcinoma', 'Non-Small Cell Lung Carcinoma', 'Squamous Cell Lung Carcinoma']
is_lung_cancer = df_model_info["cancer_type"].isin(lung_cancer)
df_model_info_lung = df_model_info[is_lung_cancer].dropna(subset=["model_id"])

df_model_info_lung.head(5)

In [None]:
# Plain Python list with the ids of all lung cancer models
model_ids_lung = df_model_info_lung.loc[:, "model_id"].tolist()

## Expression data
Expression Data → all RNA Seq processed Data

All tissues are cancer tissues. We need to filter for lung cancer.

**Output file format:**
* Ensembl ID
* Gene Name
* TPM value

In [None]:
# Load the four columns used in this notebook from the extracted RNA-seq table.
# (Despite its name, `model_info_file` points at the expression data file.)
model_info_file = "../import_data/CMP/rnaseq_all_data_20220624.csv"
expression_columns = ["gene_id", "gene_symbol", "model_id", "tpm"]
df_cmp_all = pd.read_csv(model_info_file, sep=",", usecols=expression_columns)

print(f"There are {len(df_cmp_all)} rows in the import_data.")

df_cmp_all.head()

### Clean Dataframe

In [None]:
# Use "gene_name" as the column label for the gene symbols.
df_cmp_all = df_cmp_all.rename(columns={"gene_symbol": "gene_name"})

In [None]:
# Keep only the expression rows that belong to a lung cancer model.
# Boolean indexing replaces `.where(...).dropna()`, which additionally removed every
# row containing any NaN (e.g. a missing gene name or TPM) and thereby hid missing
# values from any later inspection of this frame.
# `.copy()` yields an independent frame, so later in-place edits do not act on a slice.
is_lung_model = df_cmp_all["model_id"].isin(model_ids_lung)
df_cmp_all = df_cmp_all[is_lung_model].copy()

print("There are {} rows with lung cancer data.".format(df_cmp_all.shape[0]))

### Analyze Dataset

In [None]:
# --- Summary statistics of the lung cancer expression table ---

# Number of missing entries per column
missing_values = df_cmp_all.isna().sum()

# Smallest and largest TPM value
tpm_column = df_cmp_all["tpm"]
min_tpm, max_tpm = tpm_column.min(), tpm_column.max()

# Distinct gene names, and distinct model ids (reported below as "tissues")
n_genes = df_cmp_all["gene_name"].nunique()
n_tissues = df_cmp_all["model_id"].nunique()

# The model id column is removed once the models have been counted.
df_cmp_all = df_cmp_all.drop(columns=["model_id"])

print(f"Missing values:\n{missing_values}\n")

print(f"Min TPM: {min_tpm}")
print(f"Max TPM: {max_tpm}\n")

print(f"Number of genes: {n_genes}")
print(f"Number of tissues: {n_tissues}")


We then merged the Ensembl gene annotation file (see the Ensembl section below) with our CMP data on the gene names to retrieve the Ensembl (ENS) ID for each gene.

After merging the data, we found that 3,760 genes had no ENS ID associated with them.
Since these genes were likely duplicates or did not exist in the Ensembl file, we removed them from our dataset to ensure the consistency and accuracy of our analysis.

### Group Data to mean values

In [None]:
# Average the remaining value column(s) per (gene name, gene id) pair;
# `as_index=False` keeps both keys as ordinary columns.
df_cmp_group = df_cmp_all.groupby(["gene_name", "gene_id"], as_index=False).mean()

print(f"There are {len(df_cmp_group)} rows in the grouped dataset.")
df_cmp_group

## Ensembl Dataset
Downloaded via BioMart



In [None]:
# Read the tab-separated Ensembl table, label the gene symbol column "gene_name"
# (the name used in the CMP table) and discard rows that have no gene name.
df_ensembl = (
    pd.read_csv("../import_data/ENSEMBLE/ensemble_gene_id.txt", sep="\t")
    .rename(columns={"gene_symbol": "gene_name"})
    .dropna(subset=["gene_name"])
)

df_ensembl

In [None]:
# Count the rows whose gene name occurs more than once (every occurrence is counted).
is_repeated_name = df_ensembl.duplicated(subset=["gene_name"], keep=False)
duplicate_names = is_repeated_name.sum()
rows = len(df_ensembl)
print(f'{duplicate_names} from {rows} do not have a unique gene names in ENS Dataset')

**Problem:** Some gene names in the Ensembl dataset are not unique.

→ If a gene name is not unique, it cannot be mapped unambiguously to a single ENS ID, so we cannot merge the data with our dataset on the gene names.

In [None]:
# Keep only the rows whose gene name appears exactly once;
# every occurrence of a repeated name is removed.
is_ambiguous_name = df_ensembl["gene_name"].duplicated(keep=False)
df_ensembl_unique = df_ensembl[~is_ambiguous_name]

df_ensembl_unique

## Merge Data

In [None]:
# Left join on the gene name: every CMP gene is kept, and genes without a
# matching name get NaN in the columns coming from the Ensembl table.
df_cmp_ens = df_cmp_group.merge(df_ensembl_unique, how="left", on="gene_name")
df_cmp_ens

In [None]:
# Rows of the merged table that have no value in "Gene_stable_ID"
has_no_ens_id = df_cmp_ens["Gene_stable_ID"].isna()
missing_ens = df_cmp_ens.loc[has_no_ens_id]

print(len(missing_ens), "/", len(df_cmp_ens), "still have no ENS ID")
missing_ens

In [None]:
# Display every row of df_cmp_ens whose gene name occurs more than once
has_repeated_name = df_cmp_ens.duplicated(subset=["gene_name"], keep=False)
df_cmp_ens.loc[has_repeated_name]

### Clean up

In [None]:
# Build the final table without modifying `df_cmp_ens`.
# The previous in-place dropna/rename changed the column labels of `df_cmp_ens`, so
# re-running this cell (or the inspection cells that read "Gene_stable_ID") raised a
# KeyError. The chained form below is safe to re-run.
df_cmp = (
    df_cmp_ens
    # genes without an Ensembl id are removed
    .dropna(subset=["Gene_stable_ID"])
    .rename(columns={"Gene_stable_ID": "Gene ID",
                     "tpm": "cancerous TPM",
                     "gene_name": "Gene Name"})
    # keep only the output columns, in the desired order
    .loc[:, ['Gene ID', 'Gene Name', 'cancerous TPM']]
)

df_cmp

### Save Data

In [None]:
# Write the final table to disk.
# The output folder is created first so the cell also works when it does not exist
# yet, mirroring how the import folder is created before the downloads.
os.makedirs("../processed_data", exist_ok=True)
df_cmp.to_csv("../processed_data/CMP_cancer_mean.csv", index=False)
print(f'There are {df_cmp.shape[0]} rows/genes in the saved dataset.')

In [None]:
# Export the first five rows of the final table as a PNG into the tex figures folder.
preview_rows = df_cmp.head(5)
dfi.export(preview_rows, "../tex/figures/03_01_CMP_cancer_mean.png")