In [1]:
import pandas as pd
import os

In [2]:
# Load the raw expression table. `data_dir` is the base directory that every
# input/output path in this notebook is resolved against ("" = current directory).
data_dir = ""
raw_table_path = os.path.join(data_dir, "TCGA-COAD.htseq_fpkm_1.csv")
original_rna_data = pd.read_csv(raw_table_path)

In [3]:
# Number of rows in the raw table.
len(original_rna_data.index)

60483

In [4]:
# Load the gene list produced by the Y-chromosome filtering step; only rows
# whose Ensembl_ID matches an entry of this list are kept further down.
gene_list_path = os.path.join(data_dir, "rna_seq_filtered_y_chromosome.csv")
filtered_genes = pd.read_csv(gene_list_path)

In [5]:
# Gene identifiers live in column 'x'; pull them out as a plain Python list.
filtered_gene_list = filtered_genes.loc[:, 'x'].tolist()

In [6]:
# One regex alternative per gene: the gene ID followed by a ".<digits>" suffix.
version_suffix = r"\.\d+"
pattern = '|'.join(gene_id + version_suffix for gene_id in filtered_gene_list)

In [7]:
# Keep only the rows whose Ensembl_ID is "<gene>.<version>" for a gene in
# filtered_gene_list.
# The ID part in front of the first ".<digits>" is extracted once per row and
# looked up in a set. This avoids running an unanchored regex search with one
# alternative per gene against every row, and it does not depend on the gene
# IDs being free of regex metacharacters.
# Rows without a ".<digits>" suffix extract to NaN and are therefore dropped,
# just as they would fail to match "<gene>\.\d+".
gene_stems = original_rna_data['Ensembl_ID'].str.extract(r"^(.+?)\.\d+", expand=False)
filtered_original_data = original_rna_data[gene_stems.isin(set(filtered_gene_list))]

In [8]:
# Number of rows left after restricting to the gene list.
len(filtered_original_data.index)

55821

In [40]:
# Drop rows that carry no signal at all, i.e. every non-ID column is 0 (or missing).
# Every column except the identifier is checked.
columns_to_check = filtered_original_data.columns.difference(['Ensembl_ID'])
# Build a boolean row mask instead of swapping 0 for pd.NA: replacing values
# with pd.NA pushes the numeric columns to object dtype, which the later
# arithmetic then has to work on. Missing values are treated like zeros here,
# so the same rows are kept as with replace(0, pd.NA).dropna(how='all').
has_signal = filtered_original_data[columns_to_check].fillna(0).ne(0).any(axis=1)
filtered_genes = filtered_original_data[has_signal]

In [41]:
# Number of rows left after removing the all-zero rows.
len(filtered_genes.index)

54186

In [49]:
# Preparation for normalisation: keep only the non-ID columns and
# turn any missing value into 0.
filtered_genes = filtered_genes.loc[:, columns_to_check].fillna(value=0)

In [50]:
# Work on an independent copy so the scaling below leaves filtered_genes untouched.
clean_filtered_genes = filtered_genes.copy(deep=True)

In [51]:
# Min-max scaling, applied independently to each column:
#     scaled = (value - column_min) / (column_max - column_min)
# NOTE(review): a column whose values are all identical has a zero range and
# will come out as NaN - confirm that cannot happen for this data.
unscaled = clean_filtered_genes[columns_to_check]
column_min = unscaled.min()
column_range = unscaled.max() - column_min
clean_filtered_genes[columns_to_check] = (unscaled - column_min) / column_range

In [66]:
# Re-attach the identifier column. The join aligns on the row index, which the
# scaled frame still shares with filtered_original_data.
ensembl_ids = filtered_original_data["Ensembl_ID"]
clean_filtered_genes = clean_filtered_genes.join(ensembl_ids, how="left")

In [67]:
# Put the identifier column first and keep every other column in its current
# relative order.
# The column is moved by name rather than by "take the last column", so
# re-running this cell is a no-op instead of rotating the column order once
# more on every execution.
id_column = "Ensembl_ID"
cols = [id_column] + [col for col in clean_filtered_genes.columns if col != id_column]
clean_filtered_genes = clean_filtered_genes[cols]

In [69]:
# reset_index returns a new frame, so the result has to be assigned back;
# otherwise the frame keeps the gapped row labels left over from filtering.
clean_filtered_genes = clean_filtered_genes.reset_index(drop=True)
clean_filtered_genes

Unnamed: 0,Ensembl_ID,TCGA-QL-A97D-01A,TCGA-RU-A8FL-01A,TCGA-SS-A7HO-01A,TCGA-T9-A92H-01A,TCGA-WS-AB45-01A,TCGA-3L-AA1B-01A,TCGA-4N-A93T-01A,TCGA-4T-AA8H-01A,TCGA-5M-AAT4-01A,...,TCGA-NH-A6GB-01A,TCGA-NH-A6GC-01A,TCGA-NH-A8F7-01A,TCGA-NH-A8F7-06A,TCGA-NH-A8F8-01A,TCGA-QG-A5YV-01A,TCGA-QG-A5YW-01A,TCGA-QG-A5YX-01A,TCGA-QG-A5Z1-01A,TCGA-QG-A5Z2-01A
0,ENSG00000242268.2,0.000000,0.000000,0.000000,0.000000,0.002902,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,ENSG00000270112.3,0.000000,0.000000,0.000000,0.000000,0.000407,0.000945,0.000418,0.000000,0.000000,...,0.000578,0.000000,0.001731,0.000000,0.000000,0.000000,0.001422,0.000402,0.000637,0.000357
2,ENSG00000167578.15,0.111074,0.161675,0.126848,0.153750,0.136364,0.145428,0.181076,0.137881,0.117238,...,0.140789,0.155303,0.123286,0.132441,0.160627,0.131728,0.163576,0.135578,0.108194,0.162852
3,ENSG00000078237.5,0.212226,0.129755,0.151101,0.212189,0.189703,0.172303,0.131233,0.181418,0.182306,...,0.201954,0.137802,0.106115,0.205450,0.225953,0.171489,0.225895,0.217415,0.135701,0.122408
4,ENSG00000146083.10,0.320563,0.352324,0.389671,0.327009,0.275312,0.375720,0.340420,0.292997,0.310173,...,0.290776,0.329216,0.346549,0.321359,0.338111,0.343694,0.249915,0.272809,0.302615,0.338425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54181,ENSG00000273233.1,0.000000,0.014700,0.005347,0.009525,0.000000,0.005554,0.000000,0.007454,0.005054,...,0.000000,0.000000,0.005111,0.004574,0.000000,0.000000,0.015927,0.000000,0.007399,0.004212
54182,ENSG00000105063.17,0.319200,0.288503,0.369360,0.305565,0.338062,0.318408,0.279824,0.314924,0.329171,...,0.316603,0.276381,0.304403,0.305988,0.297056,0.332286,0.311110,0.345408,0.311048,0.340994
54183,ENSG00000231119.2,0.006232,0.012813,0.002550,0.002323,0.000000,0.010236,0.004633,0.010375,0.008208,...,0.009390,0.014631,0.004822,0.016266,0.021899,0.003550,0.006858,0.001134,0.006136,0.003972
54184,ENSG00000123685.7,0.039291,0.043760,0.021091,0.019251,0.123433,0.034858,0.021879,0.029243,0.025216,...,0.065667,0.043859,0.009426,0.011126,0.040516,0.022423,0.051319,0.018759,0.030967,0.035787


In [70]:
# Write the preprocessed table without the row index.
output_path = os.path.join(data_dir, "preprocessed/rna-seq.csv")
# to_csv does not create missing directories; make sure the target folder exists
# so the notebook also runs on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
clean_filtered_genes.to_csv(output_path, index=False)