- Author: Peter Riesebos
- Purpose: Script used to filter the expression matrix down to protein-coding genes using pandas.
- Input: List of protein coding genes, expression matrix
- Output: Filtered expression matrix .txt.gz file 

## Imports

In [1]:
import pandas as pd

In [8]:
# Load the protein-coding gene list into a single-column dataframe.
# The file is read as header-less (every line is treated as data) and the
# one column is labelled 'Gene'.
prot = pd.read_csv(
    "/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/pub_rna/final_files_pub_rna/protein_coding_genes_list.txt",
    sep="\t",
    header=None,
    names=["Gene"],
)
prot

Unnamed: 0,Gene
0,ENSG00000186092.7
1,ENSG00000284733.2
2,ENSG00000284662.2
3,ENSG00000187634.13
4,ENSG00000188976.11
...,...
20065,ENSG00000277836.1
20066,ENSG00000278633.1
20067,ENSG00000276017.1
20068,ENSG00000278817.1


In [5]:
# Load the combined expression matrix (tab-separated; pandas infers the gzip
# compression from the ".gz" extension). The first line of the file supplies
# the column names.
exp = pd.read_csv(
    filepath_or_buffer="/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/combined/combined_expression_matrix.txt.gz",
    sep="\t",
)
exp

Unnamed: 0,gene,GTEX-111CU,GTEX-111VG,GTEX-111YS,GTEX-1122O,GTEX-1128S,GTEX-117YW,GTEX-117YX,GTEX-11DXX,GTEX-11DXZ,...,SRR8774220,SRR8774218,SRR8774200,SRR8774237,SRR8774217,SRR8774234,SRR8774231,SRR8774203,SRR8774207,SRR8774235
0,ENSG00000000003,-0.125585,0.122961,0.190132,-0.217898,-0.051152,0.240522,-0.295547,-0.134806,-0.330283,...,0.332547,0.344190,0.522676,-0.475680,0.697863,-0.426169,1.074275,0.240747,-0.991953,-1.085401
1,ENSG00000000005,0.010117,-0.641715,0.010271,1.058705,-0.669473,0.328847,0.965188,-0.342902,-0.827583,...,-0.479884,0.005676,0.346633,-0.283573,-0.192958,-1.246924,-0.563771,-0.232208,0.453034,0.569829
2,ENSG00000000419,-0.423136,-0.317468,-0.737069,-0.245801,-0.537045,-0.104784,-0.096081,0.108634,0.508258,...,0.675924,-0.642768,0.754549,-0.752429,0.631987,-2.259803,-0.147150,-0.188850,0.180100,-0.966924
3,ENSG00000000457,-0.041449,-0.167390,-0.008200,-0.052758,0.311891,0.113554,0.285084,-0.026506,-0.271002,...,0.181457,-1.138333,1.219799,0.268599,-0.878440,-0.016553,-0.623251,1.187019,0.327071,0.860316
4,ENSG00000000460,-0.054366,0.012443,0.494128,0.692178,-0.029416,0.297539,0.419932,-0.049671,-0.365092,...,-0.927054,-0.087789,0.113627,-0.099343,-0.586983,-1.971774,0.590401,0.380119,-0.013840,0.422249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60164,ENSG00000292314,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60165,ENSG00000292316,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60166,ENSG00000292319,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60167,ENSG00000292348,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.190715,-0.123060,0.226153,0.186300,-0.117426,0.483238,0.648230,-0.327177,0.377526,1.287604


## Data manipulation

In [9]:
# Build a version-less copy of the gene list: drop a trailing ".<digits>"
# suffix from every ID in the "Gene" column. `assign` returns a new
# dataframe, so `prot` itself is left untouched.
prot_no_version = prot.assign(
    Gene=prot["Gene"].str.replace(r'\.\d+$', "", regex=True)
)
# Quick visual check of the first few stripped IDs.
prot_no_version["Gene"].head(3)

0    ENSG00000186092
1    ENSG00000284733
2    ENSG00000284662
Name: Gene, dtype: object

In [13]:
# Keep only the expression-matrix rows whose 'gene' value occurs in the
# version-less protein-coding gene list. Original row labels are preserved.
is_protein_coding = exp["gene"].isin(prot_no_version["Gene"])
filtered_exp = exp.loc[is_protein_coding]
filtered_exp

Unnamed: 0,gene,GTEX-111CU,GTEX-111VG,GTEX-111YS,GTEX-1122O,GTEX-1128S,GTEX-117YW,GTEX-117YX,GTEX-11DXX,GTEX-11DXZ,...,SRR8774220,SRR8774218,SRR8774200,SRR8774237,SRR8774217,SRR8774234,SRR8774231,SRR8774203,SRR8774207,SRR8774235
0,ENSG00000000003,-0.125585,0.122961,0.190132,-0.217898,-0.051152,0.240522,-0.295547,-0.134806,-0.330283,...,0.332547,0.344190,0.522676,-0.475680,0.697863,-0.426169,1.074275,0.240747,-0.991953,-1.085401
1,ENSG00000000005,0.010117,-0.641715,0.010271,1.058705,-0.669473,0.328847,0.965188,-0.342902,-0.827583,...,-0.479884,0.005676,0.346633,-0.283573,-0.192958,-1.246924,-0.563771,-0.232208,0.453034,0.569829
2,ENSG00000000419,-0.423136,-0.317468,-0.737069,-0.245801,-0.537045,-0.104784,-0.096081,0.108634,0.508258,...,0.675924,-0.642768,0.754549,-0.752429,0.631987,-2.259803,-0.147150,-0.188850,0.180100,-0.966924
3,ENSG00000000457,-0.041449,-0.167390,-0.008200,-0.052758,0.311891,0.113554,0.285084,-0.026506,-0.271002,...,0.181457,-1.138333,1.219799,0.268599,-0.878440,-0.016553,-0.623251,1.187019,0.327071,0.860316
4,ENSG00000000460,-0.054366,0.012443,0.494128,0.692178,-0.029416,0.297539,0.419932,-0.049671,-0.365092,...,-0.927054,-0.087789,0.113627,-0.099343,-0.586983,-1.971774,0.590401,0.380119,-0.013840,0.422249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60043,ENSG00000291309,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.280334,-0.607832,-0.609009,0.525254,0.171010,0.019461,0.281028,-0.082001,-0.674960,-0.805458
60045,ENSG00000291317,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-1.204795,-0.552627,0.035092,0.545718,0.124836,-0.788326,0.754951,-0.137872,0.298090,-0.144593
60143,ENSG00000292277,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.189738,0.079479,0.841354,-1.007350,-0.657792,-0.244259,0.607207,0.216814,-0.703668,1.758570
60167,ENSG00000292348,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.190715,-0.123060,0.226153,0.186300,-0.117426,0.483238,0.648230,-0.327177,0.377526,1.287604


## Data export

In [14]:
# Export the filtered matrix as a tab-separated file without the row index.
# pandas infers gzip compression from the ".gz" extension of the target path.
filtered_exp.to_csv(
    path_or_buf="/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/combined/combined_expression_matrix_protein_coding_filtered.txt.gz",
    sep="\t",
    index=False,
)