In [None]:
# Import necessary libraries for data manipulation and encoding
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

**Dataset link:**

https://www.cbioportal.org/results/oncoprint?cancer_study_list=pancan_pcawg_2020&Z_SCORE_THRESHOLD=2.0&RPPA_SCORE_THRESHOLD=2.0&profileFilter=mutations&case_set_id=pancan_pcawg_2020_sequenced&gene_list=TP53&geneset_list=%20&tab_index=tab_visualize&Action=Submit

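This dataset comes from the cBioPortal study `pancan_pcawg_2020` (the ICGC/TCGA Pan-Cancer Analysis of Whole Genomes). It contains genome-wide mRNA expression (RNA-seq FPKM) and somatic mutation data; this notebook uses the mutation calls for TP53.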

In [None]:
# Directory holding the raw input files, and the paths of the two tables loaded below
DATA_DIR = "../data/raw/tcga"
EXPR_FILE = DATA_DIR + "/data_mrna_seq_fpkm.txt"
MUTATIONS_FILE = DATA_DIR + "/data_mutations.txt"


In [None]:
# Read the expression table (tab-separated; first column becomes the row index)
expr = pd.read_csv(
    EXPR_FILE,
    sep='\t',
    index_col=0,
)

# Read the mutation table (tab-separated; text after '#' is treated as a comment;
# first column becomes the row index)
mutations = pd.read_csv(
    MUTATIONS_FILE,
    sep='\t',
    comment='#',
    index_col=0,
    low_memory=False,
)

In [None]:
# Show the top five rows of the expression table
expr.head(5)

Unnamed: 0_level_0,SP89389,SP21193,SP13206,SP103623,SP32742,SP111095,SP8394,SP87446,SP36586,SP123902,...,SP15656,SP123888,SP59420,SP116679,SP1377,SP16269,SP122676,SP88776,SP64546,SP21057
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,65.515874,93.507754,15.155525,9.751670,16.048850,22.724550,29.154328,18.058406,15.486367,43.248311,...,0.000000,27.329458,3.633029,2.302684,227.790781,0.007476,9.736614,29.283034,28.045150,62.010994
TNMD,0.046881,0.655002,0.000000,0.188735,0.014542,0.000000,0.110858,0.031887,0.000000,0.118371,...,0.000000,0.033938,0.064275,0.414507,0.345589,0.000000,0.062405,0.000000,0.026696,0.405261
DPM1,63.315522,23.608843,42.136192,14.507132,45.302557,24.728471,22.578201,21.447736,19.492883,20.331784,...,18.197652,38.290606,35.459968,31.888606,34.589654,23.290497,31.444337,23.118048,51.549932,30.790944
SCYL3,2.222857,1.933762,2.318698,2.342167,4.373694,4.050503,3.372809,2.359359,1.930601,1.886838,...,4.735389,1.356212,4.340347,5.257385,2.496924,6.466530,1.072155,3.699246,1.989817,3.313589
C1orf112,2.565836,1.861716,2.870440,0.454312,4.125029,1.877498,1.999630,0.747373,0.501911,0.369148,...,6.592257,0.259415,1.864762,2.262558,1.257206,6.384945,1.207673,0.539174,1.556905,1.757992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BX649553.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
BX649553.3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
BX649553.4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
RN7SL355P,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [None]:
# Show the top five rows of the mutation table
mutations.head(5)

Unnamed: 0_level_0,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,Variant_Type,...,HGVSc,HGVSp,HGVSp_Short,Transcript_ID,RefSeq,Protein_position,Codons,Hotspot,Annotation_Status,DNA_VAF
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MATN1,4146.0,,GRCh37,1,31189041,31189041,+,missense_variant,Missense_Mutation,SNP,...,ENST00000373765.4:c.922T>C,p.Ser308Pro,p.S308P,ENST00000373765,NM_002379.3,308.0,Tca/Cca,0,SUCCESS,0.119
TFAP2E,339488.0,,GRCh37,1,36055608,36055608,+,missense_variant,Missense_Mutation,SNP,...,ENST00000373235.3:c.863G>A,p.Arg288His,p.R288H,ENST00000373235,NM_178548.3,288.0,cGc/cAc,0,SUCCESS,0.0851
FGGY,55277.0,,GRCh37,1,60228203,60228203,+,synonymous_variant,Silent,SNP,...,ENST00000303721.7:c.1603C>T,p.Leu535=,p.L535=,ENST00000303721,NM_018291.3,535.0,Ctg/Ttg,0,SUCCESS,0.4103
PGM1,5236.0,,GRCh37,1,64125315,64125315,+,missense_variant,Missense_Mutation,SNP,...,ENST00000371084.3:c.1658C>T,p.Thr553Met,p.T553M,ENST00000371084,NM_002633.2,553.0,aCg/aTg,0,SUCCESS,0.14
DEPDC1,55635.0,,GRCh37,1,68948436,68948436,+,missense_variant,Missense_Mutation,SNP,...,ENST00000456315.2:c.1055G>T,p.Arg352Ile,p.R352I,ENST00000456315,NM_001114120.1,352.0,aGa/aTa,0,SUCCESS,0.1765


In [None]:
# Select every TP53 row from the mutations table, which is indexed by Hugo_Symbol.
# A list selector (['TP53']) is used so the result is always a DataFrame, even if
# only a single row matches; a scalar selector would return a Series in that case
# and break the column selection below.
tp53_mutations = mutations.loc[['TP53']]

# Keep only the sample barcode and the two variant descriptors. The Hugo_Symbol
# index is constant ('TP53') after filtering, so it is discarded in favour of a
# fresh 0..n-1 index.
tp_53 = tp53_mutations[
    ['Tumor_Sample_Barcode', 'Variant_Type', 'Variant_Classification']
].reset_index(drop=True)

# Preview the TP53 mutations data
tp53_mutations.head()

Unnamed: 0_level_0,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,Variant_Type,...,HGVSc,HGVSp,HGVSp_Short,Transcript_ID,RefSeq,Protein_position,Codons,Hotspot,Annotation_Status,DNA_VAF
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TP53,7157.0,,GRCh37,17,7577058,7577058,+,stop_gained,Nonsense_Mutation,SNP,...,ENST00000269305.4:c.880G>T,p.Glu294Ter,p.E294*,ENST00000269305,NM_001126112.2,294.0,Gag/Tag,0,SUCCESS,0.8846
TP53,7157.0,,GRCh37,17,7578475,7578475,+,missense_variant,Missense_Mutation,SNP,...,ENST00000269305.4:c.455C>T,p.Pro152Leu,p.P152L,ENST00000269305,NM_001126112.2,152.0,cCg/cTg,0,SUCCESS,0.1154
TP53,7157.0,,GRCh37,17,7577560,7577560,+,missense_variant,Missense_Mutation,SNP,...,ENST00000269305.4:c.721T>C,p.Ser241Pro,p.S241P,ENST00000269305,NM_001126112.2,241.0,Tcc/Ccc,0,SUCCESS,0.9375
TP53,7157.0,,GRCh37,17,7578540,7578548,+,inframe_deletion,In_Frame_Del,DEL,...,ENST00000269305.4:c.382_390del,p.Pro128_Leu130del,p.P128_L130del,ENST00000269305,NM_001126112.2,128.0,CCTGCCCTC/-,0,SUCCESS,0.8461
TP53,7157.0,,GRCh37,17,7577580,7577580,+,missense_variant,Missense_Mutation,SNP,...,ENST00000269305.4:c.701A>G,p.Tyr234Cys,p.Y234C,ENST00000269305,NM_001126112.2,234.0,tAc/tGc,0,SUCCESS,0.4524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TP53,7157.0,,GRCh37,17,7577548,7577548,+,missense_variant,Missense_Mutation,SNP,...,ENST00000269305.4:c.733G>A,p.Gly245Ser,p.G245S,ENST00000269305,NM_001126112.2,245.0,Ggc/Agc,0,SUCCESS,0.5000
TP53,7157.0,,GRCh37,17,7578555,7578555,+,splice_acceptor_variant,Splice_Site,SNP,...,ENST00000269305.4:c.376-1G>A,,p.X126_splice,ENST00000269305,NM_001126112.2,126.0,,0,SUCCESS,1.0000
TP53,7157.0,,GRCh37,17,7577557,7577557,+,missense_variant,Missense_Mutation,SNP,...,ENST00000269305.4:c.724T>A,p.Cys242Ser,p.C242S,ENST00000269305,NM_001126112.2,242.0,Tgc/Agc,0,SUCCESS,0.3214
TP53,7157.0,,GRCh37,17,7578212,7578212,+,stop_gained,Nonsense_Mutation,SNP,...,ENST00000269305.4:c.637C>T,p.Arg213Ter,p.R213*,ENST00000269305,NM_001126112.2,213.0,Cga/Tga,0,SUCCESS,0.3913


In [None]:
# Show the top five rows of the reduced TP53 mutation table
tp_53.head(5)

Unnamed: 0,Tumor_Sample_Barcode,Variant_Type,Variant_Classification
0,SP101724,SNP,Nonsense_Mutation
1,SP22031,SNP,Missense_Mutation
2,SP59388,SNP,Missense_Mutation
3,SP94588,DEL,In_Frame_Del
4,SP7692,SNP,Missense_Mutation
...,...,...,...
936,SP111033,SNP,Missense_Mutation
937,SP24815,SNP,Splice_Site
938,SP112245,SNP,Missense_Mutation
939,SP124415,SNP,Nonsense_Mutation


In [None]:
# Build a per-sample TP53 annotation table indexed by Tumor_Sample_Barcode.
#
# 1. Strip whitespace from the barcodes BEFORE de-duplicating, so barcodes that
#    differ only by padding are recognised as the same sample.
# 2. Keep exactly one row per sample. A sample can carry more than one TP53
#    variant; if several rows per barcode were kept, joining this table onto the
#    expression matrix would duplicate that sample's expression row.
#    NOTE(review): the first listed variant per sample is kept - confirm this is
#    the desired representative for the multiclass target.
tp53 = (
    tp_53[['Tumor_Sample_Barcode', 'Variant_Type', 'Variant_Classification']]
    .assign(Tumor_Sample_Barcode=lambda df: df['Tumor_Sample_Barcode'].str.strip())
    .drop_duplicates(subset='Tumor_Sample_Barcode', keep='first')
    .set_index('Tumor_Sample_Barcode')
)

# The join in a later cell relies on one annotation row per sample.
assert tp53.index.is_unique, "expected one TP53 annotation row per sample"

In [None]:
# Flip the expression table so each row is a sample, and label the row index
# with the same name used for sample barcodes in the mutation data
expr = expr.T.rename_axis(index='Tumor_Sample_Barcode')

# Show the top rows of the sample-by-gene table
expr.head()

Hugo_Symbol,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,AP000230.1,RP11-80H18.4,RP13-297E16.4,LL0YNC03-29C1.1,RP13-297E16.5,BX649553.1,BX649553.3,BX649553.4,RN7SL355P,MIR3690
Tumor_Sample_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SP89389,65.515874,0.046881,63.315522,2.222857,2.565836,2.167233,1.265078,34.199996,4.974365,29.891291,...,0.147707,0.245060,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SP21193,93.507754,0.655002,23.608843,1.933762,1.861716,3.873639,3.268175,23.050032,4.019948,8.379595,...,0.011216,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SP13206,15.155525,0.000000,42.136192,2.318698,2.870440,4.286835,9.066806,32.926751,3.909932,9.277749,...,0.027060,0.179581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SP103623,9.751670,0.188735,14.507132,2.342167,0.454312,13.098299,26.108567,25.074303,0.682193,8.342081,...,0.241574,1.479853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SP32742,16.048850,0.014542,45.302557,4.373694,4.125029,4.316576,16.914069,15.602162,8.076706,10.929168,...,0.040090,0.152030,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SP16269,0.007476,0.000000,23.290497,6.466530,6.384945,235.016264,0.013622,5.548667,13.125604,17.631857,...,0.000000,0.432225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SP122676,9.736614,0.062405,31.444337,1.072155,1.207673,5.166039,5.904768,61.047477,2.502007,12.565596,...,0.000000,0.733968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SP88776,29.283034,0.000000,23.118048,3.699246,0.539174,0.450032,0.403973,25.097563,6.154079,9.255289,...,0.751576,1.280037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SP64546,28.045150,0.026696,51.549932,1.989817,1.556905,1.276369,9.101945,15.649566,6.412556,6.883272,...,0.087614,0.232576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Remove leading/trailing whitespace from the sample barcodes used as the row index
stripped_barcodes = expr.index.str.strip()
expr.index = stripped_barcodes

In [None]:
# Attach the TP53 annotation columns to each expression row, matching on the
# shared Tumor_Sample_Barcode index; rows without a match get NaN annotations
merged_data = expr.join(tp53, how='left')

In [None]:
# Show the top five rows of the joined table
merged_data.head(5)

Unnamed: 0_level_0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,RP13-297E16.4,LL0YNC03-29C1.1,RP13-297E16.5,BX649553.1,BX649553.3,BX649553.4,RN7SL355P,MIR3690,Variant_Type,Variant_Classification
Tumor_Sample_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SP89389,65.515874,0.046881,63.315522,2.222857,2.565836,2.167233,1.265078,34.199996,4.974365,29.891291,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SNP,Nonsense_Mutation
SP21193,93.507754,0.655002,23.608843,1.933762,1.861716,3.873639,3.268175,23.050032,4.019948,8.379595,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SNP,Missense_Mutation
SP13206,15.155525,0.000000,42.136192,2.318698,2.870440,4.286835,9.066806,32.926751,3.909932,9.277749,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
SP103623,9.751670,0.188735,14.507132,2.342167,0.454312,13.098299,26.108567,25.074303,0.682193,8.342081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
SP32742,16.048850,0.014542,45.302557,4.373694,4.125029,4.316576,16.914069,15.602162,8.076706,10.929168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SP16269,0.007476,0.000000,23.290497,6.466530,6.384945,235.016264,0.013622,5.548667,13.125604,17.631857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
SP122676,9.736614,0.062405,31.444337,1.072155,1.207673,5.166039,5.904768,61.047477,2.502007,12.565596,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SNP,Nonsense_Mutation
SP88776,29.283034,0.000000,23.118048,3.699246,0.539174,0.450032,0.403973,25.097563,6.154079,9.255289,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
SP64546,28.045150,0.026696,51.549932,1.989817,1.556905,1.276369,9.101945,15.649566,6.412556,6.883272,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SNP,Missense_Mutation


In [None]:
# Report (rows, columns) of the joined table
merged_shape = merged_data.shape
print(f"Data shape:{merged_shape}")

Data shape:(1222, 55923)


In [None]:
# Rows with no TP53 annotation have NaN in Variant_Classification after the join;
# label them explicitly as 'No_Mutation'
merged_data["Variant_Classification"] = merged_data["Variant_Classification"].fillna("No_Mutation")

In [None]:
# Tally how many rows fall into each Variant_Classification category
merged_data["Variant_Classification"].value_counts()

Variant_Classification
No_Mutation          803
Missense_Mutation    264
Nonsense_Mutation     50
Frame_Shift_Del       40
Splice_Site           33
Frame_Shift_Ins       15
In_Frame_Del           8
In_Frame_Ins           6
Splice_Region          2
Silent                 1
Name: count, dtype: int64

In [None]:
# Work on an independent (deep) copy so the encoding steps below leave merged_data untouched
merged_df = merged_data.copy(deep=True)

In [None]:
# Both labels are derived from the un-encoded Variant_Type column of `merged_data`
# (of which `merged_df` is a copy), so re-running this cell gives the same result
# instead of re-encoding already-encoded integers.
variant_type_raw = merged_data['Variant_Type']

# --- Encode mutation type (multiclass target) ---
# NaN (no TP53 annotation) is cast to the string 'nan' and encoded as its own class.
le = LabelEncoder()
merged_df['Variant_Type'] = le.fit_transform(variant_type_raw.astype(str))

# Save class label mapping.
# Read it from the fitted encoder rather than hard-coding it: LabelEncoder numbers
# the classes in sorted order, so a hand-written dict can silently disagree with
# the integer codes actually stored in the column.
class_names = {code: str(label) for code, label in enumerate(le.classes_)}
print("Class labels:", class_names)

# --- Binary label: is TP53 mutated or not ---
# 0 if Variant_Type was NaN, else 1. Computed from the raw column so it does not
# depend on the encoder containing a 'nan' class (le.transform(['nan']) raises
# when every sample has a mutation).
merged_df['Mutated'] = np.where(variant_type_raw.isna(), 0, 1)

# Clean up column names by removing text in parentheses and stripping whitespace
merged_df.columns = merged_df.columns.str.replace(r'\(.*?\)', '', regex=True).str.strip()

Class labels: {0: 'DEL', 1: 'DNP', 2: 'INS', 3: 'SNP', 4: 'nan'}


In [None]:
# Save the processed merged dataframe to CSV (sample barcodes are written as the
# first column). The output directory is created first because to_csv does not
# create missing parent directories.
output_file = Path("../data/processed/tcga/merged_tcga.csv")
output_file.parent.mkdir(parents=True, exist_ok=True)
merged_df.to_csv(output_file, index=True)