In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [4]:

DATA_DIR = "../data/ccle/ccle_broad_2019"
EXPR_FILE = f"{DATA_DIR}/data_mrna_seq_rpkm.txt"
META_EXPR_FILE = f"{DATA_DIR}/meta_mrna_seq_rpkm.txt"
META_STUDY_FILE = f"{DATA_DIR}/meta_study.txt"
MUTATIONS_FILE=f"{DATA_DIR}/data_mutations.txt"


In [5]:
# # 3. Load Data
# Read the expression matrix and metadata into pandas DataFrames.

# Expression data (genes x cell lines)
expr = pd.read_csv(EXPR_FILE, sep='\t', index_col=0)
# Metadata for expression samples
meta_expr = pd.read_csv(META_EXPR_FILE, sep='\t', index_col=0)
# Study metadata (cell line annotations)
meta_study = pd.read_csv(META_STUDY_FILE, sep='\t', index_col=0)

#mutations file
mutations = pd.read_csv(MUTATIONS_FILE,  sep='\t', comment = '#' ,low_memory=False, index_col=0)


In [6]:
def select_mutation_columns(file_path):
    """
    Load mutations data and select only Hugo_Symbol, Tumor_Sample_Barcode, 
    Variant_Type, and Variant_Classification columns
    """
    # Read the full dataset
    df = pd.read_csv(file_path, sep='\t', comment='#', low_memory=False)
    
    # Select only the columns you need
    columns_needed = [
        'Hugo_Symbol',           # Gene symbol
        'Tumor_Sample_Barcode',  # Sample name
        'Variant_Type',          # Variant type (SNP, INS, DEL, etc.)
        'Variant_Classification' # Variant classification (Missense, Silent, etc.)
    ]
    
    # Create subset with only these columns
    df_subset = df[columns_needed].copy()
    
    return df_subset


In [7]:
expr = expr.transpose()
expr.index.name = 'Tumor_Sample_Barcode'
expr

Hugo_Symbol,DDX11L1,WASH7P,MIR1302-11,FAM138A,OR4G4P,OR4G11P,OR4F5,RP11-34P13.7,CICP27,AL627309.1,...,MT-ND3,MT-TR,MT-TH,MT-TS2,MT-TL2,MT-ND5,MT-ND6,MT-TE,MT-CYB,MT-TT
Tumor_Sample_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22RV1_PROSTATE,0.00000,12.63011,0.04289,0.00000,0.00000,0.00000,0.00000,0.44714,0.37909,6.17780,...,1176.81189,0.00000,0.00000,0.00000,0.12335,2393.80298,1825.52332,0.25386,4923.78223,0.00000
2313287_STOMACH,0.03755,10.14155,0.01037,0.00869,0.00000,0.00000,0.00000,0.34327,0.09168,6.56648,...,1009.34979,0.00000,0.00000,0.00000,0.00000,1960.96802,2056.64697,0.00000,3827.00537,0.00000
253JBV_URINARY_TRACT,0.00000,6.20657,0.16955,0.22423,0.02886,0.01939,0.00000,0.16982,0.10995,1.03405,...,2816.78296,0.00000,0.39614,0.00000,0.51331,1896.28870,2184.88647,0.26410,1905.18481,0.00000
253J_URINARY_TRACT,0.06507,6.55835,0.23364,0.16558,0.03875,0.03904,0.00000,0.08922,0.09146,0.76888,...,7116.30469,0.14115,0.26594,0.00000,1.03378,3522.37695,4568.00195,0.00000,4590.01416,0.00000
42MGBA_CENTRAL_NERVOUS_SYSTEM,0.00000,7.61752,0.08765,0.00000,0.00000,0.00000,0.00000,0.09428,0.07513,1.01231,...,1049.68384,0.13768,0.00000,0.00000,0.25209,1178.93579,1616.96484,0.12970,2343.92407,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UMUC9_URINARY_TRACT,0.03236,8.70551,0.07747,0.00000,0.00642,0.00000,0.01325,0.02958,0.10214,0.44584,...,2820.28882,0.09360,0.00000,0.00000,0.00000,1452.41895,996.53717,0.08817,3437.12769,0.00000
UPCISCC152_UPPER_AERODIGESTIVE_TRACT,0.00000,15.23348,0.07448,0.02674,0.00000,0.00000,0.00000,0.04402,0.08265,1.91472,...,995.56390,0.16713,0.00000,0.00000,0.30602,793.58710,593.61139,0.31489,1645.22168,0.00000
UW228_CENTRAL_NERVOUS_SYSTEM,0.03799,6.47732,0.01049,0.00879,0.00000,0.02279,0.00000,0.06945,0.01686,0.21741,...,441.25647,0.16482,0.00000,0.00000,0.00000,757.41357,472.01627,0.15526,973.56439,0.16232
Y79_AUTONOMIC_GANGLIA,0.02907,8.45568,0.39607,0.26449,0.00000,0.01163,0.01191,0.07971,0.09748,1.05557,...,2341.42505,0.50444,0.23760,0.27787,0.92361,1218.16992,729.65912,0.55439,1221.39954,0.08280


In [8]:
mutations

Unnamed: 0_level_0,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,Variant_Type,...,RNAseq_AC,SangerRecalibWES_AC,SangerWES_AC,TCGAhsCnt,WES_AC,WGS_AC,cDNA_Change,isCOSMIChotspot,isDeleterious,isTCGAhotspot
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DVL1,1855,,GRCh37,1,1277461,1277461,+,synonymous_variant,Silent,SNP,...,,,,0,87:39,,c.438G>A,False,False,False
AL590822.1,0,,GRCh37,1,2144416,2144416,+,missense_variant,Missense_Mutation,SNP,...,,,,0,23:43,,c.604C>T,False,False,False
PLCH2,9651,,GRCh37,1,2435359,2435359,+,3_prime_UTR_variant,3'UTR,SNP,...,,,,0,9:25,,,False,True,False
UBE4B,10277,,GRCh37,1,10177641,10177641,+,missense_variant,Missense_Mutation,SNP,...,,,,0,13:95,,c.934G>A,False,False,False
SRM,6723,,GRCh37,1,11119363,11119363,+,synonymous_variant,Silent,SNP,...,,,,0,113:57,,c.207C>A,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MLLT4,4301,,GRCh37,6,168366687,168366689,+,downstream_gene_variant,3'Flank,DEL,...,12:91,,,0,,,c.5198_5200delACA,False,False,False
MT-ND2,4536,,37,M,4604,4605,+,,Frame_Shift_Ins,INS,...,9:18,,,0,,,c.135_136insA,False,True,False
POLR3H,171568,,GRCh37,22,41940458,41940459,+,upstream_gene_variant,5'Flank,INS,...,5:3,,,0,,,,False,True,False
MT-ND5,4540,,37,M,12384,12385,+,,Frame_Shift_Ins,INS,...,12:69,,,0,,,c.48_49insC,False,True,False


In [9]:
# Display the column names to understand the structure
print("Column names:")
print(mutations.columns.tolist())

# Filter for TP53 mutations
# The gene symbol appears to be in the 'Hugo_Symbol' column based on the data
tp53_mutations = mutations.loc['TP53']
tp53_mutations


Column names:
['Entrez_Gene_Id', 'Center', 'NCBI_Build', 'Chromosome', 'Start_Position', 'End_Position', 'Strand', 'Consequence', 'Variant_Classification', 'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'dbSNP_RS', 'dbSNP_Val_Status', 'Tumor_Sample_Barcode', 'Matched_Norm_Sample_Barcode', 'Match_Norm_Seq_Allele1', 'Match_Norm_Seq_Allele2', 'Tumor_Validation_Allele1', 'Tumor_Validation_Allele2', 'Match_Norm_Validation_Allele1', 'Match_Norm_Validation_Allele2', 'Verification_Status', 'Validation_Status', 'Mutation_Status', 'Sequencing_Phase', 'Sequence_Source', 'Validation_Method', 'Score', 'BAM_File', 'Sequencer', 't_ref_count', 't_alt_count', 'n_ref_count', 'n_alt_count', 'HGVSc', 'HGVSp', 'HGVSp_Short', 'Transcript_ID', 'RefSeq', 'Protein_position', 'Codons', 'Hotspot', 'Broad_ID', 'COSMIChsCnt', 'Codon_Change', 'ExAC_AF', 'HC_AC', 'Protein_Change', 'RD_AC', 'RNAseq_AC', 'SangerRecalibWES_AC', 'SangerWES_AC', 'TCGAhsCnt', 'WES_AC', 'WGS_AC', 'cDNA_Change

Unnamed: 0_level_0,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,Variant_Type,...,RNAseq_AC,SangerRecalibWES_AC,SangerWES_AC,TCGAhsCnt,WES_AC,WGS_AC,cDNA_Change,isCOSMIChotspot,isDeleterious,isTCGAhotspot
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TP53,7157,,GRCh37,17,7576854,7576854,+,"missense_variant,splice_region_variant",Missense_Mutation,SNP,...,88:33,23:50,24:52,0,30:33,18:15,c.992A>G,True,True,False
TP53,7157,,GRCh37,17,7577120,7577120,+,missense_variant,Missense_Mutation,SNP,...,,30:1,30:1,181,205:0,,c.818G>A,True,False,True
TP53,7157,,GRCh37,17,7577536,7577536,+,missense_variant,Missense_Mutation,SNP,...,249:1,23:0,17:0,19,337:0,,c.745A>G,True,False,True
TP53,7157,,GRCh37,17,7578492,7578492,+,stop_gained,Nonsense_Mutation,SNP,...,35:2,,,9,57:2,,c.438G>A,True,True,True
TP53,7157,,GRCh37,17,7578400,7578400,+,missense_variant,Missense_Mutation,SNP,...,,23:19,23:21,7,235:260,,c.530C>T,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TP53,7157,,GRCh37,17,7578253,7578259,+,frameshift_variant,Frame_Shift_Del,DEL,...,22:6,,,9,,,c.590_596delTGGAAGG,True,True,True
TP53,7157,,GRCh37,17,7578259,7578260,+,frameshift_variant,Frame_Shift_Ins,INS,...,22:5,,,5,,,c.589_590insCC,True,True,True
TP53,7157,,GRCh37,17,7577150,7577159,+,"splice_acceptor_variant,coding_sequence_varian...",Splice_Site,DEL,...,,,,0,,,c.783_788delGTAGTGGTAA,True,True,False
TP53,7157,,GRCh37,17,7578479,7578480,+,frameshift_variant,Frame_Shift_Del,DEL,...,,,,22,,,c.450_451delAC,True,True,True


In [10]:
tp_53 = tp53_mutations[['Tumor_Sample_Barcode', 'Variant_Type', 'Variant_Classification']].copy().reset_index().drop(["Hugo_Symbol"], axis=1)

In [11]:
# First, let's prepare the tp_53 data by selecting unique sample information
# Since there might be multiple mutations per sample, we need to handle duplicates
tp53 = tp_53[['Tumor_Sample_Barcode', 'Variant_Type', 'Variant_Classification']].drop_duplicates().set_index("Tumor_Sample_Barcode")
tp53.index = tp53.index.str.strip()

In [12]:
expr.index = expr.index.str.strip()

In [27]:
merged_data = expr.join(tp53)

In [28]:
print(f"Data shape:{merged_data.shape}")

Data shape:(1257, 56320)


In [30]:
merged_data.Variant_Classification = merged_data.Variant_Classification.fillna("No_Mutation")

In [33]:
merged_data.Variant_Classification.value_counts()

Variant_Classification
Missense_Mutation    540
No_Mutation          402
Nonsense_Mutation    103
Splice_Site           74
Frame_Shift_Del       71
Frame_Shift_Ins       20
Silent                17
In_Frame_Del          15
Splice_Region         12
In_Frame_Ins           3
Name: count, dtype: int64

In [None]:
merged_data.is_mutated = merged_data