In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [17]:
# Read the two CSV files into dataframes
expression = pd.read_csv('../data/raw/OmicsExpressionProteinCodingGenesTPMLogp1.csv', low_memory=False)
mutation = pd.read_csv('../data/raw/OmicsSomaticMutations.csv', low_memory=False)

In [18]:
# Filter for TP53 mutations only
mutation = mutation[mutation['HugoSymbol'] == 'TP53']
# Keep only ModelID and VariantType columns
mutation_subset = mutation[['ModelID', 'VariantType']]

# Rename first column to 'ModelID' and remove duplicates in expression data
expression = expression.rename(columns={expression.columns[0]: 'ModelID'})
expression.drop_duplicates(inplace=True)

# Merge expression and mutation data on ModelID
merged_df = pd.merge(expression, mutation_subset, on='ModelID', how='left')

# Encode VariantType as multiclass target
le = LabelEncoder()
merged_df['VariantType'] = le.fit_transform(merged_df['VariantType'].astype(str))  # NaN will be treated as a separate class

# Save mapping of class labels
class_names =  {0: 'nan', 1: 'SNV', 2: 'deletion', 3: 'insertion', 4: 'substitution'}
print("Class labels:", class_names)

# Create binary label: 1 if TP53 mutated, 0 otherwise
merged_df['Mutated'] = np.where(merged_df['VariantType'] == le.transform(['nan'])[0], 0, 1)
merged_df['VariantLabel'] = merged_df['VariantType'].apply(lambda x: class_names.get(x))

# Clean column names: remove text in parentheses and strip whitespace
merged_df.columns = merged_df.columns.str.replace(r'\(.*?\)', '', regex=True).str.strip()

Class labels: {0: 'nan', 1: 'SNV', 2: 'deletion', 3: 'insertion', 4: 'substitution'}


In [19]:
merged_df.to_csv("../data/processed/ccle/merged_data.csv")