In [None]:
import pandas as pd
import numpy as np
import os
#import what is needed for pca
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve

In [None]:
# Read the somatic-mutation table and keep only the rows whose 'HugoSymbol' is 'TP53'.
mutations = (
    pd.read_csv('data/OmicsSomaticMutations.csv')
    .loc[lambda calls: calls['HugoSymbol'] == 'TP53']
)

In [None]:
# Load 'data/Model.csv' and the expression table, in that order.
# NOTE: the name `model` is rebound to a LogisticRegression classifier further down.
model, expression = map(
    pd.read_csv,
    ('data/Model.csv', 'data/OmicsExpressionProteinCodingGenesTPMLogp1.csv'),
)

In [None]:
# Name the first column of `expression` 'ModelID' so it can serve as the merge key.
# Reassigning (rather than inplace=True) keeps the cell safe to re-run.
expression = expression.rename(columns={expression.columns[0]: 'ModelID'})

In [None]:
# Attach the TP53 marker ('HugoSymbol') to every expression row via a left join on 'ModelID'.
# The right-hand side is de-duplicated on 'ModelID' first: if a model has more than one
# TP53 row in `mutations`, an un-deduplicated left join would repeat that model's
# expression row once per mutation row, and the repeated samples could later end up in
# both the train and the test split.
# validate='many_to_one' makes pandas raise if the right-hand keys are ever not unique.
merged = pd.merge(
    expression,
    mutations[['ModelID', 'HugoSymbol']].drop_duplicates(subset='ModelID'),
    on='ModelID',
    how='left',
    validate='many_to_one',
)

In [None]:
# Binary label: 1 when 'HugoSymbol' is populated (the left merge found a TP53 row),
# 0 when it is null (no TP53 row matched this ModelID).
merged['Mutated'] = merged['HugoSymbol'].notna().astype(int)

In [None]:
# Class balance: number of rows for each value of the 'Mutated' label.
merged.loc[:, 'Mutated'].value_counts()

In [None]:
# PCA variant 1: standardize the features first, then project onto 128 components.
#
# NOTE(review): the scaler and the PCA are fitted on every row of `merged`, i.e. before
# the train/test split performed in the modelling cell, so held-out rows influence the
# components. Consider fitting both on the training rows only.

# Feature matrix: everything except the identifier and the two label-related columns.
pca_input = merged.drop(columns=['ModelID', 'HugoSymbol', 'Mutated'])

# Standardize each feature to zero mean / unit variance so that high-variance
# columns do not dominate the principal components.
scaler = StandardScaler()
pca_input = scaler.fit_transform(pca_input)

# random_state pins the result: with svd_solver='auto', scikit-learn may pick the
# randomized solver for large inputs, which is otherwise different on every run.
pca = PCA(n_components=128, random_state=42)
# Fit and project in one pass instead of a separate fit() and transform().
pca_result = pca.fit_transform(pca_input)

# Wrap the component scores in a DataFrame with columns PC1..PCn.
pca_result = pd.DataFrame(
    data=pca_result,
    columns=[f'PC{i+1}' for i in range(pca_result.shape[1])],
)
# Re-attach label and identifier positionally (.values ignores the index);
# row order is the same as in `merged`.
pca_result['Mutated'] = merged['Mutated'].values
pca_result['ModelID'] = merged['ModelID'].values

In [None]:
# Predict the 'Mutated' label from the PCA components with logistic regression.

# Features are the PC columns; the label and the identifier are excluded.
X = pca_result.drop(columns=['Mutated', 'ModelID'])
y = pca_result['Mutated']

# Hold out 20% of the rows for evaluation.
# random_state makes the split (and therefore every metric below) reproducible;
# stratify=y keeps the mutated / non-mutated ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# NOTE: this rebinds `model`, which earlier held the DataFrame read from 'data/Model.csv'.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)
# Predicted probability of the second class in model.classes_
# (label 1 when both labels are present in the training split).
y_pred_proba = model.predict_proba(X_test)[:, 1]


In [None]:
# Evaluate the classifier on the held-out split.
cm = confusion_matrix(y_test, y_pred)              # counts of true vs. predicted labels
report = classification_report(y_test, y_pred)     # per-class text summary
roc_auc = roc_auc_score(y_test, y_pred_proba)      # uses probabilities, not hard labels

# Print each heading followed by its metric, one item per line.
print(
    "Confusion Matrix:",
    cm,
    "Classification Report:",
    report,
    "ROC AUC Score:",
    roc_auc,
    sep="\n",
)


In [None]:
# PCA variant 2: run PCA on the unscaled features, then standardize the component scores.
#
# NOTE(review): the PCA and the scaler are fitted on every row of `merged`, i.e. before
# the train/test split performed in the modelling cell, so held-out rows influence the
# components. Consider fitting both on the training rows only.

# Feature matrix: everything except the identifier and the two label-related columns.
pca_input = merged.drop(columns=['ModelID', 'HugoSymbol', 'Mutated'])

# random_state pins the result: with svd_solver='auto', scikit-learn may pick the
# randomized solver for large inputs, which is otherwise different on every run.
pca = PCA(n_components=128, random_state=42)
# Fit and project in one pass instead of a separate fit() and transform().
pca_result = pca.fit_transform(pca_input)

# Standardize each component to zero mean / unit variance.
scaler = StandardScaler()
pca_result = scaler.fit_transform(pca_result)

# Wrap the scaled component scores in a DataFrame with columns PC1..PCn.
pca_result = pd.DataFrame(
    data=pca_result,
    columns=[f'PC{i+1}' for i in range(pca_result.shape[1])],
)
# Re-attach label and identifier positionally (.values ignores the index);
# row order is the same as in `merged`.
pca_result['Mutated'] = merged['Mutated'].values
pca_result['ModelID'] = merged['ModelID'].values

In [None]:
# Predict the 'Mutated' label from the standardized PCA components with logistic regression.

# Features are the PC columns; the label and the identifier are excluded.
X = pca_result.drop(columns=['Mutated', 'ModelID'])
y = pca_result['Mutated']

# Hold out 20% of the rows for evaluation.
# random_state makes the split (and therefore every metric below) reproducible;
# stratify=y keeps the mutated / non-mutated ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# NOTE: `model` is also the name used for the DataFrame read from 'data/Model.csv'.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)
# Predicted probability of the second class in model.classes_
# (label 1 when both labels are present in the training split).
y_pred_proba = model.predict_proba(X_test)[:, 1]


In [None]:
# Evaluate the classifier on the held-out split.
cm = confusion_matrix(y_test, y_pred)              # counts of true vs. predicted labels
report = classification_report(y_test, y_pred)     # per-class text summary
roc_auc = roc_auc_score(y_test, y_pred_proba)      # uses probabilities, not hard labels

# Print each heading followed by its metric, one item per line.
print(
    "Confusion Matrix:",
    cm,
    "Classification Report:",
    report,
    "ROC AUC Score:",
    roc_auc,
    sep="\n",
)
