In [5]:
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [6]:
## Create and Set Directory
# Creates the output directory (no error if it already exists) and makes it the
# process-wide working directory, so relative paths used after this cell
# resolve against run_dir.

run_dir = '../Split_4CV_0_outputs_test'
os.makedirs(run_dir, exist_ok=True)
os.chdir(run_dir)

In [4]:
## Loading dataset
# The path is relative, so it is resolved against the current working directory
# (changed by os.chdir in the cell above when the notebook is run top to bottom).
# The first CSV column becomes the row index.
# NOTE(review): the saved output of this cell is a FileNotFoundError for this
# path - confirm where the file lives relative to the working directory.

df = pd.read_csv("../data_final_modelInn_oldnot_filtered_processed.csv",index_col=0)

FileNotFoundError: [Errno 2] No such file or directory: '../data_final_modelInn_oldnot_filtered_processed.csv'

In [None]:
## Defining metadata
# df_meta keeps only the descriptive columns plus the response column.

metadata_col = ['Model','Tumor','Treatment']
response_col = ['BestResCatCalc']
df_meta = df.loc[:, [*metadata_col, *response_col]]
df_meta.shape

In [None]:
## Custom functions

def column_counter(df):
    """Print how many columns of ``df`` fall into each feature-name category.

    Every column name is checked against the markers below in a fixed order and
    is counted under the first marker it contains as a substring. Names that
    contain none of the markers are printed, one per line, as they are met.
    Finally the six totals are printed on a single line, space-separated, in
    marker order.

    Parameters
    ----------
    df : object exposing ``.columns`` (e.g. a pandas DataFrame)

    Returns
    -------
    None
    """
    markers = ("_cna_", "_snv_", "_gex_", "_cnum_", "Tr_", "Tumor_")
    tally = dict.fromkeys(markers, 0)
    for name in df.columns:
        # First marker found wins, mirroring a break-on-first-match scan.
        hit = next((marker for marker in markers if marker in name), None)
        if hit is None:
            print(name)
        else:
            tally[hit] += 1
    print(*tally.values())
    return None

# Preprocessing dataset

In [None]:
## Cleaning data of non-feature columns
# Everything except the metadata and response columns is treated as a feature.

non_feature_cols = [*metadata_col, *response_col]
df_train = df.drop(columns=non_feature_cols)
print(df.shape, df_train.shape)

In [None]:
column_counter(df_train)

# Presumably the counts printed by a previous run, in keyword order
# (_cna_, _snv_, _gex_, _cnum_, Tr_, Tumor_) - TODO confirm:
#14220 + 13366 + 21107 +23852 +19 +5
# (these six numbers add up to 72569)

In [None]:
## Cleaning data of cols with zero std

# Select columns with a boolean mask instead of np.where: np.where returns a
# tuple of position arrays, and going positions -> labels -> columns re-selects
# by name, which misbehaves if column names are ever duplicated.
# Columns whose std is NaN compare as "!= 0" and are therefore kept, as before.
nonzero_std = df_train.std() != 0
df_train = df_train.loc[:, nonzero_std]
print(df_train.shape)
#72569 cols - hence no col dropped

# PCA Preprocessing

In [None]:
## Cleaning data of non-molecular features
# Columns prefixed 'Tr_' or 'Tumor_' are excluded from the PCA input.

treatment_cols = [name for name in df_train.columns if name.startswith('Tr_')]
tumor_cols = [name for name in df_train.columns if name.startswith('Tumor_')]

df_train_pca = df_train.drop(columns=[*treatment_cols, *tumor_cols])
print(df_train_pca.shape,df_train_pca.dtypes.unique())
#72545 cols

In [None]:
## StandardScaling and Fitting
# The scaler is fitted on the PCA input frame (all rows of df_train_pca).

scaler = StandardScaler().fit(df_train_pca)
scaler

In [None]:
## Scaler Transformation
# Applies the fitted scaler to the same frame it was fitted on.
# NOTE(review): presumably a NumPy array (scikit-learn's default output type),
# not a DataFrame - confirm no set_output configuration is active.

df_train_pca_scaled = scaler.transform(df_train_pca)

In [None]:
## PCA initialization and fitting

n_components = 200
# Fix the seed so the fit is reproducible: random_state is used by the
# 'arpack' and 'randomized' solvers, and the default svd_solver='auto' may pick
# the randomized solver for large inputs. 100 matches the seed used for the
# split shuffle further down.
pca = PCA(n_components=n_components, random_state=100)
pca.fit(df_train_pca_scaled)

# Split Data into 4 Sets (each Tumor-Treatment group is divided across the sets)

In [None]:
# Group the metadata rows by (Tumor, Treatment) pair; the cells below iterate
# over these groups to build the splits.
df_meta_grouped = df_meta.groupby(['Tumor', 'Treatment'])

In [None]:
# List the (Tumor, Treatment) keys of all groups.
{pair: frame for pair, frame in df_meta_grouped}.keys()

In [None]:
# One empty list per split part, keyed 0-3; each list collects index chunks
# (one per group) appended by the split loop.
split_idx  = {part: [] for part in range(4)}

In [None]:
# Inspect one example group. get_group fetches it directly instead of
# materialising every group into a dict just to read a single entry.
df_meta_grouped.get_group(('BRCA', 'BGJ398'))

In [None]:
# Start from empty buckets so that re-running this cell cannot append every
# index a second time (which would duplicate rows in the exported splits).
split_idx = {part: [] for part in range(4)}

for _, group in df_meta_grouped:

    # Shuffle the rows of this (Tumor, Treatment) group; the fixed seed keeps
    # the assignment reproducible.
    shuffled_group = group.sample(frac=1, random_state=100)
    shuffled_index = shuffled_group.index

    # Split row positions rather than the DataFrame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes. Chunk sizes are
    # unchanged (the first len % 4 parts receive one extra row).
    # NOTE(review): because the extra rows always go to the lowest-numbered
    # parts, part 0 ends up largest overall - confirm this is acceptable.
    position_chunks = np.array_split(np.arange(len(shuffled_index)), len(split_idx))

    for part, positions in enumerate(position_chunks):
        split_idx[part].append(shuffled_index[positions])

In [None]:
# Columns prefixed 'Tr_' / 'Tumor_' are the same for every part (row selection
# does not change the columns), so identify them once outside the loop.
treatment_cols = [col for col in df_train.columns if col.startswith('Tr_')]
tumor_cols = [col for col in df_train.columns if col.startswith('Tumor_')]

# Write one set of CSV files per split part into the current working directory.
# NOTE(review): scaler and pca were fitted on all rows in earlier cells, before
# the split, so each part's scaled/PCA features depend on the other parts -
# confirm this is intended for the downstream cross-validation.
for part in range(4):

    # Row labels of this part: the per-group index chunks joined into one Index.
    meta_indices = pd.Index(np.concatenate(split_idx[part]))
    train_indices = meta_indices  # same labels; name kept for later cells

    # df_meta
    X_meta = df_meta.loc[meta_indices]
    X_meta.to_csv(f'df_meta_{part+1}.csv', index=True)

    #y_values
    df_y = X_meta[['BestResCatCalc']]
    df_y.to_csv(f'df_y_{part+1}.csv', index=True)

    # df_train
    X_train = df_train.loc[train_indices]
    X_train.to_csv(f'df_train_{part+1}.csv', index=True)

    # df_train_pca_preprocessing
    X_train_pca_in = X_train.drop(treatment_cols + tumor_cols, axis=1)
    X_train_pca_in.to_csv(f'df_train_{part+1}_pca_in.csv', index=True)

    # df_train_ss_transform
    scaled_values = scaler.transform(X_train_pca_in)
    X_train_sc = pd.DataFrame(scaled_values, columns=X_train_pca_in.columns, index=train_indices)
    X_train_sc.to_csv(f'df_train_{part+1}_pca_in_scaled.csv',index=True)

    # df_train_pca_transform
    # pca was fitted on the plain array returned by scaler.transform, so feed it
    # the array rather than the labelled DataFrame; this avoids scikit-learn's
    # feature-name mismatch warning. The values are identical.
    X_train_pca_out = pca.transform(scaled_values)
    pc_names = [f'PC {i+1}' for i in range(X_train_pca_out.shape[1])]
    X_train_pca_out_df = pd.DataFrame(X_train_pca_out, columns=pc_names, index=train_indices)
    # The file name hard-codes "200" regardless of n_components.
    X_train_pca_out_df.to_csv(f'df_pca200_{part+1}.csv',index=True)

    # df_train_tutr_features
    X_train_tutr = X_train[treatment_cols + tumor_cols]
    X_train_tutr.to_csv(f'df_tutr_{part+1}.csv',index=True)

    print(X_meta.shape, X_train.shape)

In [None]:
# Total number of rows across the four parts.
# NOTE(review): the per-part counts are presumably copied from the shapes
# printed by the export loop - confirm.
sum((504, 493, 475, 466))

In [None]:
# Save split indices
# One CSV per part with a single 'Index' column listing that part's row labels.

for part in range(4):
    combined_indices = np.concatenate(split_idx[part])
    indices_df = pd.Series(combined_indices, name='Index').to_frame()
    indices_df.to_csv(f'split_indices_{part+1}.csv', index=False)