In [1]:
import os
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression

import random

import warnings
warnings.filterwarnings('ignore')

In [2]:
get_tables_path = lambda fname: os.path.normpath('../local_data/processed_data/'+ fname)

In [3]:
file_cptac_proteomics = get_tables_path('CPTAC_proteomics_linkedomics.parquet')
file_cptac_sample_info = get_tables_path('CPTAC_sample_info.parquet')

In [4]:
cptac_sample = pd.read_parquet(file_cptac_sample_info)[['Study']]
print("Dimensions: ", cptac_sample.shape)
cptac_sample[:2]

Dimensions:  (2021, 1)


Unnamed: 0,Study
C3L-00004,ccRCC
C3L-00010,ccRCC


In [5]:
cptac_sample.Study.value_counts()

Study
BrCa2016    825
OvCa2016    174
Pdac        145
BrCa2020    122
LSCC        113
HNSCC       111
OvCa2020    111
ccRCC       110
LUAD        110
GBM         105
UCEC         95
Name: count, dtype: int64

In [6]:
cptac_sample_design = pd.get_dummies(cptac_sample)
cptac_sample_design[:2]

Unnamed: 0,Study_BrCa2016,Study_BrCa2020,Study_GBM,Study_HNSCC,Study_LSCC,Study_LUAD,Study_OvCa2016,Study_OvCa2020,Study_Pdac,Study_UCEC,Study_ccRCC
C3L-00004,False,False,False,False,False,False,False,False,False,False,True
C3L-00010,False,False,False,False,False,False,False,False,False,False,True


In [7]:
cptac_proteomics = pd.read_parquet(file_cptac_proteomics)
print(cptac_proteomics.shape)
cptac_proteomics[:2]

(14792, 1227)


Unnamed: 0,01OV007,01OV017,01OV018,01OV023,01OV026,01OV029,01OV030,01OV039,01OV041,01OV047,...,X20BR002,X20BR005,X20BR006,X20BR007,X20BR008,X21BR001,X21BR002,X21BR010,X22BR005,X22BR006
A1BG,0.133634,-0.432786,-0.013943,1.234203,-0.482872,-0.956518,-0.102617,-1.318564,-0.343398,0.336833,...,1.8732,-0.4227,1.5862,-0.297,1.6767,-0.661,-1.3735,1.1583,0.4948,0.5049
A1CF,,,,,,,,,,,...,,,,,,,,,,


In [8]:
# -- Normalise pancan data-set
def rm_batch(x, y):    
    common_samples = np.intersect1d(y.dropna().index, x.dropna().index)
    if(len(common_samples) == 0):
        return np.nan
    ys = y[common_samples]
    xs = x.loc[common_samples]
    lm = LinearRegression().fit(xs, ys)
    return ys - xs.dot(lm.coef_) - lm.intercept_

In [9]:
cptac_proteomics_corrected = pd.DataFrame.from_dict({p: rm_batch(cptac_sample_design, 
                                                                cptac_proteomics.loc[p]) for p in cptac_proteomics.index}).T
cptac_proteomics_corrected[:2]

Unnamed: 0,01OV007,01OV017,01OV018,01OV023,01OV026,01OV029,01OV030,01OV039,01OV041,01OV047,...,X20BR002,X20BR005,X20BR006,X20BR007,X20BR008,X21BR001,X21BR002,X21BR010,X22BR005,X22BR006
A1BG,0.335938,-0.230469,0.1875,1.4375,-0.28125,-0.753906,0.101562,-1.117188,-0.140625,0.539062,...,1.171875,-1.125,0.882812,-1.0,0.972656,-1.363281,-2.078125,0.457031,-0.207031,-0.199219
A1CF,,,,,,,,,,,...,,,,,,,,,,


In [10]:
cptac_proteomics_corrected.to_parquet(get_tables_path('CPTAC_Proteomics_processed.parquet'))