In [1]:
from torch import nn
import torch
import anndata as ad
import pandas as pd
import scanpy as sc
import numpy as np

In [2]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import joblib
import pickle

def l1_loss(y_true, y_pred):
    return np.sum(np.abs(y_true-y_pred))

# Extract gene embedding from pretrained scBERT

In [3]:
class Gene2VecPositionalEmbedding(nn.Module):
    def __init__(self, dim, max_seq_len):
        super().__init__()
        gene2vec_weight = np.load('/Users/samuelchazy/ML_files_X/Applied_Projects/Competitions/Top_coders/Cancer_Immunotherapy/Winning_submissions/challenge_1_template/code/data/gene2vec_16906.npy')
        gene2vec_weight = np.concatenate((gene2vec_weight, np.zeros((1, gene2vec_weight.shape[1]))), axis=0)
        gene2vec_weight = torch.from_numpy(gene2vec_weight)
        self.emb = nn.Embedding.from_pretrained(gene2vec_weight)

    def forward(self, x):
        t = torch.arange(x.shape[1])
        return self.emb(t)

In [11]:
temp = np.load('/Users/samuelchazy/ML_files_X/Applied_Projects/Competitions/Top_coders/Cancer_Immunotherapy/Winning_submissions/challenge_1_template/code/data/gene2vec_16906.npy')
temp

array([[ 0.09901641,  0.0662446 , -0.03479731, ...,  0.19355905,
        -0.17810692, -0.23648332],
       [-0.0231158 , -0.27804175,  0.00556953, ..., -0.06118472,
        -0.38697961, -0.29563326],
       [ 0.0390633 ,  0.17098807,  0.21242471, ...,  0.20167874,
        -0.25721824, -0.11382486],
       ...,
       [-0.17179887, -0.24742457, -0.07134991, ...,  0.24341892,
         0.30512366,  0.10348674],
       [-0.18267053,  0.01702181, -0.0729536 , ...,  0.36628523,
         0.27509084, -0.11343211],
       [-0.15330462,  0.01417207,  0.01044654, ..., -0.07525061,
        -0.17254964, -0.15762754]])

In [4]:
dim = 200
SEQ_LEN = 16906 + 1 # gene_num + 1
max_seq_len = SEQ_LEN
pos_emb = Gene2VecPositionalEmbedding(dim, max_seq_len)


data = sc.read_h5ad("/Users/samuelchazy/ML_files_X/Applied_Projects/Competitions/Top_coders/Cancer_Immunotherapy/Winning_submissions/challenge_1_template/code/data/panglao_10000.h5ad")
emb = pos_emb(data.X)
genes = data.var_names.tolist()

  utils.warn_names_duplicates("obs")


# prepare training set

In [5]:
import os
os.getcwd()
os.chdir('/Users/samuelchazy/ML_files_X/Applied_Projects/Competitions/Top_coders/Cancer_Immunotherapy/data')

In [6]:
adata = sc.read_h5ad('./sc_training.h5ad')
df = adata.obs[['condition', 'state']]
prop = df.groupby(by=['condition', 'state'],as_index=False).size()
total = df.groupby(by='condition',as_index=False).size()

prop = prop.merge(total, on='condition', how='left')
prop['prop'] = prop.size_x /prop.size_y
prop = prop.pivot_table(index='condition', columns='state', values='prop')

prop = prop[['progenitor', 'effector', 'other', 'terminal exhausted', 'cycling']]
prop

state,progenitor,effector,other,terminal exhausted,cycling
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Arid4b,0.056604,0.117925,0.011006,0.388365,0.426101
Arid5b,0.011241,0.044546,0.017485,0.412989,0.513739
Atf2,0.120567,0.248227,0.028369,0.304965,0.297872
Batf,0.666667,0.000000,0.166667,0.166667,0.000000
Crem,0.034166,0.425756,0.017083,0.214192,0.308804
...,...,...,...,...,...
Tpt1,0.440000,0.160000,0.000000,0.120000,0.280000
Unperturbed,0.067497,0.209723,0.017276,0.313379,0.392125
Yy1,0.600000,0.200000,0.000000,0.080000,0.120000
Zeb2,0.017483,0.115385,0.145688,0.282051,0.439394


In [7]:
conditions = list(set(adata.obs['condition'].tolist()))
conditions = [g for g in conditions if g != 'Unperturbed']
conditions = [g for g in conditions if g.upper() in genes]
conditions

['Foxm1',
 'Tox',
 'Arid4b',
 'Stat4',
 'Egr1',
 'Nr3c1',
 'Irf9',
 'Batf',
 'Id3',
 'Dvl2',
 'Rps6',
 'Dvl1',
 'Tcf3',
 'Rad21',
 'Runx2',
 'Tox2',
 'Fzd1',
 'Il12rb2',
 'Rela',
 'Hmgb2',
 'Foxo1',
 'Elf1',
 'Id2',
 'Yy1',
 'Satb1',
 'Dvl3',
 'Fzd6',
 'Il12rb1',
 'P2rx7',
 'Crem',
 'Dkk3',
 'Sp140',
 'Tcf7',
 'Tpt1',
 'Oxnad1',
 'Ep300',
 'Sox4',
 'Runx3',
 'Atf2',
 'Hmgb1',
 'Eomes',
 'Tbx21',
 'Zeb2',
 'Nr4a3',
 'Litaf',
 'Lef1',
 'Ctnnb1',
 'Eef2',
 'Sub1',
 'Arid5b',
 'Hif1a',
 'Sp100',
 'Gsk3b',
 'Ldhb',
 'Foxp1',
 'Lrp1',
 'Ikzf3',
 'Klf2',
 'Irf2',
 'Prdm1',
 'Ezh2',
 'Nr4a2',
 'Myb',
 'Nr4a1',
 'Fzd3']

In [8]:
X, Y = [], []

state_wt = np.array(prop.loc['Unperturbed'].values)

for g in conditions:
    x = np.array(emb[list(genes).index(g.upper())])
    y = np.array(prop.loc[g].values) #- state_wt
    X.append(x)
    Y.append(y)
    
X_scaler = StandardScaler()
X_scaler.fit(emb)
X = X_scaler.transform(X)

with open(f'/Users/samuelchazy/ML_files_X/Applied_Projects/Competitions/Top_coders/Cancer_Immunotherapy/Winning_submissions/challenge_1_template/code/data/X_scaler.pkl', 'wb') as f:
    pickle.dump(X_scaler, f)

X

array([[ 0.80876199,  0.49971498, -0.87779106, ..., -1.62549264,
        -0.03624708, -0.01300541],
       [-0.52122495, -0.58473698,  0.80769873, ..., -0.14130064,
        -0.45418966,  0.35438214],
       [ 0.95676034,  0.77947251,  0.02561703, ...,  1.53579194,
        -2.08241023,  0.01165814],
       ...,
       [-0.40003167,  0.909072  , -0.56395749, ...,  1.05832384,
         1.36434388, -1.20612593],
       [ 0.26626613, -1.19976797, -0.47012216, ..., -0.09801398,
        -0.18148835,  1.72442075],
       [ 0.73718578, -0.52320185, -0.88283549, ...,  1.70894623,
         0.01684131, -0.71974629]])

In [10]:
testset = ['Aqr', 'Bach2', 'Bhlhe40', 'Ets1', 'Fosb', 'Mafk', 'Stat3']
testset = [np.array(emb[list(genes).index(g.upper())]) for g in testset]
testset = X_scaler.transform(testset)
np.save("/Users/samuelchazy/ML_files_X/Applied_Projects/Competitions/Top_coders/Cancer_Immunotherapy/Winning_submissions/challenge_1_template/code/data/testset_gene_embed", np.array(testset))
testset

array([[ 0.4369856 ,  1.83163887,  1.30296681, ...,  1.29734927,
        -0.0239695 , -1.84059162],
       [ 0.05053682,  0.66980004, -0.2686294 , ...,  1.91119476,
         1.10372599,  0.50888215],
       [ 0.3255075 , -1.2597175 ,  0.00940757, ..., -0.26657041,
        -1.48304829,  1.80351912],
       ...,
       [ 2.06476318, -0.27354803,  1.96149128, ...,  1.87026806,
        -1.49645831,  0.08588371],
       [ 0.68527231, -1.20379274, -0.30885089, ...,  0.40130035,
        -0.09999667,  0.19613166],
       [-0.92363967, -0.55469237,  1.01179548, ...,  0.41534762,
        -0.77904279, -0.21411693]])

# Train and select regression model mapping gene embedding to 5-states

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.05)
X_train

array([[ 0.58005273, -0.68231329,  0.64224908, ...,  0.71163614,
         0.9745569 ,  0.08399899],
       [-0.2547744 , -0.45924609, -1.11227522, ..., -0.36873722,
        -0.2347909 , -0.04176791],
       [-0.62405007, -0.38217799,  0.14526796, ..., -0.66262246,
        -0.71583147,  0.33539301],
       ...,
       [ 1.21894386,  0.78942186,  1.6352367 , ...,  1.03179946,
        -0.42922112, -0.87048511],
       [-2.44141518, -0.17746259, -1.19192255, ..., -0.52812374,
        -0.30816534,  0.53162737],
       [-0.16422783,  0.73485964, -0.6723655 , ..., -0.44048724,
        -0.17341462,  1.55630008]])

In [26]:
num_pca_components = 40
pca = PCA(num_pca_components)
Xs_train_pca = pca.fit_transform(X_train)

In [27]:
knn_grid = [
    {
        'model': [KNeighborsRegressor()],
        'model__n_neighbors': [3, 5, 10],
        'model__weights': ['uniform', 'distance'],
        'model__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'model__leaf_size' : [15, 30],
        'model__p' : [1, 2],
    }
    ]

rfr_grid = [
    {
        'model': [RandomForestRegressor()],
        'model__n_estimators' : [20],
#         'model__criterion' : ['absolute_error'],
        'model__max_features': ['sqrt', 'log2'],
        'model__min_samples_split' : [5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    }
]

In [34]:
cls_list = [KNeighborsRegressor, 
#             SVR, 
            RandomForestRegressor
           ]
param_grid_list = [knn_grid, 
#                    svm_grid, 
                   rfr_grid
                  ]

In [39]:
pipe = Pipeline([('pca',PCA(num_pca_components)),('model','passthrough')])

result_list = []
grid_list = []
for cls_name, param_grid in zip(cls_list, param_grid_list):
    print(cls_name)
    grid = GridSearchCV(
        estimator = pipe,
        param_grid = param_grid,
        scoring = 'neg_mean_absolute_error',
        verbose = 1,
        cv = 15, 
        n_jobs = -1 # use all available cores
    )
    grid.fit(X_train, Y_train)
    result_list.append(pd.DataFrame.from_dict(grid.cv_results_))
    grid_list.append(grid)

<class 'sklearn.neighbors._regression.KNeighborsRegressor'>
Fitting 15 folds for each of 72 candidates, totalling 1080 fits
<class 'sklearn.ensemble._forest.RandomForestRegressor'>
Fitting 15 folds for each of 12 candidates, totalling 180 fits


In [40]:
for grid in grid_list:
    print(grid.best_estimator_.get_params()["steps"][1][1]) # get the model details from the estimator
    preds = grid.predict(X_test)
    
    print([round(l1_loss(t,p),3) for t,p in zip(Y_test, preds)])
    
    print('\n', '-' * 80, '\n')

KNeighborsRegressor(algorithm='ball_tree', leaf_size=15, n_neighbors=10,
                    weights='distance')
[0.466, 0.54, 0.432, 0.507]

 -------------------------------------------------------------------------------- 

RandomForestRegressor(max_features='sqrt', min_samples_leaf=4,
                      min_samples_split=10, n_estimators=20)
[0.291, 0.513, 0.529, 0.525]

 -------------------------------------------------------------------------------- 



In [42]:
#save the best model (RandomForestRegressor)

best_model = grid.best_estimator_
joblib.dump(best_model, '/Users/samuelchazy/ML_files_X/Applied_Projects/Competitions/Top_coders/Cancer_Immunotherapy/Winning_submissions/challenge_1_template/code/data/best_model.pkl')

['/Users/samuelchazy/ML_files_X/Applied_Projects/Competitions/Top_coders/Cancer_Immunotherapy/Winning_submissions/challenge_1_template/code/data/best_model.pkl']