In [43]:
#jupyter notebook --NotebookApp.max_buffer_size=35536870912 
# to run jupyter notebook using more available memory
import os, gc, pickle, scipy.sparse, time, h5py, anndata, hdf5plugin#, lightgbm
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
#from sklearn.svm import SVR takes way too long
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
#import scanpy as sc
import anndata
import hdf5plugin
from sys import getsizeof
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import joblib
import h5py
from scipy.sparse import csr_matrix
from math import sqrt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import seaborn as sns

data_path = "/home/skovtun/Single cell/Single_cell_data"
metadata = os.path.join(data_path,"metadata.csv")

train_cite_inputs = os.path.join(data_path,"train_cite_inputs.h5")
train_cite_targets = os.path.join(data_path,"train_cite_targets.h5")
test_cite_inputs = os.path.join(data_path,"test_cite_inputs.h5")

train_multi_inputs = os.path.join(data_path,"train_multi_inputs.h5")
train_multi_targets = os.path.join(data_path,"train_multi_targets.h5")
test_multi_inputs = os.path.join(data_path,"test_multi_inputs.h5")

sample_submission = os.path.join(data_path,"sample_submission.csv")
evaluation_ids = os.path.join(data_path,"evaluation_ids.csv")

barcode = os.path.join(data_path,"TotalSeq_B_Universal_Cocktail_v1_Antibodies_399904_Barcodes_BioLegendUpdate.xlsx")


In [4]:
# Getting indexes of cell_id for train and test samples I defined in Random_sample_cells_id.
import csv

cite_train_sample = []
cite_test_sample = []

with open('cite_train_sample.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        cite_train_sample.append(row)

with open('cite_test_sample.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        cite_test_sample.append(row)

cite_train_sample = [item for sublist in cite_train_sample for item in sublist]
cite_test_sample = [item for sublist in cite_test_sample for item in sublist]

In [5]:
X = pd.read_hdf(train_cite_inputs)
y = pd.read_hdf(train_cite_targets)

In [170]:
X.shape

(70988, 311)

In [7]:
y.shape

(70988, 140)

In [None]:
col_names = list(X.columns)

In [37]:
# Targets (proteins)
y_col_names = list(y.columns)
y_col_names.sort()
print(y_col_names)

['CD101', 'CD103', 'CD105', 'CD107a', 'CD112', 'CD115', 'CD119', 'CD11a', 'CD11b', 'CD11c', 'CD122', 'CD123', 'CD124', 'CD127', 'CD13', 'CD134', 'CD137', 'CD14', 'CD141', 'CD142', 'CD146', 'CD152', 'CD154', 'CD155', 'CD158', 'CD158b', 'CD158e1', 'CD16', 'CD161', 'CD162', 'CD163', 'CD169', 'CD172a', 'CD18', 'CD185', 'CD19', 'CD192', 'CD194', 'CD195', 'CD196', 'CD1c', 'CD1d', 'CD2', 'CD20', 'CD21', 'CD22', 'CD223', 'CD224', 'CD226', 'CD23', 'CD24', 'CD244', 'CD25', 'CD26', 'CD268', 'CD27', 'CD270', 'CD272', 'CD274', 'CD278', 'CD279', 'CD28', 'CD29', 'CD3', 'CD303', 'CD304', 'CD31', 'CD314', 'CD319', 'CD32', 'CD328', 'CD33', 'CD335', 'CD35', 'CD352', 'CD36', 'CD38', 'CD39', 'CD4', 'CD40', 'CD41', 'CD42b', 'CD44', 'CD45', 'CD45RA', 'CD45RO', 'CD47', 'CD48', 'CD49a', 'CD49b', 'CD49d', 'CD49f', 'CD5', 'CD52', 'CD54', 'CD56', 'CD57', 'CD58', 'CD62L', 'CD62P', 'CD63', 'CD64', 'CD69', 'CD7', 'CD71', 'CD72', 'CD73', 'CD79b', 'CD8', 'CD81', 'CD82', 'CD83', 'CD85j', 'CD86', 'CD88', 'CD9', 'CD93', 

I need to separate 140 columns from X before applying dimentionality reducing techniks, those columns are known RNA that code for proteins. There is an external Excel file from BioLegend with compete Ensembl ID Information and corresponding proteins.
But there are a couple of protein without Ensembl ID, and there is one protein with 2 IDs, so I need to work around these issues.
Entries with missing Ensamble ID correspondts to test proteins that shouldn't be there, so we don't have correspondent RNA fragments in X. So we just take all Ensamble ID that are fillied and use this list as a filter for columns of X

In [45]:
barcodes = pd.read_excel(barcode)

In [47]:
barcodes.tail()

Unnamed: 0,DNA_ID,Description,clone,Sequence,Ensemble ID,TS-B Univ,GeneID,Other assiociated GeneID,Ensembl ID
135,B0918,anti-human HLA-E,3D12,GAGTCGAGAAATCAT,ENSG00000204592,3D12,3133.0,,ENSG00000204592
136,B0920,anti-human CD82,ASL-24,TCCCACTTCCGCTTT,ENSG00000085117,ASL-24,3732.0,,ENSG00000085117
137,B0944,anti-human CD101 (BB27),BB27,CTACTTCCCTGTCAA,ENSG00000134256,BB27,9398.0,,ENSG00000134256
138,B1046,anti-human CD88 (C5aR),S5/1,GCCGCATGAGAAACA,ENSG00000197405,S5/1,728.0,,ENSG00000197405
139,B1052,anti-human CD224,KF29,CTGATGAGATGTCAG,ENSG00000100031,KF29,2678.0,,ENSG00000100031; ENSG00000286070


In [48]:
barcodes.columns

Index(['DNA_ID', 'Description', 'clone', 'Sequence', 'Ensemble ID',
       'TS-B Univ', 'GeneID', 'Other assiociated GeneID', 'Ensembl ID'],
      dtype='object')

In [77]:
type(ens_id)

pandas.core.series.Series

In [78]:
ens_id = barcodes['Ensembl ID'].str.split(';').explode()
ens_id = ens_id.reset_index(drop=True)
# I need to clean this list, it has null values and '-' values, so I am going to keep only values 
#that are string and start from ENSG
ens_id_c = [s for s in ens_id if isinstance(s, str) and s.startswith('ENSG')]
print(len(ens_id_c))

131


In [82]:
X_columns_to_keep = [s for s in col_names if any(sub in s for sub in ens_id_c )]
X_proteins_rna = X.loc[:, X_columns_to_keep]
print(X_proteins_rna.shape)


(70988, 111)


In [83]:
# What happend to 20 Ensemble IDs, why are they not found? 
# Let's look at the Ensamble Id ens_id_c but not in X_columns_to_keep
missing_id = [sub for sub in ens_id_c if not any(sub in s for s in X_columns_to_keep)]
print(missing_id)

['ENSG00000153563', 'ENSG00000203747', 'ENSG00000188389', 'ENSG00000181847', 'ENSG00000181847', 'ENSG00000162493', 'ENSG00000110448', 'ENSG00000112486', 'ENSG00000163599', 'ENSG00000213809', 'ENSG00000109956', 'ENSG00000186265', 'ENSG00000088827', 'ENSG00000203618', 'ENSG00000178562', 'ENSG00000125498', 'ENSG00000135318', 'ENSG00000277725', 'ENSG00000167633']


In [86]:
barcodes[barcodes['Ensembl ID'].isin(missing_id)]

Unnamed: 0,DNA_ID,Description,clone,Sequence,Ensemble ID,TS-B Univ,GeneID,Other assiociated GeneID,Ensembl ID
11,B0046,anti-human CD8,SK1,GCGCAACTTGATGAT,ENSG00000153563,SK1,925.0,,ENSG00000153563
26,B0083,anti-human CD16,3G8,AAGTTCACTCTTTGC,ENSG00000203747,3G8,2214.0,,ENSG00000203747
28,B0087,anti-human CD45RO,UCHL1,CTCCGAATCATGTTG,ENSG00000081237,UCHL1,5788.0,,ENSG00000188389
29,B0088,anti-human CD279 (PD-1),EH12.2H7,ACAGCGCCGTATTTA,ENSG00000188389,EH12.2H7,5133.0,,ENSG00000181847
30,B0089,anti-human TIGIT (VSTM3),A15153G,TTGCTTACCGCCAGA,ENSG00000181847,A15153G,201633.0,,ENSG00000181847
38,B0127,anti-Human Podoplanin,NC-08,GGTTACTCGTTGTGT,ENSG00000162493,NC-08,10630.0,,ENSG00000162493
41,B0138,anti-human CD5,UCHT2,CATTAACGGGATGCC,ENSG00000110448,UCHT2,921.0,,ENSG00000110448
44,B0143,anti-human CD196 (CCR6),G034E3,GATCCCTTTGTCACT,ENSG00000112486,G034E3,1235.0,,ENSG00000112486
50,B0151,anti-human CD152 (CTLA-4),BNI3,ATGGTTCACGTAATC,ENSG00000163599,BNI3,1493.0,,ENSG00000163599
63,B0165,anti-human CD314 (NKG2D),1D11,CGTGTTTGTTCCTCA,ENSG00000213809,1D11,22914.0,,ENSG00000213809


In [88]:
CD8 = [ col for col in col_names if "CD8" in col]
print(CD8)

['ENSG00000121594_CD80', 'ENSG00000110651_CD81', 'ENSG00000238184_CD81-AS1', 'ENSG00000085117_CD82', 'ENSG00000112149_CD83', 'ENSG00000066294_CD84', 'ENSG00000114013_CD86', 'ENSG00000172116_CD8B', 'ENSG00000254126_CD8B2']


In [89]:
SK1 = [col for col in col_names if "SK1" in col]
print(SK1)

['ENSG00000160469_BRSK1', 'ENSG00000102109_PCSK1N', 'ENSG00000107140_TESK1']


In [90]:
CD16=[ col for col in col_names if "CD16" in col]
print(CD16)

['ENSG00000117281_CD160', 'ENSG00000177575_CD163', 'ENSG00000135535_CD164']


In [91]:
CD158 = [ col for col in col_names if "CD158" in col]
print(CD158)

[]


I don't see any mistakes, I tried to find missing ensemble ids by description, but I couldn't.
It means that we have 111 columns of X to keep and apply PCA to the rest of it.

In [94]:
X_proteins_rna_columns = X_proteins_rna.columns
X_to_pca_columns = [col for col in col_names if col not in X_proteins_rna_columns]
X_to_pca = X.loc[:,X_to_pca_columns]
print(X.shape[1] - X_to_pca.shape[1],X_proteins_rna.shape[1])

111 111


In [104]:
valid_indices_train = [ind for ind in cite_train_sample if ind in X_to_pca.index]
valid_indices_test = [ind for ind in cite_test_sample if ind in X_to_pca.index]
X_to_pca_train = X_to_pca.loc[valid_indices_train] 
X_to_pca_test = X_to_pca.loc[valid_indices_test]
X_proteins_rna_train = X_proteins_rna.loc[valid_indices_train]
X_proteins_rna_test = X_proteins_rna.loc[valid_indices_test]

In [132]:
y_train = y.loc[valid_indices_train]
y_test = y.loc[valid_indices_test]
print(y_train.shape, y_test.shape)

(54922, 140) (16066, 140)


In [135]:
type(y_train)

pandas.core.frame.DataFrame

In [124]:
X_to_pca_train_index = X_to_pca_train.index
start_time = time.time()
pipeline = Pipeline([('scaling', StandardScaler()), ('pca', PCA(n_components=200, random_state=42))])
X_train_reduced = pipeline.fit_transform(X_to_pca_train)
print(f"--- {time.time() - start_time} seconds ---")
print(pipeline.named_steps['pca'].explained_variance_ratio_.sum())
print(X_train_reduced.shape)

--- 49.791011810302734 seconds ---
0.08924206


In [126]:
X_to_pca_test_index = X_to_pca_test.index
X_test_reduced = pipeline.transform(X_to_pca_test)

In [127]:
X_test_reduced.shape

(16066, 200)

In [129]:
X_proteins_rna_train = X_proteins_rna.loc[valid_indices_train]
print(X_proteins_rna.shape, X_proteins_rna_train.shape)
X_proteins_rna_test = X_proteins_rna.loc[valid_indices_test]
print(X_proteins_rna.shape, X_proteins_rna_test.shape)
X_test_reduced_df = pd.DataFrame(X_test_reduced, index = X_to_pca_test_index )
print(X_test_reduced_df.shape, X_test_reduced.shape)
X_test = pd.concat([X_test_reduced_df,X_proteins_rna_test], axis = 1)
print(X_test.shape,X_test_reduced_df.shape,X_proteins_rna_test.shape)
X_train_reduced_df = pd.DataFrame(X_train_reduced, index = X_to_pca_train_index)
X_train = pd.concat([X_train_reduced_df,X_proteins_rna_train], axis = 1)
print(X_train.shape,X_train_reduced_df.shape,X_proteins_rna_train.shape)

(70988, 111) (54922, 111)
(70988, 111) (16066, 111)
(16066, 200) (16066, 200)
(16066, 311) (16066, 200) (16066, 111)
(54922, 311) (54922, 200) (54922, 111)


In [None]:
# Index was lost after applying pipeline to X_to_pca_test
#X_test_reduced = pipeline.transform(X_to_pca_test). Going back to change the code to save index and
# then to reapply it later.

In [144]:
start_time = time.time()
# Create an instance of XGBRegressor
xgb_model = XGBRegressor(n_jobs=-1,tree_method="gpu_hist", sampling_method='gradient_based')
#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2, random_state=3)

# Define the parameter grid for GridSearchCV
param_grid = {
    #'n_estimators': [100, 200, 300],
    'max_depth': [3, 7, 9],
    'learning_rate': [0.1, 0.5, 1],
    'min_child_weight': [1, 5, 100],
    #'subsample': [0.7, 0.8, 0.9],
    #'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [1, 5, 100],
    'reg_lambda': [1, 5, 100]
}

# Create a GridSearchCV instance
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, 
                           cv=5, scoring='r2',verbose = 3)

# Fit the GridSearchCV instance on the data
grid_search.fit(X_train, y_train.iloc[:,100])

# Print the best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV 1/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=1, reg_lambda=1;, score=0.544 total time=   1.0s
[CV 2/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=1, reg_lambda=1;, score=0.528 total time=   0.9s
[CV 3/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=1, reg_lambda=1;, score=0.444 total time=   0.9s
[CV 4/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=1, reg_lambda=1;, score=0.488 total time=   0.9s
[CV 5/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=1, reg_lambda=1;, score=0.490 total time=   0.9s
[CV 1/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=1, reg_lambda=5;, score=0.545 total time=   1.0s
[CV 2/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=1, reg_lambda=5;, score=0.529 total time=   0.9s
[CV 3/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg

In [145]:
# Use the best model to make predictions on the train set
# Best parameters for 0 target value: {'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 5, 'reg_alpha': 1, 'reg_lambda': 100}
# Best parameters for 100 target value: {'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 100, 'reg_alpha': 1, 'reg_lambda': 1}
best_model_100 = XGBRegressor(
    n_jobs=-1,tree_method="gpu_hist", sampling_method='gradient_based', 
    learning_rate = best_params['learning_rate'],
    max_depth = best_params['max_depth'],
    min_child_weight = best_params['min_child_weight'],
    reg_alpha = best_params['reg_alpha'],
    reg_lambda = best_params['reg_lambda'])

best_model_100.fit(X_train,y_train.iloc[:,100] ) 
y_pred = best_model_0.predict(X_train) 
# Calculate the mean squared error on the train set
pe = scipy.stats.pearsonr(y_train.iloc[:,100], y_pred)
print("Pearson Error on train set:", pe)
# Use the best model to make predictions on the test set
y_pred_test = best_model_100.predict(X_test) 
# Calculate the mean squared error on the train set
pe_test = scipy.stats.pearsonr(y_test.iloc[:,100], y_pred_test)
print("Pearson Error on test set:", pe_test)

Pearson Error on train set: PearsonRResult(statistic=0.21761568785311752, pvalue=0.0)
Pearson Error on test set: PearsonRResult(statistic=0.7509284069406549, pvalue=0.0)


In [147]:
#{'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 5, 'reg_alpha': 1, 'reg_lambda': 100}
best_model_100 = XGBRegressor(
    n_jobs=-1,tree_method="gpu_hist", sampling_method='gradient_based', 
    learning_rate = 0.1,
    max_depth = 9,
    min_child_weight = 100,
    reg_alpha = 5,
    reg_lambda = 5)

best_model_100.fit(X_train,y_train.iloc[:,100] ) 
y_pred = best_model_0.predict(X_train) 
# Calculate the mean squared error on the train set
pe = scipy.stats.pearsonr(y_train.iloc[:,100], y_pred)
print("Pearson Error on train set:", pe)
# Use the best model to make predictions on the test set
y_pred_test = best_model_100.predict(X_test) 
# Calculate the mean squared error on the train set
pe_test = scipy.stats.pearsonr(y_test.iloc[:,100], y_pred_test)
print("Pearson Error on test set:", pe_test)

Pearson Error on train set: PearsonRResult(statistic=0.21761568785311752, pvalue=0.0)
Pearson Error on test set: PearsonRResult(statistic=0.7495329117044199, pvalue=0.0)


I tested performance of train and test samples using parameters from the grid search for 0 and 100th target values on the model with 100th target:
Best parameters for 0 target value: {'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 5, 'reg_alpha': 1, 'reg_lambda': 100} -- parameters from different grid search
Pearson Error on train set: PearsonRResult(statistic=0.21761568785311752, pvalue=0.0)
Pearson Error on test set: PearsonRResult(statistic=0.7447795335246583, pvalue=0.0)
Best parameters for 100 target value: {'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 100, 'reg_alpha': 1, 'reg_lambda': 1}  -- the best found parameters
Pearson Error on train set: PearsonRResult(statistic=0.21761568785311752, pvalue=0.0)
Pearson Error on test set: PearsonRResult(statistic=0.7509284069406549, pvalue=0.0)
I don't see a benefit of running a very time consuming grid search on every target, so I am using this parameters for all my models: {'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 5, 'reg_alpha': 1, 'reg_lambda': 100}  
learning_rate = 0.1,
    max_depth = 9,
    min_child_weight = 100,
    reg_alpha = 5,
    reg_lambda = 5
Parameters that were used for multi gives approximately the same result.
Pearson Error on train set: PearsonRResult(statistic=0.21761568785311752, pvalue=0.0)
Pearson Error on test set: PearsonRResult(statistic=0.7509284069406549, pvalue=0.0)


In [148]:
models_list = []
pearsons_coeff_list = []

for i in range(140):
    model = XGBRegressor(
        n_jobs=-1,
        tree_method="gpu_hist",
        sampling_method='gradient_based',
        learning_rate=0.1,
        max_depth=9,
        min_child_weight=100,
        reg_alpha=5,
        reg_lambda=5
    )

    model.fit(X_train, y_train.iloc[:, i])
   
    y_pred = model.predict(X_train)

    pearson_coefficient, _ = scipy.stats.pearsonr(y_train.iloc[:, i], y_pred)

    models_list.append(model)
    pearsons_coeff_list.append(pearson_coefficient)

     #joblib.dump(model, f'model_{i}.joblib')



In [151]:
pearsons_coeff_test = []

for i in range(140):
    model = models_list[i]
    y_pred = model.predict(X_test)
   
    pearson_coefficient, _ = scipy.stats.pearsonr(y_test.iloc[:, i], y_pred)

    pearsons_coeff_test.append(pearson_coefficient)


In [152]:
print(pearsons_coeff_test)

[0.47276210397276225, 0.26250190429384396, 0.3897818050210177, 0.6967528316867907, 0.791671190158771, 0.7112118447334573, 0.8515648275518932, 0.392524445248035, 0.4350155158246198, 0.5398611645867152, 0.11093221369364373, 0.1635297283599339, 0.24623663715037736, 0.42621880439710597, 0.6722161006209173, 0.5776993374165772, 0.7131060192214151, 0.8124021462559706, 0.68768010756945, 0.22473883290763538, 0.41904464122878576, 0.7949124204171367, 0.08039722238061234, 0.38539881780585983, 0.729330320041262, 0.20620317743129546, 0.07384554430322149, 0.19130891669602404, 0.46566512377425623, 0.3059262115781147, 0.27543079386203506, 0.26762712607809863, 0.19791292459419013, 0.16245870574307725, 0.14195144190029463, 0.17763944172021345, 0.43572157528479655, 0.741359491139051, 0.4140121757288859, 0.1522975359764894, 0.16133313747819866, 0.2887581601235607, 0.313721186465627, 0.8718239630591352, 0.2882037332254965, 0.21036542597482416, 0.437842344386405, 0.39177215510766517, 0.8238359041891049, 0.15

In [153]:
# Building a model on the whole set of X and y
#PCA
X_to_pca_index = X_to_pca.index
start_time = time.time()
pipeline = Pipeline([('scaling', StandardScaler()), ('pca', PCA(n_components=200, random_state=42))])
X_reduced = pipeline.fit_transform(X_to_pca)
print(f"--- {time.time() - start_time} seconds ---")
print(pipeline.named_steps['pca'].explained_variance_ratio_.sum())
print(X_reduced.shape)
X_reduced_df = pd.DataFrame(X_reduced, index = X_to_pca_index )
print(X_reduced_df.shape, X_reduced.shape)
X = pd.concat([X_reduced_df,X_proteins_rna], axis = 1)
print(X.shape,X_reduced_df.shape,X_proteins_rna.shape)
#Grid search
#Loop regression

--- 65.31348252296448 seconds ---
0.08711644897827517
(70988, 200)
(70988, 200) (70988, 200)
(70988, 311) (70988, 200) (70988, 111)


In [157]:
# Create an instance of XGBRegressor
xgb_model = XGBRegressor(n_jobs=-1,tree_method="gpu_hist", sampling_method='gradient_based')
#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2, random_state=3)

# Define the parameter grid for GridSearchCV
param_grid = {
    #'n_estimators': [100, 200, 300],
    'max_depth': [3, 9],
    'learning_rate': [0.1, 1],
    'min_child_weight': [1, 5, 100],
    #'subsample': [0.7, 0.8, 0.9],
    #'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [1, 5, 100],
    'reg_lambda': [1, 5, 100]
}

# Create a GridSearchCV instance
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, 
                           cv=5, scoring='r2',verbose = 3)

# Fit the GridSearchCV instance on the data
grid_search.fit(X, y.iloc[:,100])

# Print the best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=1, reg_lambda=1;, score=0.300 total time=   1.2s
[CV 2/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=1, reg_lambda=1;, score=0.548 total time=   1.1s
[CV 3/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=1, reg_lambda=1;, score=0.354 total time=   1.1s
[CV 4/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=1, reg_lambda=1;, score=0.273 total time=   1.1s
[CV 5/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=1, reg_lambda=1;, score=0.326 total time=   1.1s
[CV 1/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=1, reg_lambda=5;, score=0.294 total time=   1.1s
[CV 2/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=1, reg_lambda=5;, score=0.546 total time=   1.1s
[CV 3/5] END learning_rate=0.1, max_depth=3, min_child_weight=1, reg_

In [161]:
# Best parameters: {'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 100, 'reg_alpha': 1, 'reg_lambda': 1}
#for 100th target
best_model_100 = XGBRegressor(
    n_jobs=-1,tree_method="gpu_hist", sampling_method='gradient_based', 
    learning_rate = 0.1,
    max_depth = 9,
    min_child_weight = 100,
    reg_alpha = 5,
    reg_lambda = 5)

best_model_100.fit(X,y.iloc[:,100] ) 
y_pred = best_model_0.predict(X) 
# Calculate the mean squared error on the train set
pe = scipy.stats.pearsonr(y.iloc[:,100], y_pred)
print("Pearson Error on train set:", pe)
#Pearson Error on train set: PearsonRResult(statistic=0.23037756183475758, pvalue=0.0)



Pearson Error on train set: PearsonRResult(statistic=0.23037756183475758, pvalue=0.0)


In [163]:
models_list = []
pearsons_coeff_list = []

for i in range(140):
    model = XGBRegressor(
        n_jobs=-1,
        tree_method="gpu_hist",
        sampling_method='gradient_based',
        learning_rate=0.1,
        max_depth=9,
        min_child_weight=100,
        reg_alpha=5,
        reg_lambda=5
    )

    model.fit(X, y.iloc[:, i])
   
    y_pred = model.predict(X)

    pearson_coefficient, _ = scipy.stats.pearsonr(y.iloc[:, i], y_pred)

    models_list.append(model)
    pearsons_coeff_list.append(pearson_coefficient)


In [164]:
print(pearsons_coeff_list)

[0.6379452165840824, 0.6918326091487852, 0.71804197225541, 0.8266226666347197, 0.8796247222425371, 0.8426589444890242, 0.9125447137295486, 0.6428629724353832, 0.7271668119852622, 0.7503616213271713, 0.7643830532934306, 0.7552400467791345, 0.7004935202713238, 0.6323742778485892, 0.8300786958429502, 0.6841490049408963, 0.8408408689441998, 0.8968349935326383, 0.8408970311453299, 0.6902020268263497, 0.719789608768313, 0.8714013401668774, 0.7402032962110174, 0.6856362713675137, 0.8444471854790607, 0.5968481274336638, 0.7569833844047563, 0.6651713826108254, 0.7110024998864312, 0.674207431196243, 0.735572277670759, 0.7391091797165735, 0.7329310192199661, 0.7395812054005804, 0.7527999656541107, 0.6097917188984794, 0.7273248249585119, 0.8524651092177833, 0.7239994726786566, 0.7451397991762363, 0.5976919745948222, 0.6683158672145251, 0.6342041352914158, 0.9229711755676071, 0.5972255570317414, 0.7149701859067598, 0.7151590339174346, 0.6420566212925816, 0.903978268960972, 0.7548007283725282, 0.732

In [167]:
test = pd.read_hdf(test_cite_inputs)

In [168]:
test.shape

(48663, 22050)

In [173]:
test_proteins_rna = test.loc[:, X_columns_to_keep]
test_to_pca = test.loc[:,X_to_pca_columns]
test_to_pca_index = test_to_pca.index
print(test.shape[1] - test_to_pca.shape[1],test_proteins_rna.shape[1])
test_reduced = pipeline.transform(test_to_pca)
test_reduced_df = pd.DataFrame(test_reduced, index = test_to_pca_index )
print(test_reduced_df.shape, test_reduced.shape)
test_final = pd.concat([test_reduced_df,test_proteins_rna], axis = 1)
print(test_final.shape)

111 111
(48663, 200) (48663, 200)
(48663, 311)


In [178]:
test_final.index

Index(['c2150f55becb', '65b7edf8a4da', 'c1b26cb1057b', '917168fa6f83',
       '2b29feeca86d', '0fd801488185', '526647a698f8', 'ab8f207a3dec',
       '57f730249c87', '08df3dcce25c',
       ...
       '87644788920d', '4b759a227f82', '294b63574ccd', 'd4876939c837',
       '0b0479644c65', 'a9b4d99f1f50', '0e2c1d0782af', 'a3cbc5aa0ec3',
       '75b350243add', 'ad5a949989b2'],
      dtype='object', name='cell_id', length=48663)

In [175]:
test_cite_targets = np.empty((len(test_final), 0))

# Iterate over the models and generate predictions
for i in range(140):
    model = models_list[i]
    y_pred = model.predict(test_final)
    
    # Add the predictions as a new column in the 2D array
    test_cite_targets = np.column_stack((test_cite_targets, y_pred))

In [180]:
test_cite_targets_df = pd.DataFrame(test_cite_targets, index = test_final.index, columns = )

In [202]:
sample_submission = os.path.join(data_path,"sample_submission.csv")
evaluation_ids = os.path.join(data_path,"evaluation_ids.csv")

sample_submission_df = pd.read_csv(sample_submission)

sample_submission_df.head()

# Reading evaluation_ids.csv and creating mapping for every cell_id and gene_id,
# where it's values got replaced by position number to be able to find the correspondent target value



In [203]:
evaluation_ids_df.head()

Unnamed: 0,row_id,cell_id,gene_id
0,0,c2150f55becb,CD86
1,1,c2150f55becb,CD274
2,2,c2150f55becb,CD270
3,3,c2150f55becb,CD155
4,4,c2150f55becb,CD112


In [212]:
evaluation_ids_df = pd.read_csv(evaluation_ids)
value_to_position_cell_id = {value: index for index, value in enumerate(test_final.index)}
value_to_position_gene_id = {value: index for index, value in enumerate(list(y.columns))}
evaluation_ids_df['cell_id'] = evaluation_ids_df['cell_id'].map(value_to_position_cell_id).fillna(-1).astype(int)
evaluation_ids_df['gene_id'] = evaluation_ids_df['gene_id'].map(value_to_position_gene_id).fillna(-1).astype(int)
evaluation_ids_df_cite = evaluation_ids_df[evaluation_ids_df['cell_id'] != -1]

In [213]:
evaluation_ids_df_cite.shape

(6812820, 3)

In [220]:
t1 = evaluation_ids_df_cite.head()
print(t1)

   row_id  cell_id  gene_id
0       0        0        0
1       1        0        1
2       2        0        2
3       3        0        3
4       4        0        4


In [224]:
# Adding target value from the test_multi_500_traget to the evaluation_ids
def extract_value(row):
    row_num = row['cell_id']
    col_num = row['gene_id']
    try:
        value = test_cite_targets[row_num, col_num]
    except IndexError:
        value = np.nan
    
    return value
    
evaluation_ids_df_cite.loc[:,'Value'] = evaluation_ids_df_cite.apply(extract_value, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluation_ids_df_cite.loc[:,'Value'] = evaluation_ids_df_cite.apply(extract_value, axis=1)


In [225]:
evaluation_ids_df_cite.head()

Unnamed: 0,row_id,cell_id,gene_id,Value
0,0,0,0,0.288894
1,1,0,1,0.100747
2,2,0,2,0.560028
3,3,0,3,4.095116
4,4,0,4,4.65839


In [227]:
evaluation_ids_df_cite = evaluation_ids_df_cite.rename(columns={'Value': 'target'})
submission_cite = evaluation_ids_df_cite[['row_id','target']].reset_index(drop=True)
submission_cite.head()

Unnamed: 0,row_id,target
0,0,0.288894
1,1,0.100747
2,2,0.560028
3,3,4.095116
4,4,4.65839


In [228]:
submitssion_cite_path = os.path.join(data_path,"submission_cite.csv")
submission_cite.to_csv(submitssion_cite_path, index=False)

In [230]:
submitssion_multi_path = os.path.join(data_path,"submission_multi.csv")
submission_multi = pd.read_csv(submitssion_multi_path)

In [231]:
type(submission_multi)

pandas.core.frame.DataFrame

In [232]:
submission_multi.head()

Unnamed: 0,row_id,target
0,6812820,0.000888
1,6812821,10.283843
2,6812822,0.984811
3,6812823,1.083188
4,6812824,0.002385


In [233]:
submission = pd.concat([submission_cite,submission_multi], axis = 0)
submission.shape

(65744180, 2)

In [234]:
submission.head()

Unnamed: 0,row_id,target
0,0,0.288894
1,1,0.100747
2,2,0.560028
3,3,4.095116
4,4,4.65839


In [235]:
submission = submission.sort_values(by='row_id', ascending=False)

In [236]:
submitssion_path = os.path.join(data_path,"submission.csv")
submission.to_csv(submitssion_path, index=False)