In [1]:
# Data processing
import pandas as pd
import numpy as np
# Standardize the data
from sklearn.preprocessing import StandardScaler
# Modeling 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
# Hyperparameter tuning
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval
from sklearn.decomposition import PCA

In [2]:
train_dt = pd.read_csv("/home/wt/meta_target/data/train_dtV2.csv")
dt = pd.read_csv("/home/data/sdc/wt/model_data/ml_input_pca.csv")

In [4]:
cell_exp = pd.read_csv("/home/data/sdb/wt/model_data/cell_gene_exp_vs_normal_filter.csv")
gene_pc = pd.read_csv("/home/wt/meta_target/data/gene_pca.csv")

In [5]:
gene_feat = pd.read_csv("/home/wt/meta_target/data/all_train_gene_cpg.csv")

In [6]:
sample_info = pd.read_csv("/home/data/sdc/wt/model_data/new_model/cell_net_filter_exp/raw/train_cell_info.csv")

In [7]:
X_train = dt.drop(columns=["is_dep","cell","gene"])
y_train = dt.is_dep

In [8]:
# Initiate scaler
sc = StandardScaler()
# Standardize the training dataset
X_train_transformed = pd.DataFrame(sc.fit_transform(X_train),index=X_train.index, columns=X_train.columns)

In [9]:
###逻辑回归
clf = SGDClassifier(loss="log_loss", penalty="elasticnet", fit_intercept=True, early_stopping=True,n_iter_no_change=100)

# use a full grid over all parameters
param_grid = {
    'max_iter': [500,1000,2000,3000,4000,5000],
    'learning_rate': ['optimal', 'invscaling', 'adaptive']
}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs = 30, scoring="f1")

In [14]:
grid_search.fit(X_train_transformed, y_train)

60 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/home/wt/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/wt/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/linear_model/_stochastic_gradient.py", line 892, in fit
    self._more_validate_params()
  File "/home/wt/miniconda3/envs/dl/lib/python3.9/site-packages/sklearn/linear_model/_stochastic_gradient.py", line 149, in _more_validate_params
    raise ValueError("eta0 must be > 0")
ValueError: eta0 must be > 0

        nan        nan   

In [None]:
grid_search.best_params_

{'learning_rate': 'optimal', 'max_iter': 1000}

In [None]:
####CV
from sklearn.model_selection import KFold
splits = KFold(n_splits=10,shuffle=True,random_state=2023052701)

for fold, (train_idx,val_idx) in enumerate(splits.split(np.arange(len(sample_info)))):
    print('Fold {}'.format(fold + 1))
    ###get fold train and test data
    X_train_idx = train_dt.cell.isin(sample_info.cell[train_idx])
    X_train = train_dt[X_train_idx]
    X_test_idx = train_dt.cell.isin(sample_info.cell[val_idx])
    X_test = train_dt[X_test_idx]
    ###get fold train and test data
    fold_train_exp_idx = cell_exp.cell.isin(sample_info.cell[train_idx])
    fold_train_exp = cell_exp[fold_train_exp_idx].reset_index()
    pca_exp = PCA(n_components=100)
    fold_train_exp_pca = pd.DataFrame(pca_exp.fit_transform(fold_train_exp.iloc[:,2:7995]), 
                                      columns=["exp_pca-" + str(i) for i in range(100)])
    fold_train_exp_pca["cell"] = fold_train_exp.cell
    
    fold_train_gene_idx = gene_feat.id.isin(X_train.id)
    fold_train_gene_feat = gene_feat[fold_train_gene_idx].reset_index()
    pca_gene = PCA(n_components=100)
    fold_train_gene_pca = pd.DataFrame(pca_gene.fit_transform(fold_train_gene_feat.iloc[:,2:3249]),
                                       columns=["gene_pca-" + str(i) for i in range(100)])
    fold_train_gene_pca["id"] = fold_train_gene_feat.id
    
    X_train_dt = X_train.merge(fold_train_exp_pca, on='cell', how='left').merge(fold_train_gene_pca, on='id', how='left').drop(columns=["id","cell","is_dep"])
    y_train = X_train.is_dep
    
    fold_test_exp_idx = cell_exp.cell.isin(sample_info.cell[val_idx])
    fold_test_exp = cell_exp[fold_test_exp_idx].reset_index()
    fold_test_exp_pca = pd.DataFrame(pca_exp.transform(fold_test_exp.iloc[:,2:7995]), 
                                     columns=["exp_pca-" + str(i) for i in range(100)])
    fold_test_exp_pca["cell"] = fold_test_exp.cell
    
    fold_test_gene_idx = gene_feat.id.isin(X_test.id)
    fold_test_gene_feat = gene_feat[fold_test_gene_idx].reset_index()
    fold_test_gene_pca = pd.DataFrame(pca_gene.transform(fold_test_gene_feat.iloc[:,2:3249]), 
                                      columns=["gene_pca-" + str(i) for i in range(100)])
    fold_test_gene_pca["id"] = fold_test_gene_feat.id
    
    X_test_dt = X_test.merge(fold_test_exp_pca, on='cell', how='left').merge(fold_test_gene_pca, on='id', how='left').drop(columns=["id","cell","is_dep"])
    y_test = np.array(X_test.is_dep)
    y_test_gene = np.array(X_test.id)
    
    # Initiate scaler
    sc = StandardScaler()
    # Standardize the training dataset
    X_train_transformed = pd.DataFrame(sc.fit_transform(X_train_dt),index=X_train_dt.index, columns=X_train_dt.columns)
    X_test_transformed = pd.DataFrame(sc.transform(X_test_dt),index=X_test_dt.index, columns=X_test_dt.columns)
    ###fit model
    clf = SGDClassifier(loss="log_loss", penalty="elasticnet", fit_intercept=True,
                        learning_rate="optimal", max_iter=1000, n_iter_no_change=100, early_stopping=True)
    clf.fit(X_train_transformed, y_train)
    y_pred = clf.predict(X_test_transformed)
    y_pred_raw = clf.predict_proba(X_test_transformed)
    y_pred_raw = y_pred_raw[:,1]
    res = pd.DataFrame({"preds":y_pred,"preds_raw":y_pred_raw,"label":y_test,"genes":y_test_gene})
    res.to_csv("~/meta_target/data/cv/ml_logistic/fold_"+str(fold)+".csv")

In [15]:
##SVM
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import Pipeline

# feature_map_nystroem = Nystroem(random_state=2023061001)
linear_svc = SGDClassifier(loss="hinge", penalty="elasticnet", fit_intercept=True, n_iter_no_change=100, early_stopping=True)
# nystroem_approx_svm = Pipeline(
#     [("feature_map", feature_map_nystroem), 
#      ("svm", linear_svc)]
# )

grid_param = {
    # "feature_map__gamma": [0.1,1,10],
    # "feature_map__n_components": [100,200,500,1000],
    "max_iter": [500,1000,2000,3000,4000,5000],
    "learning_rate": ['optimal', 'invscaling', 'adaptive']
}

In [None]:
gridsearch = GridSearchCV(linear_svc, param_grid=grid_param, n_jobs = 30, scoring="f1") # Fit grid search
gridsearch.fit(X_train_transformed, y_train)

In [None]:
gridsearch.best_params_

In [8]:
####CV
from sklearn.model_selection import KFold
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import Pipeline

splits = KFold(n_splits=10,shuffle=True,random_state=2023052701)

for fold, (train_idx,val_idx) in enumerate(splits.split(np.arange(len(sample_info)))):
    print('Fold {}'.format(fold + 1))
    ###get fold train and test data
    fold_train_exp_idx = cell_exp.cell.isin(sample_info.cell[train_idx])
    fold_train_exp = cell_exp[fold_train_exp_idx].reset_index()
    pca = PCA(n_components=100)
    fold_train_exp_pca = pd.DataFrame(pca.fit_transform(fold_train_exp.iloc[:,2:7995]), columns=["pca-" + str(i) for i in range(100)])
    fold_train_exp_pca["cell"] = fold_train_exp.cell
    
    X_train_idx = train_dt.cell.isin(sample_info.cell[train_idx])
    X_train = train_dt[X_train_idx]
    X_train_dt = X_train.merge(fold_train_exp_pca, on='cell', how='left').merge(gene_pc, on='id', how='left').drop(columns=["id","cell","is_dep"])
    y_train = X_train.is_dep
    
    fold_test_exp_idx = cell_exp.cell.isin(sample_info.cell[val_idx])
    fold_test_exp = cell_exp[fold_test_exp_idx].reset_index()
    fold_test_exp_pca = pd.DataFrame(pca.transform(fold_test_exp.iloc[:,2:7995]), columns=["pca-" + str(i) for i in range(100)])
    fold_test_exp_pca["cell"] = fold_test_exp.cell
    
    X_test_idx = train_dt.cell.isin(sample_info.cell[val_idx])
    X_test = train_dt[X_test_idx]
    X_test_dt = X_test.merge(fold_test_exp_pca, on='cell', how='left').merge(gene_pc, on='id', how='left').drop(columns=["id","cell","is_dep"])
    y_test = np.array(X_test.is_dep)
    y_test_gene = np.array(X_test.id)
    
    # Initiate scaler
    sc = StandardScaler()
    # Standardize the training dataset
    X_train_transformed = pd.DataFrame(sc.fit_transform(X_train_dt),index=X_train_dt.index, columns=X_train_dt.columns)
    X_test_transformed = pd.DataFrame(sc.transform(X_test_dt),index=X_test_dt.index, columns=X_test_dt.columns)
    ###fit model
    #feature_map_nystroem = Nystroem(random_state=2023061001, gamma=0.1, n_components=1000)
    linear_svc = SGDClassifier(loss="hinge", penalty="elasticnet", fit_intercept=True, n_iter_no_change=100, 
                               early_stopping=True,
                               learning_rate="optimal",max_iter=4000)
    # nystroem_approx_svm = Pipeline(
    #     [("feature_map", feature_map_nystroem), 
    #      ("svm", linear_svc)]
    # )
    linear_svc.fit(X_train_transformed, y_train)
    y_pred = linear_svc.predict(X_test_transformed)
    res = pd.DataFrame({"preds":y_pred,"label":y_test,"genes":y_test_gene})
    res.to_csv("/home/wt/meta_target/data/cv/ml_svm/fold_"+str(fold)+".csv")

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
Fold 10
