In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
import optuna

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

# Configuration

In [2]:
# Location of the competition data: <ROOT>/<INPUT>/<LISH_MOA>/*.csv,
# resolved relative to the notebook's working directory.
ROOT = os.path.join(os.pardir, os.pardir)
INPUT = "input"
LISH_MOA = "lish-moa"

# Cross-validation and hyper-parameter search settings.
NUM_FOLD = 5            # number of CV folds
NUM_OPTUNA_TRIAL = 30   # Optuna trials run per target column

# Number of PCA components requested for the gene and cell feature groups.
N_COMP_GENES = 50
N_COMP_CELLS = 15

# Read data

In [3]:
# Every competition CSV sits in the same directory, so build that path once.
DATA_DIR = os.path.join(ROOT, INPUT, LISH_MOA)

train = pd.read_csv(os.path.join(DATA_DIR, "train_features.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test_features.csv"))
train_targets_scored = pd.read_csv(os.path.join(DATA_DIR, "train_targets_scored.csv"))
train_targets_nonscored = pd.read_csv(os.path.join(DATA_DIR, "train_targets_nonscored.csv"))
sub = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

FileNotFoundError: [Errno 2] File b'..\\input\\lish-moa\\train_features.csv' does not exist: b'..\\input\\lish-moa\\train_features.csv'

In [None]:
train_targets_scored.columns

In [None]:
# Print every scored target whose number of positive rows (label 1) does not
# exceed the number of CV folds (NUM_FOLD).
for col in train_targets_scored.columns:
    if col == 'sig_id':
        continue
    # value_counts() has no entry for label 1 when a target has no positive
    # rows at all, so fall back to 0 instead of raising KeyError.
    n_positive = train_targets_scored[col].value_counts().get(1, 0)
    if n_positive <= NUM_FOLD:
        print(col)

## train_features.csv

In [None]:
print(train.shape)

In [None]:
train.head()

## train_targets_scored.csv

In [None]:
print(train_targets_scored.shape)

In [None]:
train_targets_scored.head()

## test_features.csv

In [None]:
print(test.shape)

In [None]:
test.head()

## train_targets_nonscored.csv (not used)

In [None]:
print(train_targets_nonscored.shape)

In [None]:
train_targets_nonscored.head()

## submission.csv

In [None]:
sub.head()

# Preprocessing

## PCA features + Existing features

In [None]:
def make_pca_features(df_train:pd.DataFrame, df_test:pd.DataFrame, n_components:int, use_cols:list, gene_or_cell:str, concat_flg:bool):
    """Fit a PCA on the stacked train+test rows of ``use_cols`` and return the components.

    Args:
        df_train: Training frame; must contain ``use_cols`` and ``'sig_id'``.
        df_test: Test frame with the same columns.
        n_components: Number of principal components to keep.
        use_cols: Columns the PCA is fitted on.
        gene_or_cell: Tag inserted into the generated column names
            (``'pca_' + gene_or_cell + str(i)``).
        concat_flg: If True, append the components to the full input frames;
            otherwise return only ``'sig_id'`` plus the components.

    Returns:
        ``(ret_df_train, ret_df_test)`` - the two frames described above.
    """
    n_train = df_train.shape[0]
    data = pd.concat([df_train[use_cols], df_test[use_cols]])
    data_pca = PCA(n_components=n_components, random_state=334).fit_transform(data)

    pca_cols = ['pca_' + gene_or_cell + str(i) for i in range(n_components)]

    # Split on the train row count: a negative slice (data_pca[-n_test:]) would
    # return *every* row when df_test is empty.
    # Reuse the input indexes so the axis=1 concat below pairs each row with its
    # own components even when the inputs do not carry a default RangeIndex.
    train_pca = pd.DataFrame(data_pca[:n_train], columns=pca_cols, index=df_train.index)
    test_pca = pd.DataFrame(data_pca[n_train:], columns=pca_cols, index=df_test.index)

    if concat_flg:
        ret_df_train = pd.concat([df_train, train_pca], axis=1)
        ret_df_test = pd.concat([df_test, test_pca], axis=1)
    else:
        ret_df_train = pd.concat([df_train['sig_id'], train_pca], axis=1)
        ret_df_test = pd.concat([df_test['sig_id'], test_pca], axis=1)
    return ret_df_train, ret_df_test

In [None]:
# The two feature groups are picked out by column-name prefix: "g-" and "c-".
GENES = list(filter(lambda name: name.startswith('g-'), train.columns))
CELLS = list(filter(lambda name: name.startswith('c-'), train.columns))

In [None]:
# Append PCA components computed from each feature group to train/test.
train, test = make_pca_features(train, test, N_COMP_GENES, GENES, 'G', True)
# The 'C' components must be fitted on the CELLS columns; passing GENES here
# would only produce a second, smaller PCA of the gene columns.
train, test = make_pca_features(train, test, N_COMP_CELLS, CELLS, 'C', True)

## Label Encoding

In [None]:
def label_encoding(train: pd.DataFrame, test: pd.DataFrame, encode_cols):
    """Label-encode ``encode_cols`` consistently across train and test.

    The two frames are stacked so that a single ``LabelEncoder`` per column
    sees every value, then split back at the original train length.

    Args:
        train: Training frame.
        test: Test frame.
        encode_cols: Iterable of column names to encode.

    Returns:
        ``(train, test)`` as new frames; both have a fresh 0..n-1 index.
        Columns that could not be encoded are left unchanged and reported
        on stdout.
    """
    n_train = len(train)
    combined = pd.concat([train, test], sort=False).reset_index(drop=True)
    for f in encode_cols:
        try:
            lbl = preprocessing.LabelEncoder()
            combined[f] = lbl.fit_transform(list(combined[f].values))
        except Exception as err:
            # Best-effort: keep going with the remaining columns, but say why
            # this one failed. Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            print('{0}: {1!r}'.format(f, err))
    test_encoded = combined[n_train:].reset_index(drop=True)
    # .copy() hands back an independent frame rather than a slice of
    # `combined`, so later in-place assignments by the caller are safe.
    train_encoded = combined[:n_train].copy()
    return train_encoded, test_encoded

In [None]:
train.select_dtypes(include=['object']).columns

In [None]:
# train['cp_type'] = train['cp_type'].astype(str)
# train['cp_dose'] = train['cp_dose'].astype(str)
# test['cp_type'] = test['cp_type'].astype(str)
# test['cp_dose'] = test['cp_dose'].astype(str)

In [None]:
train, test = label_encoding(train, test, ['cp_type', 'cp_dose'])

## Standardization

In [None]:
# Standardize every column except the first one (the id column) using
# statistics computed on the training rows only.
train_features = train.iloc[:, 1:]
test_features = test.iloc[:, 1:]

tr_mean = train_features.mean()
# A constant column has std == 0; dividing by it would produce NaN/inf, so
# leave such columns centred but unscaled.
tr_std = train_features.std().replace(0, 1.0)

# Build new frames instead of assigning through .iloc: the scaled values are
# floats, and writing them in place into integer columns of a sliced frame
# relies on implicit upcasting.
train = pd.concat([train.iloc[:, :1], (train_features - tr_mean) / tr_std], axis=1)
test = pd.concat([test.iloc[:, :1], (test_features - tr_mean) / tr_std], axis=1)

In [None]:
train.head()

In [None]:
test.head()

# Cross-validation with Optuna

In [None]:
def logistic_elasticnet_cv(data, targets, target_col, cv):
    """Build an Optuna objective that cross-validates an L1 logistic regression.

    Args:
        data: Feature frame; its ``'sig_id'`` column is dropped before fitting.
        targets: Frame holding the target columns.
        target_col: Name of the target column to model.
        cv: Splitter exposing ``split(X, y)`` that yields
            ``(train_index, valid_index)`` pairs.

    Returns:
        ``objective(trial)``: samples ``C`` log-uniformly from
        ``[2**-10, 2**5]`` and returns the log loss averaged over the folds.
    """
    # Identical for every trial, so prepare them once rather than per trial.
    df_x = data.drop(["sig_id"], axis=1)
    df_y = targets[target_col]

    def objective(trial):
        param = {
            'penalty': 'l1',
            'C': trial.suggest_loguniform('C', 2**(-10), 2**5),
            'solver': 'liblinear',
            # NOTE(review): scikit-learn documents n_jobs as ignored by the
            # liblinear solver; kept to match the final-training parameters.
            'n_jobs': 8
        }

        score_logloss = 0.
        n_folds = 0

        # cross validation
        for train_index, valid_index in tqdm(cv.split(df_x, df_y)):
            # cv.split yields positional indices, so select with .iloc; .loc /
            # [] only agree with them when the index is exactly 0..n-1.
            X_train = df_x.iloc[train_index, :]
            y_train = df_y.iloc[train_index]
            X_valid = df_x.iloc[valid_index, :]
            y_valid = df_y.iloc[valid_index]

            model = LogisticRegression(**param)
            model.fit(X_train, y_train)
            y_pred = model.predict_proba(X_valid)
            score_logloss += log_loss(y_valid, y_pred, labels=[0, 1])
            n_folds += 1

        # Average over the folds actually evaluated rather than a global
        # constant that may not match the splitter passed in.
        return score_logloss / n_folds
    return objective
        

In [None]:
# Stratified, shuffled K-fold splitter handed to every target's search;
# the fixed random_state keeps the fold assignment repeatable.
cv = StratifiedKFold(
    n_splits=NUM_FOLD,
    shuffle=True,
    random_state=0,
)

In [None]:
# Targets excluded from the search and from model fitting; the submission
# loop writes a constant prediction for them instead.
# NOTE(review): presumably these have too few positive rows to stratify
# across the folds - confirm.
skip_cols = [
    "atp-sensitive_potassium_channel_antagonist",
    "erbb2_inhibitor",
]

In [None]:
# Create the output directories before the (long) search starts, so a missing
# folder cannot discard the result of a finished study.
REPORT_DIR = 'report'
BEST_PARAM_DIR = os.path.join('result', 'best_param_cv')
os.makedirs(REPORT_DIR, exist_ok=True)
os.makedirs(BEST_PARAM_DIR, exist_ok=True)

# NOTE(review): only the columns from position 150 onward are searched here -
# presumably the earlier targets were handled in a separate run; confirm.
for target_col in tqdm(train_targets_scored.columns[150:]):
    if target_col == "sig_id" or target_col in skip_cols:
        continue

    print('##################### CV START: {0} #####################'.format(target_col))

    # optimize by optuna (the default study direction minimizes the objective)
    study = optuna.create_study()
    study.optimize(logistic_elasticnet_cv(train, train_targets_scored, target_col, cv), n_trials=NUM_OPTUNA_TRIAL)

    # save optuna log
    df_trial = study.trials_dataframe()
    df_trial.to_pickle(os.path.join(REPORT_DIR, 'LR_PCA_OptunaLog_'+target_col+'.pkl'))

    # save best params found by CV
    bestparam_filename = 'LR_PCA_BestParamsSelectedByCV_'+target_col+'.pkl'
    with open(os.path.join(BEST_PARAM_DIR, bestparam_filename), 'wb') as f:
        pickle.dump(study.best_params, f)


# Train & predict with the best model

In [None]:
# Work on an independent copy so the sample-submission frame stays untouched.
submission = sub.copy(deep=True)

In [None]:
def logistic_elasticnet_bestparam(data_train, targets_train, data_test, target_col, best_params, submission):
    """Fit a logistic regression with ``best_params`` and store its test predictions.

    Args:
        data_train: Training features; the ``'sig_id'`` column is dropped.
        targets_train: Frame holding the target columns.
        data_test: Test features; the ``'sig_id'`` column is dropped.
        target_col: Name of the target column to fit and predict.
        best_params: Keyword arguments passed to ``LogisticRegression``.
        submission: Frame updated in place; ``submission[target_col]`` is set
            to column 1 of ``predict_proba`` on the test features.

    Returns:
        None. The fitted model is pickled to
        ``result/best_model/LR_PCA_BestModel4test_<target_col>.pkl``.
    """
    print('##################### TraingByBestParam START: {0} #####################'.format(target_col))

    # Create the output directory up front so a missing folder is caught
    # before the model is trained, not after.
    model_dir = os.path.join('result', 'best_model')
    os.makedirs(model_dir, exist_ok=True)

    # prepare data set
    X_train = data_train.drop(["sig_id"], axis=1)
    y_train = targets_train[target_col]
    X_test = data_test.drop(["sig_id"], axis=1)

    # train logistic regression by using best_params
    model = LogisticRegression(**best_params)
    model.fit(X_train, y_train)

    # predict for test: keep the probability in column 1 of predict_proba
    submission[target_col] = model.predict_proba(X_test)[:, 1]

    # save model for test
    modelfile = 'LR_PCA_BestModel4test_'+target_col+'.pkl'
    with open(os.path.join(model_dir, modelfile), 'wb') as f:
        pickle.dump(model, f)
    

In [None]:
# NOTE(review): only the columns from position 150 onward are processed here
# (a full run would iterate over every column); the remaining target columns
# keep whatever values the sample submission holds - confirm this is intended.
for target_col in tqdm(train_targets_scored.columns[150:]):
    if target_col in skip_cols:
        # No model is fitted for skipped targets; write a small constant instead.
        submission[target_col] = 1e-05
        continue
    if target_col == "sig_id":
        continue

    # read best param selected by CV
    bestparam_filename = 'LR_PCA_BestParamsSelectedByCV_'+target_col+'.pkl'
    bestparam_path = os.path.join('result', 'best_param_cv', bestparam_filename)
    with open(bestparam_path, 'rb') as f:
        best_params = pickle.load(f)

    # The pickle stores only the tuned values; add the fixed settings back.
    best_params.update({'penalty': 'l1', 'n_jobs': 8, 'solver': 'liblinear'})

    logistic_elasticnet_bestparam(train, train_targets_scored, test, target_col, best_params, submission)

In [None]:
# Write the predictions to the working directory without the row index.
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)