## **XGBoost Training (one binary classifier per target)**

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import glob
import pickle
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 100000

import gc
from myUtils import *
from feature_generator import feature_v1
import xgboost as xgb
from tqdm import tqdm
import shutil
import copy

from sklearn.preprocessing import LabelEncoder

In [None]:
def label_encode(df, cols):
    """Label-encode the given columns of `df` in place and return `df`.

    For every column name in `cols`, missing values are first replaced by
    the string "NaN" (so they receive a code of their own), the column is
    passed through a fresh `LabelEncoder`, and the encoded values are
    written back under the same column name, aligned on the original index.

    Args:
        df: DataFrame holding the columns to encode; it is modified in place.
        cols: Iterable of column names to encode.

    Returns:
        The same `df` object, with the listed columns replaced by their codes.
    """
    for name in cols:
        filled = df[name].fillna("NaN")
        codes = LabelEncoder().fit_transform(filled)
        # Re-attach the original index so the assignment aligns row by row.
        df[name] = pd.Series(codes, index=filled.index)
    return df


def check_columns(necessary_cols,cols):
    """Check that every required column name is present in `cols`.

    Prints a short report to stdout: either a confirmation that all
    columns are present, or the list of missing names (in the order they
    appear in `necessary_cols`).

    Args:
        necessary_cols: Iterable of column names that must be present.
        cols: Iterable of column names that are actually available.

    Returns:
        True when nothing is missing, False otherwise.
    """
    # A set gives constant-time membership tests for the scan below.
    available = set(cols)
    missing = [name for name in necessary_cols if name not in available]

    print("-- column check completed --")
    if missing:
        print("  !!columns are lacked!!")
        print("   lacked columns:",missing)
        return False

    print("  columns are satisfied")
    return True


class FeaturesMaker_v1(object):
    """Build per-partition (features, target) tables for one target column.

    The feature set consists of every column whose name starts with "g-",
    then every column whose name starts with "c-", followed by the three
    label-encoded columns 'cp_type', 'cp_time' and 'cp_dose'.
    """

    # Columns that are label-encoded and appended to the feature list.
    _CATEGORICAL_COLS = ('cp_type',"cp_time","cp_dose")
    # Name prefixes that mark a column as a feature, in output order.
    _FEATURE_PREFIXES = ("g-", "c-")

    def __init__(self,target_col):
        """Remember the target column and the columns `make_feature` requires.

        Args:
            target_col: Name of the column holding the label to predict.
        """
        self.name = "featuresV1"
        self.feature_exp = "simple features which "

        self.target_col = target_col
        self.necessary_col =  ["sig_id",'cp_type',"cp_time","cp_dose","data_part"] + [target_col]

    def _feature_columns(self, columns):
        """Return the ordered feature column names found in `columns`.

        Columns are matched by *prefix*, not by substring, and the target
        column is always excluded. A substring test would also pick up any
        column that merely contains "g-" or "c-" somewhere in its name
        (possibly the target itself, which would leak the label into the
        features) and could list the same column twice.

        Args:
            columns: Iterable of candidate column names.

        Returns:
            A list: "g-" columns, then "c-" columns (each in their original
            order), then the categorical columns.
        """
        candidates = [
            c for c in columns
            if isinstance(c, str) and c != self.target_col
        ]
        selected = []
        for prefix in self._FEATURE_PREFIXES:
            selected.extend(c for c in candidates if c.startswith(prefix))
        return selected + list(self._CATEGORICAL_COLS)

    def make_feature(self,df):
        """Encode categorical columns and split `df` by its "data_part" value.

        Note that the categorical columns of the passed-in `df` are
        overwritten in place by `label_encode`.

        Args:
            df: DataFrame containing at least the columns listed in
                `self.necessary_col`.

        Returns:
            A dict mapping each distinct "data_part" value to a tuple
            `(features_frame, target_series)`, both indexed by "sig_id";
            or False when a required column is missing.
        """
        # check existence of necessary columns
        if not check_columns(self.necessary_col,df.columns):
            return False

        # label encoding
        df = label_encode(df, cols=list(self._CATEGORICAL_COLS))

        df = df.set_index(["sig_id"],drop=True)

        features = self._feature_columns(df.columns)

        print("-- ",self.name," --")
        print("dim:",len(features))
        print("N:",len(df))
        print("-----------------")

        # split by partition label (e.g. train / test)
        return {
            part: (frame[features], frame[self.target_col])
            for part, frame in df.groupby(by="data_part")
        }

In [4]:
# Load the scored targets and the train/test feature tables.
train_targets = pd.read_csv(os.path.join("..","input","lish-moa","train_targets_scored.csv"))
train_features = pd.read_csv(os.path.join("..","input","lish-moa","train_features.csv"))
test_features =  pd.read_csv(os.path.join("..","input","lish-moa","test_features.csv"))

# Train and save one binary classifier per target column
# (the first column of train_targets is "sig_id", hence the [1:]).
for target in train_targets.columns[1:]:
    feature_maker = FeaturesMaker_v1(target_col=target)

    # Attach the current target to the training features; merge returns a
    # new frame, so train_features itself is never modified.
    train_data = pd.merge(train_features,train_targets[["sig_id",target]],on="sig_id",how="right")
    train_data["data_part"] = "train"

    # Test rows have no label; copy first so test_features stays untouched.
    test_data = test_features.copy()
    test_data[target] = np.nan
    test_data["data_part"] = "test"

    # Train and test are concatenated so that label encoding is fitted on
    # both partitions together.
    data = pd.concat([train_data,test_data])
    data = feature_maker.make_feature(data)
    if data is False:
        # make_feature signals missing columns by returning False; fail with
        # a clear message instead of an opaque error when indexing it below.
        raise ValueError("required columns are missing for target: " + str(target))

    train_X, train_y = data["train"]

    # training XGBoost
    # NOTE(review): the documented XGBClassifier constructor argument is
    # `verbosity`, not `verbose`; left unchanged so log output stays the
    # same -- confirm the intent.
    model = xgb.XGBClassifier(objective="binary:logistic",
                              tree_method="gpu_hist",
                              random_state=0,
                              verbose=2,
                              n_estimators=1000,
                              )

    # NOTE(review): the evaluation set is the training data itself, so early
    # stopping monitors training logloss rather than a held-out score --
    # confirm this is intended.
    model.fit(X=train_X, y=train_y,
              eval_set=[(train_X, train_y)],
              early_stopping_rounds=100,
              verbose=False,
              eval_metric="logloss",
              )

    # One model file per target, grouped under a directory named after the
    # feature set. exist_ok avoids the check-then-create race.
    model_dir = os.path.join("..","input",feature_maker.name+"_XGBoost")
    os.makedirs(model_dir, exist_ok=True)

    model_path = os.path.join(model_dir,target+".mdl")
    model.save_model(model_path)

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------
-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------
-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------
-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------
-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------
-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------
-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------
-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------
-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------
-- column check completed --