## **Logistic Regression with L1 penalty prediction**

In [11]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import glob
import pickle
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 100000

import gc
from myUtils import *
from feature_generator import feature_v1
import xgboost as xgb
from tqdm import tqdm
import shutil
import copy

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

import pickle

import warnings
warnings.simplefilter('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
def label_encode(df, cols):
    """Replace each listed column of ``df`` with integer label codes.

    Missing values are first replaced by the literal string "NaN", so they
    are encoded like any other category. A fresh ``LabelEncoder`` is fitted
    for every column independently.

    Args:
        df: DataFrame holding the columns to encode. The encoded columns are
            assigned back onto this same object.
        cols: Iterable of column names to encode.

    Returns:
        The same ``df`` object, with the listed columns overwritten.
    """
    for name in cols:
        filled = df[name].fillna("NaN")
        codes = LabelEncoder().fit_transform(filled)
        # Re-wrap with the original index so the assignment aligns row by row.
        df[name] = pd.Series(codes, index=filled.index)

    return df


def check_columns(necessary_cols,cols):
    """Report whether every required column name is available.

    A short human-readable report is printed to stdout in both outcomes.

    Args:
        necessary_cols: Iterable of column names that must be present.
        cols: Iterable of column names that are actually available.

    Returns:
        True when no required name is missing, False otherwise.
    """
    available = set(cols)  # set membership keeps the lookup cheap

    missing = [name for name in necessary_cols if name not in available]

    print("-- column check completed --")
    if missing:
        print("  !!columns are lacked!!")
        print("   lacked columns:",missing)
        return False

    print("  columns are satisfied")
    return True


class FeaturesMaker_v1(object):
    """Build (features, target) pairs for one target column, split by "data_part".

    Features are every column whose name contains "g-" or "c-", followed by
    the label-encoded 'cp_type', "cp_time" and "cp_dose" columns.
    """

    def __init__(self,target_col):
        """Remember the target column and the columns ``make_feature`` requires.

        Args:
            target_col: Name of the column to use as the prediction target.
        """
        self.name = "featuresV1"
        self.feature_exp = "simple features which "

        self.target_col = target_col
        self.necessary_col =  ["sig_id",'cp_type',"cp_time","cp_dose","data_part"] + [target_col]

    def make_feature(self,df):
        """Turn a combined frame into per-partition feature/target pairs.

        Args:
            df: DataFrame containing every column in ``self.necessary_col``
                plus the "g-"/"c-" feature columns. It is passed to
                ``label_encode``, which overwrites the three categorical
                columns on the passed object.

        Returns:
            A dict mapping each distinct "data_part" value to a tuple
            ``(features_frame, target_series)``, both indexed by "sig_id";
            or False when a required column is missing.
        """
        # check existence of necessary columns
        if not check_columns(self.necessary_col,df.columns):
            return False

        # label encoding
        categorical = ['cp_type',"cp_time","cp_dose"]
        df = label_encode(df, cols=categorical)

        # use the sample id as the row index of the returned frames
        df = df.set_index(["sig_id"],drop=True)

        # The substring match below would also pick up the target column if
        # its name happened to contain "g-" or "c-"; exclude it explicitly so
        # the label can never end up inside the feature matrix.
        candidates = [c for c in df.columns if c != self.target_col]
        features = [c for c in candidates if "g-" in c]
        features = features + [c for c in candidates if "c-" in c]
        features = features + categorical

        print("-- ",self.name," --")
        print("dim:",len(features))
        print("N:",len(df))
        print("-----------------")

        # split by partition label (e.g. train / test)
        return {
            part: (sub[features], sub[self.target_col])
            for part, sub in df.groupby(by="data_part")
        }

In [None]:
train_targets = pd.read_csv(os.path.join("..","input","lish-moa","train_targets_scored.csv"))
train_features = pd.read_csv(os.path.join("..","input","lish-moa","train_features.csv"))
test_features =  pd.read_csv(os.path.join("..","input","lish-moa","test_features.csv"))

# Train and pickle one independent binary classifier per target column.
# The first column of train_targets is skipped (presumably the "sig_id" key).
for target in tqdm(train_targets.columns[1:]):
    feature_maker = FeaturesMaker_v1(target_col=target)

    # pd.merge returns a new frame, so train_features itself is never modified
    # and no defensive copy is needed here.
    train_data = pd.merge(train_features,train_targets[["sig_id",target]],on="sig_id",how="right")
    train_data["data_part"] = "train"

    # Copy before adding columns so test_features stays clean for the next target.
    test_data = test_features.copy()
    test_data[target] = np.nan
    test_data["data_part"] = "test"

    data = pd.concat([train_data,test_data])
    data = feature_maker.make_feature(data)
    if data is False:
        # make_feature signals missing columns with False; fail with a clear
        # message instead of an opaque "'bool' object is not subscriptable".
        raise ValueError("required columns are missing for target: " + target)

    # training L1-penalised logistic regression.
    # The solver is named explicitly: the default solver of recent
    # scikit-learn releases (lbfgs) rejects penalty="l1". n_jobs is omitted
    # because liblinear does not use it.
    model = LogisticRegression(penalty="l1",solver="liblinear")

    model.fit(X=data["train"][0], y=data["train"][1])

    out_dir = os.path.join("..","input",feature_maker.name+"_LogisticRegL1")
    os.makedirs(out_dir,exist_ok=True)

    model_path = os.path.join(out_dir,target+".mdl")

    with open(model_path,"wb") as f:
        pickle.dump(model,f)

  0%|                                                                                          | 0/206 [00:00<?, ?it/s]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  0%|▍                                                                                 | 1/206 [00:03<12:48,  3.75s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  1%|▊                                                                                 | 2/206 [00:08<13:55,  4.09s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  1%|█▏                                                                                | 3/206 [00:12<13:46,  4.07s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  2%|█▌                                                                                | 4/206 [00:55<52:37, 15.63s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  2%|█▉                                                                              | 5/206 [01:32<1:13:45, 22.02s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  3%|██▎                                                                             | 6/206 [01:48<1:07:21, 20.21s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  3%|██▊                                                                               | 7/206 [01:58<56:57, 17.18s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  4%|███▏                                                                              | 8/206 [02:14<55:46, 16.90s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  4%|███▌                                                                              | 9/206 [02:17<41:23, 12.60s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  5%|███▊                                                                           | 10/206 [02:50<1:01:28, 18.82s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  5%|████▏                                                                          | 11/206 [03:31<1:22:32, 25.40s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  6%|████▌                                                                          | 12/206 [03:37<1:03:12, 19.55s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  6%|█████                                                                            | 13/206 [03:39<46:23, 14.42s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  7%|█████▌                                                                           | 14/206 [03:44<37:17, 11.65s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  7%|█████▉                                                                           | 15/206 [03:48<29:09,  9.16s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  8%|██████▎                                                                          | 16/206 [03:51<23:06,  7.30s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  8%|██████▋                                                                          | 17/206 [03:59<23:57,  7.61s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  9%|███████                                                                          | 18/206 [04:22<38:14, 12.21s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


  9%|███████▍                                                                         | 19/206 [04:36<40:11, 12.90s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 10%|███████▊                                                                         | 20/206 [04:43<34:31, 11.14s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 10%|████████▎                                                                        | 21/206 [04:49<29:21,  9.52s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 11%|████████▋                                                                        | 22/206 [04:59<29:32,  9.63s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 11%|█████████                                                                        | 23/206 [05:02<23:10,  7.60s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 12%|█████████▍                                                                       | 24/206 [05:07<20:38,  6.80s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 12%|█████████▊                                                                       | 25/206 [05:10<17:31,  5.81s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 13%|██████████▏                                                                      | 26/206 [05:14<15:22,  5.12s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 13%|██████████▌                                                                      | 27/206 [05:17<13:39,  4.58s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 14%|███████████                                                                      | 28/206 [05:21<12:59,  4.38s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 14%|███████████▍                                                                     | 29/206 [05:37<22:48,  7.73s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 15%|███████████▊                                                                     | 30/206 [05:42<20:26,  6.97s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 15%|████████████▏                                                                    | 31/206 [05:46<18:10,  6.23s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 16%|████████████▌                                                                    | 32/206 [05:54<19:10,  6.61s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 16%|████████████▉                                                                    | 33/206 [06:00<18:42,  6.49s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 17%|█████████████▎                                                                   | 34/206 [06:03<15:17,  5.34s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 17%|█████████████▊                                                                   | 35/206 [06:05<12:37,  4.43s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 17%|██████████████▏                                                                  | 36/206 [06:08<11:08,  3.93s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 18%|██████████████▌                                                                  | 37/206 [06:18<16:40,  5.92s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 18%|██████████████▉                                                                  | 38/206 [06:21<14:07,  5.05s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 19%|███████████████▎                                                                 | 39/206 [06:27<14:17,  5.14s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 19%|███████████████▋                                                                 | 40/206 [06:29<12:12,  4.41s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 20%|████████████████                                                                 | 41/206 [06:38<15:57,  5.80s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 20%|████████████████▌                                                                | 42/206 [06:48<19:04,  6.98s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 21%|████████████████▉                                                                | 43/206 [06:54<18:19,  6.75s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 21%|█████████████████▎                                                               | 44/206 [07:23<35:45, 13.24s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 22%|█████████████████▋                                                               | 45/206 [07:44<41:52, 15.60s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 22%|██████████████████                                                               | 46/206 [08:07<47:43, 17.90s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 23%|██████████████████▍                                                              | 47/206 [08:10<35:23, 13.35s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 23%|██████████████████▊                                                              | 48/206 [08:16<29:20, 11.14s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 24%|███████████████████▎                                                             | 49/206 [08:21<24:51,  9.50s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 24%|███████████████████▋                                                             | 50/206 [08:31<24:34,  9.45s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 25%|████████████████████                                                             | 51/206 [08:35<20:38,  7.99s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 25%|████████████████████▍                                                            | 52/206 [08:42<19:28,  7.59s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 26%|████████████████████▊                                                            | 53/206 [08:46<16:53,  6.63s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


 26%|█████████████████████▏                                                           | 54/206 [08:49<13:45,  5.43s/it]

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------
