In [None]:
import statistics

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

ライブラリのimportをします。

# データの中身の確認

In [None]:
# Read the four competition CSV files in one pass; the unpacking order below
# matches the order of the paths.
train_features, train_targets, test_features, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/lish-moa/train_features.csv',
        '/kaggle/input/lish-moa/train_targets_scored.csv',
        '/kaggle/input/lish-moa/test_features.csv',
        '/kaggle/input/lish-moa/sample_submission.csv',
    )
)

データを読み取ります。
X_trainがtrain_features、Y_trainがtrain_targets、X_testがtest_featuresにあたります。

In [None]:
# Preview the first rows of the training feature table.
train_features.head()

In [None]:
# Preview the first rows of the training target table.
train_targets.head()

In [None]:
# Preview the first rows of the test feature table.
test_features.head()

In [None]:
# Preview the first rows of the sample submission file.
submission.head()

それぞれのファイルの中身を確認します。
なお、ここでは欠損値の確認を省略していますが、欠損値はいずれのデータにも含まれていませんでした。

submissionでは、全ての確率を0.5としているようです。

# データの前処理

ここからデータの前処理に入ります。
train_features及びtest_featuresのデータの分布について確認します。

In [None]:
# Summary statistics of the numeric training feature columns.
train_features.describe()

In [None]:
# Summary statistics of the numeric test feature columns.
test_features.describe()

g-0からc-99までの間で最小値と最大値が平均から大きく外れているデータがあるのが確認できます。


全部のデータを表示するのは困難なので、とりあえずg-0からc-99の中からランダムに選んだ4つの列について、ヒストグラムを表示してみることにします。

In [None]:
# Histograms of four sample feature columns from the training set.
# Each panel is titled with its column name so the figure can be read on its
# own; the four panels are drawn by one loop instead of repeated stanzas.
fig, axes = plt.subplots(2, 2, figsize=(8, 6))
for ax, column in zip(axes.ravel(), ["g-0", "c-0", "g-178", "c-32"]):
    ax.hist(train_features[column])
    ax.set_title("train_features: " + column)
fig.tight_layout()

plt.show()

In [None]:
# Histograms of the same four sample feature columns from the test set.
# Each panel is titled with its column name so the figure can be read on its
# own; the four panels are drawn by one loop instead of repeated stanzas.
fig, axes = plt.subplots(2, 2, figsize=(8, 6))
for ax, column in zip(axes.ravel(), ["g-0", "c-0", "g-178", "c-32"]):
    ax.hist(test_features[column])
    ax.set_title("test_features: " + column)
fig.tight_layout()

plt.show()

トレインデータにもテストデータにも外れ値がある以上、単に平均から外れているからといって外れ値として処理するより、上手くデータを処理した方がいいのではないかと考えました。

前処理をする前に、一時的にトレインデータとテストデータを結合します。

In [None]:
# Stack the train rows on top of the test rows so both receive identical
# preprocessing. ignore_index=True renumbers the rows 0..n-1; without it the
# test rows would reuse the train row labels, leaving duplicate index labels.
ALL = pd.concat([train_features, test_features], ignore_index=True)

In [None]:
# Show the (rows, columns) size of the combined train + test frame.
ALL.shape

In [None]:
# Preview the first rows of the combined frame.
ALL.head()

まずは、先ほどの外れ値の処理の前に、cp_typeとcp_doseを整数値(0,1)に変換します。
また、cp_timeは3種類(24,48,72)しかないので、単純に24を0、48を1、72を2にします。

In [None]:
# Encode the treatment descriptors as small integers.
# The encoded column is assigned back to the frame rather than calling
# replace(..., inplace=True) on a selected column: in-place edits made through
# a column selection are not written back to the frame under pandas
# copy-on-write. Any label missing from a mapping becomes NaN.
ALL["cp_type"] = ALL["cp_type"].map({"trt_cp": 0, "ctl_vehicle": 1})
ALL["cp_dose"] = ALL["cp_dose"].map({"D1": 0, "D2": 1})
# Integer-divide by 24 and subtract 1, so 24 -> 0, 48 -> 1, 72 -> 2.
ALL["cp_time"] = ALL["cp_time"] // 24 - 1

g-0からc-99まで、平均を0、標準偏差を1とする標準化を行います。

In [None]:
def standardization(l):
    """Rescale numbers to zero mean and unit sample standard deviation.

    Parameters
    ----------
    l : sequence of numbers
        A list, NumPy array or pandas Series of numeric values.

    Returns
    -------
    list of float
        ``(x - mean) / stdev`` for every ``x`` in ``l``, where ``stdev`` is the
        sample standard deviation (n - 1 in the denominator, as in
        ``statistics.stdev``). When every value is identical the result is all
        zeros instead of a division by zero.

    Raises
    ------
    statistics.StatisticsError
        If ``l`` holds fewer than two values.
    """
    values = np.asarray(l, dtype=float)
    if values.size < 2:
        raise statistics.StatisticsError(
            "standardization requires at least two data points"
        )
    # A constant column has zero spread; compare min and max directly so that
    # floating-point rounding in the mean cannot produce a tiny non-zero stdev.
    if values.max() == values.min():
        return [0.0] * values.size
    centered = values - values.mean()
    return (centered / values.std(ddof=1)).tolist()

In [None]:
# Standardize every column except the identifier and the three treatment
# descriptor columns.
for column in ALL.columns:
    if column not in ("sig_id", "cp_type", "cp_time", "cp_dose"):
        ALL[column] = standardization(ALL[column])

In [None]:
# Preview the first rows of ALL at this point in the pipeline.
ALL.head()

In [None]:
# Histograms of four sample feature columns from the combined frame.
# Each panel is titled with its column name so the figure can be read on its
# own; the four panels are drawn by one loop instead of repeated stanzas.
fig, axes = plt.subplots(2, 2, figsize=(8, 6))
for ax, column in zip(axes.ravel(), ["g-0", "c-0", "g-178", "c-32"]):
    ax.hist(ALL[column])
    ax.set_title("ALL: " + column)
fig.tight_layout()

plt.show()

# 特徴量エンジニアリング

特徴量を追加します。

In [None]:
# List the column labels currently present in ALL.
ALL.columns

In [None]:
# Collect the column names by prefix: 'g-' columns go to GENES and 'c-'
# columns go to CELLS, in their original column order.
GENES, CELLS = [], []
for column_name in train_features.columns:
    if column_name.startswith('g-'):
        GENES.append(column_name)
    if column_name.startswith('c-'):
        CELLS.append(column_name)

In [None]:
# Row-wise summary statistics over three column groups: the gene columns
# ("g"), the cell columns ("c") and both together ("gc").
# Twelve columns are appended in total, four per group.
for prefix, group_columns in (("g", GENES), ("c", CELLS), ("gc", CELLS + GENES)):
    group_values = ALL[group_columns]
    ALL[prefix + '_sum'] = group_values.sum(axis=1)
    ALL[prefix + '_std'] = group_values.std(axis=1)
    ALL[prefix + '_kurt'] = group_values.kurtosis(axis=1)  # kurtosis
    ALL[prefix + '_skew'] = group_values.skew(axis=1)  # skewness

In [None]:
def _signed_excess(values):
    """Return how far each value lies beyond half the column's largest magnitude.

    For a column with largest absolute value MAX, a value with |x| <= MAX/2
    maps to 0 and any other value maps to sign(x) * (|x| - MAX/2).
    np.sign gives 0 for x == 0, so zeros stay 0 rather than becoming NaN
    through a 0/0 division.
    """
    magnitude = values.abs()
    half_max = magnitude.max() / 2
    return np.sign(values) * (magnitude - half_max).clip(lower=0)


def _add_high_features(frame, columns):
    """Append a '<column>High' feature for each column; return how many were added."""
    for column in columns:
        # to_numpy() assigns by position, independent of the frame's row labels.
        frame[column + "High"] = _signed_excess(frame[column]).to_numpy()
    return len(columns)


# The thresholding used to run in a loop that only rebound its loop variable,
# so it was never stored and each "High" column was a copy of its source
# column. The helpers above apply it to the whole column at once.
# ADD_CNT counts the columns appended by feature engineering: it starts at 12
# (presumably the twelve row-wise aggregate columns) and grows by one for
# every "High" column.
ADD_CNT = 12
ADD_CNT += _add_high_features(ALL, GENES)
print("OK")

ADD_CNT += _add_high_features(ALL, CELLS)

In [None]:
# Standardize the last ADD_CNT columns of ALL.
for added_column in list(ALL.columns[-ADD_CNT:]):
    ALL[added_column] = standardization(ALL[added_column])

In [None]:
# Summary statistics of the numeric columns in ALL.
ALL.describe()

In [None]:
# Remove the sig_id column from ALL.
ALL = ALL.drop(columns=["sig_id"])

In [None]:
# Preview the first rows of ALL after sig_id was dropped.
ALL.head()

列数が多いので、まず主成分分析(PCA)で次元を削減します。

In [None]:
# Project the feature matrix onto its leading principal components.
# random_state is fixed so that a randomized SVD solver, if scikit-learn
# selects one, returns the same components on every run.
N_COMPONENTS = 150
pca = PCA(n_components=N_COMPONENTS, random_state=0)
Columns = ["d" + str(i + 1) for i in range(N_COMPONENTS)]  # d1 ... d150
ALL = pd.DataFrame(pca.fit_transform(ALL), columns=Columns)
# Principal components are continuous scores, so none of them is declared
# categorical. LightGBM casts categorical features to integers, which would
# discard the fractional part of every component.
categorical_features = []

In [None]:
# Preview the first rows of the PCA-transformed frame.
ALL.head()

In [None]:
# Fraction of the total variance explained by each fitted component.
pca.explained_variance_ratio_

In [None]:
# Running total of the explained-variance fractions across components.
pca.explained_variance_ratio_.cumsum()

In [None]:
def plus(l):
    """Shift a sequence upward so that its smallest element becomes zero.

    When the minimum of ``l`` is negative, its absolute value is added to
    every element; otherwise the values are returned unchanged. The result is
    always a new list.
    """
    offset = max(-min(l), 0)
    return [value + offset for value in l]

In [None]:
# Shift every column of ALL so that its minimum is no longer negative.
for component in ALL.columns:
    ALL[component] = plus(ALL[component])

In [None]:
# Preview the first rows of ALL after the shift.
ALL.head()

# 学習(light_gbm+交差検証)

In [None]:
# Undo the earlier train/test concatenation. ALL holds the training rows
# first, followed by the test rows, so a positional slice at
# len(train_features) recovers both parts without shuffling.
y_Train = train_targets
n_train_rows = len(train_features)
x_train = ALL.iloc[:n_train_rows].copy()
# Renumber the test rows from 0.
x_test = ALL.iloc[n_train_rows:].reset_index(drop=True)

In [None]:
# Preview the first rows of the training feature matrix.
x_train.head()

In [None]:
# Preview the first rows of the test feature matrix.
x_test.head()

In [None]:
# Preview the first rows of the training targets.
y_Train.head()

In [None]:
# No column is declared categorical: the model inputs are continuous PCA
# components (d1, d2, ...). LightGBM casts categorical features to integers
# and treats them as unordered categories, which would discard the
# fractional part of every component.
categorical_features = []
categorical_features

In [None]:
# Print the shape of the train features, test features and train targets.
for frame in (x_train, x_test, y_Train):
    print(frame.shape)

In [None]:
# Train one binary LightGBM model per target column with 3-fold stratified
# cross-validation, and write the fold-averaged test predictions into
# `submission`.
#
# ALL_SCORE accumulates [sum of per-fold best validation log-loss, fold count]
# over every target, so ALL_SCORE[0] / ALL_SCORE[1] is the mean CV log-loss.
ALL_SCORE=[0,0]

# Hyper-parameters and the splitter are the same for every target, so they are
# built once. With a fixed integer random_state the splitter is deterministic
# for a given target.
params = {
    'objective': 'binary',
    'max_bin': 450,
    'learning_rate': 0.02,
    'num_leaves': 63
}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# Early stopping and periodic logging are passed as callbacks because the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train were
# removed in LightGBM 4. The logging callback is named `log_evaluation` in
# recent releases and `print_evaluation` in older ones.
_log_evaluation = getattr(lgb, "log_evaluation", None) or getattr(lgb, "print_evaluation")

for target_name in y_Train:
    if target_name=="sig_id":
        continue

    y_train=y_Train[target_name]

    y_preds = []
    models = []
    # Out-of-fold predictions for this target; filled below but not used in
    # the score computed by this cell.
    oof_train = np.zeros((len(x_train),))

    for fold_id, (train_index, valid_index) in enumerate(cv.split(x_train,y_train)):
        # cv.split yields positional indices, so select rows with .iloc.
        x_tr = x_train.iloc[train_index]
        x_val = x_train.iloc[valid_index]
        y_tr = y_train.iloc[train_index]
        y_val = y_train.iloc[valid_index]
        lgb_train = lgb.Dataset(x_tr, y_tr, categorical_feature=categorical_features)
        lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train, categorical_feature=categorical_features)
        model = lgb.train(
            params, lgb_train,
            valid_sets=[lgb_train, lgb_eval],
            num_boost_round=1000,
            # Fresh callback objects per fold: the early-stopping callback
            # keeps internal state between iterations.
            callbacks=[
                lgb.early_stopping(stopping_rounds=10),
                _log_evaluation(period=10),
            ],
        )

        oof_train[valid_index] = model.predict(x_val, num_iteration=model.best_iteration)

        y_pred = model.predict(x_test, num_iteration=model.best_iteration)

        y_preds.append(y_pred)
        models.append(model)

    # Average the per-fold test predictions for this target.
    submission[target_name]=sum(y_preds)/len(y_preds)

    # 'valid_1' is the second entry of valid_sets, i.e. the validation fold.
    scores = [
        m.best_score['valid_1']['binary_logloss'] for m in models
    ]
    ALL_SCORE[0]+=sum(scores)
    ALL_SCORE[1]+=len(scores)

In [None]:
# Preview the first rows of the submission table.
submission.head()

In [None]:
# ALL_SCORE is a two-element list; report its first element divided by its
# second.
score_total, score_count = ALL_SCORE
print("CV_score is ", score_total / score_count)

In [None]:
# Write the submission table to submission.csv without the index column.
submission.to_csv('submission.csv', index=False)