In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file under it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# データ読み込み

In [None]:
# Load the competition CSVs: train (contains the Transported target), test,
# and the sample submission frame that is filled in and written out later.
DATA_DIR = '../input/spaceship-titanic'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
submit = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# EDA

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test.info()

In [None]:
# First rows of the sample submission, to see the expected output columns.
submit.head()

In [None]:
!pip install sweetviz
import sweetviz as sv

In [None]:
# Sweetviz EDA on the raw data (V1 reports).

# 1) Profile of the training frame on its own.
train_report_html = "sweetviz_report_Spaceship_train_V1.html"
my_report_train = sv.analyze(train)
my_report_train.show_html(train_report_html)

# 2) Train vs. test comparison; "Transported" is passed as the target feature.
compare_report_html = "sweetviz_report_Spaceship_trainVStest_V1.html"
my_report_trainVStest = sv.compare([train, "Train"], [test, "Test"], "Transported")
my_report_trainVStest.show_html(compare_report_html)

# 前処理 
1. Cabin分裂(deck(encoding),side(encoding),num(そのまま))
2. HomePlanetとDestination合併
3. 家族(nameから)
4. 同室人数
5. カテゴリ変数の欠損値補完
6. カテゴリ変数の変換 (HomePlanet・Destination・CryoSleep・VIP・Transportedを数値変換)
7. PassengerId・Cabin・Nameを削除
8. clipping
9. binning

In [None]:
# Split Cabin ("deck/num/side") into three features. Cabin_Deck and Cabin_Side
# stay categorical (they are label-encoded later); Cabin_Num is kept numeric.

CabinAry_train = train["Cabin"].str.split("/", expand=True)
# Fixed: this used to split train["Cabin"], so the test frame silently
# received the cabin features of the training rows with the same index.
CabinAry_test = test["Cabin"].str.split("/", expand=True)

for df, parts in ((train, CabinAry_train), (test, CabinAry_test)):
    df["Cabin_Deck"] = parts[0]
    # The split yields object dtype, which LightGBM cannot consume, so the
    # cabin number is cast to float (rows with a missing Cabin become NaN).
    df["Cabin_Num"] = parts[1].astype(float)
    df["Cabin_Side"] = parts[2]

In [None]:
# Combine HomePlanet and Destination into one route feature by string
# concatenation; the result is missing whenever either part is missing.
for frame in (train, test):
    frame["Home×Dest"] = frame["HomePlanet"] + frame["Destination"]

In [None]:
# Family feature: the second space-separated token of Name
# (presumably the surname -- confirm against the data).
for frame in (train, test):
    name_parts = frame["Name"].str.split(" ", expand=True)
    frame["Family"] = name_parts[1]

In [None]:
# Number of passengers sharing each passenger's cabin (SameRoomNum).
#
# value_counts() ignores missing cabins, so mapping a missing Cabin through it
# yields NaN directly. This replaces the previous per-row list.count() loop,
# which was quadratic, wrote through chained indexing
# (train["SameRoomNum"][i] = ...), and needed the count of missing cabins
# (199 / 100) patched back to NaN afterwards.
# Counts are taken within each frame separately, as before.
for df in (train, test):
    df["SameRoomNum"] = df["Cabin"].map(df["Cabin"].value_counts())

> 👆👆　**199 を NaN に戻したいけどうまくいかない、全探索だと時間がかかりすぎる**

In [None]:
# Spot-check the first 20 SameRoomNum values of the test frame.
test["SameRoomNum"].head(20)

In [None]:
# Spot-check the first 20 SameRoomNum values of the training frame.
train["SameRoomNum"].head(20)

In [None]:
# Fill missing values in the categorical columns.

cat_columns_train = ["HomePlanet","Destination","CryoSleep","VIP","Cabin_Deck","Cabin_Side", "Home×Dest", "Family", "Transported"]
# The test frame has the same categorical columns minus the target.
cat_columns_test = [c for c in cat_columns_train if c != "Transported"]

# Replace missing values with the placeholder category "unknow".
# Fixed: fillna() returns a new Series, and the result used to be discarded,
# so nothing was actually filled. The cast to str keeps each column uniformly
# typed (CryoSleep/VIP would otherwise mix booleans with the placeholder
# string), which the label encoding that follows requires.
for c in cat_columns_train:
    train[c] = train[c].fillna("unknow").astype(str)
for c in cat_columns_test:
    test[c] = test[c].fillna("unknow").astype(str)

In [None]:
# Convert the categorical columns (HomePlanet, Destination, CryoSleep, VIP,
# Cabin_Deck, Cabin_Side, Home×Dest, Family, Transported) to integer codes.

from sklearn.preprocessing import LabelEncoder

# Fixed: train and test used to get independently fitted encoders, so the same
# category (e.g. a Family name or a deck letter) could map to different
# integers in the two frames and the model's splits did not transfer to test.
# Each column is now encoded with one encoder fitted on the values of every
# frame that contains it. Values are cast to str first so the encoder always
# sees a uniformly typed column, including when missing values are present.
for c in dict.fromkeys(cat_columns_train + cat_columns_test):
    frames = [
        df
        for df, cols in ((train, cat_columns_train), (test, cat_columns_test))
        if c in cols
    ]
    values = [df[c].astype(str) for df in frames]

    le = LabelEncoder()
    le.fit(pd.concat(values, ignore_index=True))
    for df, col_values in zip(frames, values):
        df[c] = le.transform(col_values)

In [None]:
# First 20 rows of the training frame after the preprocessing cells above.
train.head(20)

In [None]:
# Display the training frame after preprocessing.
train

In [None]:
# Display the test frame after preprocessing.
test

In [None]:
# Sweetviz EDA on the preprocessed data (V2 reports).

# 1) Profile of the preprocessed training frame.
train_report_html = "sweetviz_report_Spaceship_train_V2.html"
my_report_train = sv.analyze(train)
my_report_train.show_html(train_report_html)

# 2) Preprocessed train vs. test, with "Transported" as the target feature.
compare_report_html = "sweetviz_report_Spaceship_trainVStest_V2.html"
my_report_trainVStest = sv.compare([train, "Train"], [test, "Test"], "Transported")
my_report_trainVStest.show_html(compare_report_html)

In [None]:
# Drop PassengerId, Cabin and Name from both frames (in place).
unused_columns = ['PassengerId', 'Cabin', 'Name']
for frame in (train, test):
    frame.drop(columns=unused_columns, inplace=True)

In [None]:
# clipping (TODO: not implemented yet)

In [None]:
# binning (TODO: not implemented yet)

# 学習

In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [None]:
# Separate the target column from the feature matrix.
y_train = train["Transported"]
X_train = train.drop(columns=["Transported"])

In [None]:
# Baseline: 5-fold stratified cross-validation with LightGBM.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Model parameters (identical for every fold, so defined once).
# Fixed: the key was misspelled 'oblective', an unknown parameter, so LightGBM
# fell back to its default objective instead of binary classification.
lgbm_params = {'objective': 'binary'}

score_list = []
models = []

for train_index, valid_index in fold.split(X_train, y_train):
    train_x, valid_x = X_train.iloc[train_index], X_train.iloc[valid_index]
    train_y, valid_y = y_train.iloc[train_index], y_train.iloc[valid_index]

    lgb_train = lgb.Dataset(train_x, train_y)
    lgb_valid = lgb.Dataset(valid_x, valid_y)

    # Train with early stopping after 20 rounds without improvement and no
    # per-iteration logging. Both are passed as callbacks because the
    # early_stopping_rounds / verbose_eval keyword arguments were removed
    # from lgb.train() in LightGBM 4.0.
    gbm = lgb.train(
        params=lgbm_params,
        train_set=lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        callbacks=[
            lgb.early_stopping(stopping_rounds=20, verbose=False),
            lgb.log_evaluation(period=0),
        ],
    )

    # Fold accuracy in percent, thresholding predictions at 0.5.
    oof = (gbm.predict(valid_x) > 0.5).astype(int)
    score_list.append(round(accuracy_score(valid_y, oof) * 100, 2))
    models.append(gbm)
print(score_list, "平均score", round(np.mean(score_list), 2))

In [None]:
# Predict the test set with every fold model and average the predictions.

# One column per fold model. Sized from `models` instead of a hard-coded 5 so
# it stays correct if the number of folds changes.
test_pred = np.zeros((len(test), len(models)))

for fold_, gbm in enumerate(models):
    test_pred[:, fold_] = gbm.predict(test)

# Average across folds, then threshold at 0.5.
pred = (np.mean(test_pred, axis=1) > 0.5).astype(int)

# The submission column is written as booleans (True/False).
submit["Transported"] = pred.astype(bool)

submit.to_csv("spaceship_StratifiedKFold_5-fold_CV.csv", index=False)

In [None]:
# Gain-based feature importance of the last fold's model, largest first.
# models[-1] is referenced explicitly instead of the loop variable `gbm` left
# over from an earlier cell (it is the same model).
pd.DataFrame({
    "特徴": X_train.columns,
    "importance": models[-1].feature_importance(importance_type="gain"),
}).sort_values("importance", ascending=False)

In [None]:
# 特徴量重要度の可視化

import seaborn as sns
import matplotlib.pyplot as plt
def visualize_importance(models, feat_train_df):
    """Plot gain-based feature importance for a list of LightGBM models.

    The spread of each feature's importance across the models (one per CV
    fold) is drawn as a horizontal boxen plot. Features are ordered by their
    total importance over all models and at most the top 50 are shown.

    Args:
        models: List of trained LightGBM models exposing
            ``feature_importance(importance_type="gain")``.
        feat_train_df: DataFrame used for training; only its ``columns`` are
            read, and they must line up with the models' features.

    Returns:
        ``(fig, ax)``: the matplotlib figure and axes holding the plot.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError("visualize_importance() needs at least one model")

    # Build one frame per fold and concatenate once, instead of growing a
    # DataFrame inside the loop starting from an empty frame.
    per_fold = [
        pd.DataFrame({
            "feature_importance": model.feature_importance(importance_type="gain"),
            "column": feat_train_df.columns,
            "fold": fold_no,
        })
        for fold_no, model in enumerate(models, start=1)
    ]
    feature_importance_df = pd.concat(per_fold, axis=0, ignore_index=True)

    # Features sorted by total gain over all folds, top 50 only.
    order = (
        feature_importance_df.groupby("column")["feature_importance"]
        .sum()
        .sort_values(ascending=False)
        .index[:50]
    )

    # Figure height grows with the number of plotted features.
    fig, ax = plt.subplots(figsize=(8, max(6, len(order) * .25)))
    sns.boxenplot(data=feature_importance_df,
                  x="feature_importance",
                  y="column",
                  order=order,
                  ax=ax,
                  palette="viridis",
                  orient="h")
    ax.tick_params(axis="x", rotation=90)
    ax.set_title("Importance")
    ax.grid()
    fig.tight_layout()
    return fig, ax

# Pass the full feature frame: only its column names are used, and X_train
# does not depend on `train_x` leaking out of the training loop.
fig, ax = visualize_importance(models, X_train)

In [None]:
# Feature/target frames for the experiment without the CryoSleep feature.
y_train = train["Transported"]
X_train_dpCryo = train.drop(columns=["Transported", "CryoSleep"])

test_dpCryo = test.drop(columns=["CryoSleep"])

In [None]:
# Experiment: same 5-fold CV, with the CryoSleep feature dropped.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Model parameters (identical for every fold, so defined once).
# Fixed: the key was misspelled 'oblective', an unknown parameter, so LightGBM
# fell back to its default objective instead of binary classification.
lgbm_params = {'objective': 'binary'}

score_list = []
models = []

# Fixed: `fold_` used to be a stale variable left over from an earlier cell,
# so every iteration printed the same fold number (and a fresh kernel that
# skipped that cell raised NameError). It is now the real fold index.
for fold_, (train_index, valid_index) in enumerate(fold.split(X_train_dpCryo, y_train)):
    print(f"fold{fold_ + 1} start")
    train_x, valid_x = X_train_dpCryo.iloc[train_index], X_train_dpCryo.iloc[valid_index]
    train_y, valid_y = y_train.iloc[train_index], y_train.iloc[valid_index]

    lgb_train = lgb.Dataset(train_x, train_y)
    lgb_valid = lgb.Dataset(valid_x, valid_y)

    # Train with early stopping after 20 rounds without improvement and no
    # per-iteration logging. Both are passed as callbacks because the
    # early_stopping_rounds / verbose_eval keyword arguments were removed
    # from lgb.train() in LightGBM 4.0.
    gbm = lgb.train(
        params=lgbm_params,
        train_set=lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        callbacks=[
            lgb.early_stopping(stopping_rounds=20, verbose=False),
            lgb.log_evaluation(period=0),
        ],
    )

    # Fold accuracy in percent, thresholding predictions at 0.5.
    oof = (gbm.predict(valid_x) > 0.5).astype(int)
    score_list.append(round(accuracy_score(valid_y, oof) * 100, 2))
    models.append(gbm)
print(score_list, "平均score", round(np.mean(score_list), 2))

In [None]:
# Predict the test set (without CryoSleep) with every fold model and average.

# One column per fold model. Sized from `models` instead of a hard-coded 5 so
# it stays correct if the number of folds changes.
test_pred = np.zeros((len(test_dpCryo), len(models)))

for fold_, gbm in enumerate(models):
    test_pred[:, fold_] = gbm.predict(test_dpCryo)

# Average across folds, then threshold at 0.5.
pred = (np.mean(test_pred, axis=1) > 0.5).astype(int)

# The submission column is written as booleans (True/False).
submit["Transported"] = pred.astype(bool)

submit.to_csv("spaceship_StratifiedKFold_5-fold_CV_dpCryo.csv", index=False)

In [None]:
# 特徴量重要度の可視化

import seaborn as sns
import matplotlib.pyplot as plt
# NOTE: same function as the one defined in the earlier feature-importance
# cell; it is repeated here so this cell can be run on its own.
def visualize_importance(models, feat_train_df):
    """Plot gain-based feature importance for a list of LightGBM models.

    The spread of each feature's importance across the models (one per CV
    fold) is drawn as a horizontal boxen plot. Features are ordered by their
    total importance over all models and at most the top 50 are shown.

    Args:
        models: List of trained LightGBM models exposing
            ``feature_importance(importance_type="gain")``.
        feat_train_df: DataFrame used for training; only its ``columns`` are
            read, and they must line up with the models' features.

    Returns:
        ``(fig, ax)``: the matplotlib figure and axes holding the plot.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError("visualize_importance() needs at least one model")

    # Build one frame per fold and concatenate once, instead of growing a
    # DataFrame inside the loop starting from an empty frame.
    per_fold = [
        pd.DataFrame({
            "feature_importance": model.feature_importance(importance_type="gain"),
            "column": feat_train_df.columns,
            "fold": fold_no,
        })
        for fold_no, model in enumerate(models, start=1)
    ]
    feature_importance_df = pd.concat(per_fold, axis=0, ignore_index=True)

    # Features sorted by total gain over all folds, top 50 only.
    order = (
        feature_importance_df.groupby("column")["feature_importance"]
        .sum()
        .sort_values(ascending=False)
        .index[:50]
    )

    # Figure height grows with the number of plotted features.
    fig, ax = plt.subplots(figsize=(8, max(6, len(order) * .25)))
    sns.boxenplot(data=feature_importance_df,
                  x="feature_importance",
                  y="column",
                  order=order,
                  ax=ax,
                  palette="viridis",
                  orient="h")
    ax.tick_params(axis="x", rotation=90)
    ax.set_title("Importance")
    ax.grid()
    fig.tight_layout()
    return fig, ax

# Pass the full feature frame of this experiment: only its column names are
# used, and it does not depend on `train_x` leaking out of the training loop.
fig, ax = visualize_importance(models, X_train_dpCryo)

# 課題
1. trainとtestをdfでまとめて特徴量を作る
2. for文を減らして実行速度を上げる
3. 