In [1]:
from sklearn import preprocessing, impute, model_selection
import pandas as pd
import numpy as np
import random
import os

In [2]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
seed_everything()

In [3]:
DATA_DIR = "spaceship-titanic"

def filepath(filename):
    return os.path.join(DATA_DIR,filename)

In [38]:
train_df = pd.read_csv(filepath("train.csv"),index_col="PassengerId")
test_df  = pd.read_csv(filepath("test.csv"), index_col="PassengerId")

# Add PasssengerId sinze we need it for feature engineering
# PAssengerIdは学習に使用したいのでカラムとして追加する．

train_df["PassengerId"] = train_df.index
test_df["PassengerId"]  = test_df.index

In [39]:
len(train_df),len(test_df)

(8693, 4277)

In [40]:
train_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,PassengerId
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001_01
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002_01
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003_01
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003_02
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004_01


In [41]:
train_df["Destination"].unique()

array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', nan], dtype=object)

### 個人的データ解釈
- PassengerId   :そのまま`[str]`
- HomePlanet    :おうちのある星(Europa,Earth,Mars,nan)`[str]`
- CryoSleep     :コールドスリープしてるかどうか`[bool]`
- Cabin         :"[A-Z]/[0-9]/[SorP]"`[str]`
- Destination   :目的地("TRAPPIST-1e","PSO J318.5-22","55 Cancri e",nan)`[str]`
- Age           :年齢`[int]`
- VIP           :VIPかどうか`[bool]`
- RoomService   :ルームサービスでつかった金額       `[int]`
- FoodCourt     :フードコートでつかった金額         `[int]`
- ShoppingMall  :ショッピングモールでつかった金額   `[int]`
- Spa           :スパで使った金額                  `[int]`
- VRDeck        :(?)  `[int]`
- Name          :名前 `[str]`
- Transported   :到着したかどうか   `[int]`

### Initial Feature Engineering
### 特徴量エンジニアリングの初期化
#### See Spaceship Titanic - Exploratory Data Analysis
> Note: All features extraced from `Cabin` will be engineered after missing values are imputed but function is created here.  
> 注: `キャビン`から引き渡されたすべての機能は，欠損値が入力された後に設計されますが，関数はここで作成されます(?)

In [42]:
# お金使ったやつまとめる
expenditure_columns = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]

#### Idの分解
- PassengerIdはGroupIdとGroupSizeに分けられる

In [43]:
def from_passengerId(df:pd.DataFrame) -> pd.DataFrame:
    """PassengerIdからグループ人数の追加

    Args:
        df (pd.DataFrame): DataFrame of csv

    Returns:
        pd.DataFrame: add "GroupId" and "GroupSize"
    """
    
    split_id = df["PassengerId"].str.split("_",expand=True)
    df["GroupId"]   = split_id[0]
    df["GroupSize"] = df.groupby("GroupId")["GroupId"].transform("count")

    # Indicates whether the passenger was traveling alone or not
    # 乗客が一人か一人ではないか

    df["Alone"] = (df["GroupSize"] == 1)

    return df

In [44]:
train_df = from_passengerId(train_df)
test_df  = from_passengerId(test_df)

In [45]:
train_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,PassengerId,GroupId,GroupSize,Alone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001_01,1,1,True
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002_01,2,1,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003_01,3,2,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003_02,3,2,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004_01,4,1,True


### Presence of Missing Values
### 欠損値の存在
- 関数`missing_values_features()`は列のリストを取得し，欠損値が存在するかどうかを示す新しい列を追加する．nullが無視されないときにTotalExpense(すべての支出が列の合計)が欠落しているかどうかを示す`TotalExpense_missing`という機能の追加

In [46]:
def missing_value_features(df:pd.DataFrame,columns:list,expenditure_columns:list) ->pd.DataFrame:
    """お金を使うcolumnの中でnaの値かどうか格納されたmissingカラムの追加

    Args:
        df (pd.DataFrame): DataFrame
        columns (list): お金を使うカラム
        expenditure_columns (list): 上と同じ感じ

    Returns:
        pd.DataFrame: カラムが追加されたdf
    """ 
    for column in columns:
        df[f"{column}_missing"] = df[column].isna()
    
    df["TotalExpense_missing"] = df[expenditure_columns].sum(axis=1,skipna=False).isna()
    return df

In [47]:
columns = ["RoomService", "FoodCourt", "ShoppingMall", "Cabin", "VIP"]
train_df = missing_value_features(train_df, columns, expenditure_columns)
test_df  = missing_value_features(test_df,  columns, expenditure_columns)


In [48]:
train_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,PassengerId,GroupId,GroupSize,Alone,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,...,0001_01,1,1,True,False,False,False,False,False,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,...,0002_01,2,1,True,False,False,False,False,False,False
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,...,0003_01,3,2,False,False,False,False,False,False,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,...,0003_02,3,2,False,False,False,False,False,False,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,...,0004_01,4,1,True,False,False,False,False,False,False


# 特徴量抽出
`from_expendingture_feature`はすべての支出カラムからその合計(特徴量)を抽出する(nullは無視)

In [49]:
def from_expenditure_feantures(df:pd.DataFrame,expenditure_columns:list) -> pd.DataFrame:
    #一人ずつ合計支出カラムの追加
    df["TotalExpense"] = df[expenditure_columns].sum(axis=1)

    return df

In [50]:
train_df = from_expenditure_feantures(train_df,expenditure_columns)
test_df  = from_expenditure_feantures(test_df,expenditure_columns)

In [51]:
train_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,GroupId,GroupSize,Alone,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,...,1,1,True,False,False,False,False,False,False,0.0
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,...,2,1,True,False,False,False,False,False,False,736.0
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,...,3,2,False,False,False,False,False,False,False,10383.0
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,...,3,2,False,False,False,False,False,False,False,5176.0
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,...,4,1,True,False,False,False,False,False,False,1091.0


### cabinの分割
`from_cabin`によって`Deck`,`Num`,`Side`の三つのカラムに分割します

In [52]:
def from_cabin(df:pd.DataFrame) -> pd.DataFrame:
    df[["CabinDeck","CabinNum","CabinSide"]] = df["Cabin"].str.split("/",expand=True)

    return df

### カテゴリかる変数のna埋め
`HomePlanet`と`CryoSleep`,`Destiation`は`feature-level`のモードによって，`Cabin`はグループモードによって埋められる(?)

In [53]:
def simple_mode_replacement(df:pd.DataFrame,columns:list) -> pd.DataFrame:
    df[columns] = df[columns].fillna(df[columns].mode().iloc[0])
    
    return df

In [54]:
columns = ["HomePlanet","CryoSleep","Destination"]
train_df= simple_mode_replacement(train_df,columns)
test_df = simple_mode_replacement(test_df,columns)

In [55]:
train_df[columns].isna().any()

HomePlanet     False
CryoSleep      False
Destination    False
dtype: bool

In [56]:
# Cabinのほう
def group_mode_replacement(df:pd.DataFrame, groupby:str or list,column:str) -> pd.DataFrame:
    # Find all passengers belonging to groups where at least one member has a non-null column value
    # 少なくとも一人，nullではないカラム値を持つメンバーを検索する
    temp = df.groupby(groupby).filter(lambda x: x[column].notna().any())
    func = lambda x: x.fillna(x.mode().iloc[0]) if x.isna().any() else x
    temp[column] = temp.groupby(groupby)[column].transform(func)

    df.loc[temp.index,column] = temp[column]

    return df

In [57]:
train_df = group_mode_replacement(train_df,groupby="GroupId",column="Cabin")
test_df  = group_mode_replacement(test_df,groupby="GroupId",column="Cabin")


今はまだtrainデータに99人の乗客が,testデータに63人が`Cabin`についてnullです．  
これの解決のために`HomePlanet`と`Destination`の最頻値によって埋めます

In [58]:
train_df = group_mode_replacement(train_df,groupby=["HomePlanet","Destination"],column="Cabin")
test_df = group_mode_replacement(test_df,["HomePlanet","Destination"],"Cabin")

### Cabinの分割
Cabinの穴埋めができたので改めてCabinを分割

In [59]:
train_df = from_cabin(train_df)
test_df = from_cabin(test_df)

In [60]:
train_df["VIP"].isna().sum(),test_df["VIP"].isna().sum()

(203, 93)

### VIPに対する考察
データより以下のことが言える
- 支出が0でコールドスリープをしていない乗客はVIPではない
- 12歳以下の乗客はVIPではない
- 地球からの乗客はVIPではない
- 火星からのVIPは18歳以上で，コールドスリープをせず，`5 Cancri e`にはいかない
このことより

In [61]:
def impute_vip_for_no_spend(df):
    #"VIP"がnull & 合計支出が0 & "CryoSleep"をしていない
    df.loc[(df["VIP"].isna()) & (df["TotalExpense"] == 0.0) & (~df["CryoSleep"]),"VIP"] = False
    return df

def impute_vip_for_children(df):
    #"VIP"がnull & "Age"が12以下
    df.loc[(df["VIP"].isna()) & (df["Age"] <= 12), "VIP"] = False
    return df

def impute_vip_for_earthling(df):
    #"VIP"がnull & "HomePlanet"が"Earth"
    df.loc[(df["VIP"].isna()) & (df["HomePlanet"] == "Earth"), "VIP"] = False
    return df

def impute_vip_for_martians(df):
    #"VIP"がnull & "Age"が18 & "CryoSleep"はしていない & "Destination"は"55 cancri e"ではない
    df.loc[(df["VIP"].isna()) & (df["Age"] >= 18) & (~df["CryoSleep"]) & (df["Destination"] != "55 cancri e"),"VIP"] = True
    return df

def impute_vip(df):
    df = impute_vip_for_no_spend(df)
    df = impute_vip_for_children(df)
    df = impute_vip_for_earthling(df)
    df = impute_vip_for_martians(df)
    return df

train_df = impute_vip(train_df)
test_df  = impute_vip(test_df)

In [62]:
train_df = impute_vip(train_df)
test_df  = impute_vip(test_df)

In [63]:
test_df["VIP"].isna().sum()

22

In [64]:
(test_df["VIP"]).sum()

99

In [65]:
#ランダムで決める
def impute_vip_by_prob(df):
    probs = df["VIP"].value_counts() / df["VIP"].notna().sum()
    values = np.random.choice([False, True], size=df["VIP"].isna().sum(), p=probs)
    df.loc[df["VIP"].isna(), "VIP"] = values
    df["VIP"] = df["VIP"].astype(bool)
    return df

#多いほうに合わせる
def impute_vip_by_majority(df):
    vip_sum = df["VIP"].sum()
    vip_count = df["VIP"].count()
    if vip_count > vip_sum:
        df.loc[df["VIP"].isna(),"VIP"] = False
    else:
        df.loc[df["VIP"].isna(),"VIP"] = True
    return df

train_df = impute_vip_by_majority(train_df)
test_df = impute_vip_by_majority(test_df)

In [66]:
test_df["VIP"].isna().sum()

0

### いらない特徴量の削除
`PassengerId`,`Cabin`,`Name`は関係ないので消します.

In [67]:
drop = ["PassengerId","Cabin","Name"]
train_df = train_df.drop(drop,axis=1)
test_df  = test_df.drop(drop,axis=1)

In [68]:
# nullの確認
train_df.isna().any()

HomePlanet              False
CryoSleep               False
Destination             False
Age                      True
VIP                     False
RoomService              True
FoodCourt                True
ShoppingMall             True
Spa                      True
VRDeck                   True
Transported             False
GroupId                 False
GroupSize               False
Alone                   False
RoomService_missing     False
FoodCourt_missing       False
ShoppingMall_missing    False
Cabin_missing           False
VIP_missing             False
TotalExpense_missing    False
TotalExpense            False
CabinDeck               False
CabinNum                False
CabinSide               False
dtype: bool

### 特徴量のエンコーディング
二つのデータセットをつなげる`concat_train_test()`と分ける`split_train_test()`  
でも`CabinNum`と`GroupSize`はexperimentの一つなので無視します．

In [69]:
def concat_train_test(train:pd.DataFrame,test:pd.DataFrame,has_labels=False) -> tuple:
    transported = None
    
    #testデータにこのラベルはないのでもしもあったら削除しなきゃ
    if has_labels is True:
        transported = train["Transported"].copy()
        train = train.drop("Transported",axis=1)
    
    train_index = train.index
    test_index  = test.index

    df = pd.concat([train,test])

    return df,train_index,test_index,transported

def split_train_test(df,train_index,test_index,transported=None):
    train_df = df.loc[train_index,:]
    if transported is not None:
        train_df["Transported"] = transported
    
    test_df = df.loc[test_index,:]

    return train_df,test_df

In [70]:
df, train_idx, test_idx, transported = concat_train_test(train_df, test_df, has_labels=True)
df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense,CabinDeck,CabinNum,CabinSide
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,0.0,B,0,P
0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,...,False,False,False,False,False,False,736.0,F,0,S
0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,...,False,False,False,False,False,False,10383.0,A,0,S
0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,...,False,False,False,False,False,False,5176.0,A,0,S
0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,...,False,False,False,False,False,False,1091.0,F,1,S


### bool2int
ロジスティック回帰ではカテゴリ変数をboolのまま操作できないのでintに変換します

In [71]:
def bool2int(df):
    columns = [column for column in df.columns if df[column].dtype.name == "bool"]
    df[columns] = df[columns].astype(int)
    return df

In [72]:
df = bool2int(df)

In [73]:
df.head()
# VIPとかがbinaryになった！

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,RoomService_missing,FoodCourt_missing,ShoppingMall_missing,Cabin_missing,VIP_missing,TotalExpense_missing,TotalExpense,CabinDeck,CabinNum,CabinSide
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,Europa,0,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,B,0,P
0002_01,Earth,0,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,...,0,0,0,0,0,0,736.0,F,0,S
0003_01,Europa,0,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,...,0,0,0,0,0,0,10383.0,A,0,S
0003_02,Europa,0,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,...,0,0,0,0,0,0,5176.0,A,0,S
0004_01,Earth,0,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,...,0,0,0,0,0,0,1091.0,F,1,S


### CabinSideって…
CabinSideは`S`と`P`からなるのでbinaryに変換できます．

In [74]:
df["CabinSide"] = df["CabinSide"].map({"S":0,"P":1})

### カテゴリ変数をダミー変数にする
owari

In [75]:
to_be_encoded = ["HomePlanet","Destination","GroupSize","CabinDeck"]
df = pd.get_dummies(df,columns=to_be_encoded)

In [76]:
df.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupId,Alone,...,GroupSize_7,GroupSize_8,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0,39.0,False,0.0,0.0,0.0,0.0,0.0,1,1,...,0,0,0,1,0,0,0,0,0,0
0002_01,0,24.0,False,109.0,9.0,25.0,549.0,44.0,2,1,...,0,0,0,0,0,0,0,1,0,0
0003_01,0,58.0,True,43.0,3576.0,0.0,6715.0,49.0,3,0,...,0,0,1,0,0,0,0,0,0,0
0003_02,0,33.0,False,0.0,1283.0,371.0,3329.0,193.0,3,0,...,0,0,1,0,0,0,0,0,0,0
0004_01,0,16.0,False,303.0,70.0,151.0,565.0,2.0,4,1,...,0,0,0,0,0,0,0,1,0,0


### 数値の欠損値を埋める
埋める前にtrainとtestデータを分割する

In [77]:
train_df, test_df = split_train_test(df, train_idx, test_idx, transported=transported)

In [78]:
train_df.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupId,Alone,...,GroupSize_8,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0,39.0,False,0.0,0.0,0.0,0.0,0.0,1,1,...,0,0,1,0,0,0,0,0,0,False
0002_01,0,24.0,False,109.0,9.0,25.0,549.0,44.0,2,1,...,0,0,0,0,0,0,1,0,0,True
0003_01,0,58.0,True,43.0,3576.0,0.0,6715.0,49.0,3,0,...,0,1,0,0,0,0,0,0,0,False
0003_02,0,33.0,False,0.0,1283.0,371.0,3329.0,193.0,3,0,...,0,1,0,0,0,0,0,0,0,False
0004_01,0,16.0,False,303.0,70.0,151.0,565.0,2.0,4,1,...,0,0,0,0,0,0,1,0,0,True


### KNN
数値特徴量の欠損値をKNNを使用してインプットする．
- GroupIdとCabinNumに対してエンコードをしていちばんいいのにしたい

In [79]:
def impute_missing_using_knn(df, numeric_cols, has_labels=False):
    x = df
    
    if has_labels is True:
        transported = df["Transported"]
        x = df.drop("Transported", axis=1)
        
    scaler = preprocessing.StandardScaler()
    x[numeric_cols] = scaler.fit_transform(x[numeric_cols])
    
    imputer = impute.KNNImputer(n_neighbors=5, weights="distance")
    x = imputer.fit_transform(x)
    
    if has_labels is True:
        x = np.hstack((x, transported.values.reshape(-1, 1)))
        
    return pd.DataFrame(x, columns=df.columns, index=df.index)

In [80]:
train_cabin_num = train_df["CabinNum"]
train_group_id  = train_df["GroupId"]

test_cabin_num  = test_df["CabinNum"]
test_group_df   = test_df["GroupId"]

to_drop = ["GroupId", "CabinNum"]
numeric_cols = ["Age", "TotalExpense"] + expenditure_columns

train_df = impute_missing_using_knn(train_df.drop(to_drop, axis=1), numeric_cols, has_labels=True)
test_df = impute_missing_using_knn(test_df.drop(to_drop, axis=1), numeric_cols)

In [81]:
train_df.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,GroupSize_8,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0.0,0.702095,0.0,-0.337025,-0.284274,-0.287317,-0.273736,-0.266098,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0002_01,0.0,-0.333233,0.0,-0.173528,-0.278689,-0.245971,0.209267,-0.227692,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
0003_01,0.0,2.01351,1.0,-0.272527,1.934922,-0.287317,5.634034,-0.223327,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0003_02,0.0,0.287964,0.0,-0.337025,0.511931,0.32625,2.655075,-0.097634,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0004_01,0.0,-0.885407,0.0,0.117466,-0.240833,-0.03759,0.223344,-0.264352,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [82]:
train_df.isna().any()

CryoSleep                    False
Age                          False
VIP                          False
RoomService                  False
FoodCourt                    False
ShoppingMall                 False
Spa                          False
VRDeck                       False
Alone                        False
RoomService_missing          False
FoodCourt_missing            False
ShoppingMall_missing         False
Cabin_missing                False
VIP_missing                  False
TotalExpense_missing         False
TotalExpense                 False
CabinSide                    False
HomePlanet_Earth             False
HomePlanet_Europa            False
HomePlanet_Mars              False
Destination_55 Cancri e      False
Destination_PSO J318.5-22    False
Destination_TRAPPIST-1e      False
GroupSize_1                  False
GroupSize_2                  False
GroupSize_3                  False
GroupSize_4                  False
GroupSize_5                  False
GroupSize_6         

### 分割しよう
5個に分ける

In [83]:
train_df = train_df.reset_index()

# kfoldカラムの追加
train_df["kfold"] = -1
kf = model_selection.KFold(n_splits=5,random_state=42,shuffle=True)

for idx, (_,val_idx) in enumerate(kf.split(train_df)):
    train_df.loc[val_idx,"kfold"] = idx

#PassengerIdをIndexにする
train_df = train_df.set_index("PassengerId")
train_df.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,Transported,kfold
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0.0,0.702095,0.0,-0.337025,-0.284274,-0.287317,-0.273736,-0.266098,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
0002_01,0.0,-0.333233,0.0,-0.173528,-0.278689,-0.245971,0.209267,-0.227692,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3
0003_01,0.0,2.01351,1.0,-0.272527,1.934922,-0.287317,5.634034,-0.223327,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
0003_02,0.0,0.287964,0.0,-0.337025,0.511931,0.32625,2.655075,-0.097634,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
0004_01,0.0,-0.885407,0.0,0.117466,-0.240833,-0.03759,0.223344,-0.264352,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,4


In [84]:
train_df["kfold"].unique()
#5個に分けられた！やった～

array([0, 3, 4, 2, 1], dtype=int64)

In [85]:
train_df.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Alone,RoomService_missing,...,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,Transported,kfold
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0.0,0.702095,0.0,-0.337025,-0.284274,-0.287317,-0.273736,-0.266098,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
0002_01,0.0,-0.333233,0.0,-0.173528,-0.278689,-0.245971,0.209267,-0.227692,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3
0003_01,0.0,2.01351,1.0,-0.272527,1.934922,-0.287317,5.634034,-0.223327,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
0003_02,0.0,0.287964,0.0,-0.337025,0.511931,0.32625,2.655075,-0.097634,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
0004_01,0.0,-0.885407,0.0,0.117466,-0.240833,-0.03759,0.223344,-0.264352,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,4


### データクレンジング終わり！！！！！！！！！
長すぎるわばか

In [86]:
# 確認
for column in train_df.columns:
    isna = train_df[column].isna().any()
    print(F"{column}:{isna}")

CryoSleep:False
Age:False
VIP:False
RoomService:False
FoodCourt:False
ShoppingMall:False
Spa:False
VRDeck:False
Alone:False
RoomService_missing:False
FoodCourt_missing:False
ShoppingMall_missing:False
Cabin_missing:False
VIP_missing:False
TotalExpense_missing:False
TotalExpense:False
CabinSide:False
HomePlanet_Earth:False
HomePlanet_Europa:False
HomePlanet_Mars:False
Destination_55 Cancri e:False
Destination_PSO J318.5-22:False
Destination_TRAPPIST-1e:False
GroupSize_1:False
GroupSize_2:False
GroupSize_3:False
GroupSize_4:False
GroupSize_5:False
GroupSize_6:False
GroupSize_7:False
GroupSize_8:False
CabinDeck_A:False
CabinDeck_B:False
CabinDeck_C:False
CabinDeck_D:False
CabinDeck_E:False
CabinDeck_F:False
CabinDeck_G:False
CabinDeck_T:False
Transported:False
kfold:False


### クレンジングしたデータ保存してくれ
to_csvしよう

In [87]:
train_df.to_csv("train_prepared_vip_majority.csv",index=False)
test_df.to_csv("test_prepared_vip_majority.csv")