In [20]:
from sklearn import linear_model, preprocessing, impute, model_selection, metrics
from scipy.stats import boxcox
import pandas as pd
import numpy as np
import random
import os
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib

In [22]:
# Apply seaborn's default theme, then switch the axes style to "ticks".
sns.set_theme()
sns.set_style("ticks")
# NOTE(review): called without arguments, despine() targets the current figure;
# no plot has been drawn at this point, so it probably has no lasting effect — confirm.
sns.despine()

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [24]:
def seed_everything(seed=42):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value handed to every generator. Defaults to 42.

    Note:
        ``PYTHONHASHSEED`` is read when an interpreter starts, so setting it
        here affects child processes only, not the hash seed of the running
        kernel.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


# Seed once with the default value as soon as the helper exists.
seed_everything()

In [25]:
# CONFIG
# Directory (relative path) that holds the competition CSV files.
DATA_DIR = "spaceship-titanic"


def filepath(filename):
    """Return the path of ``filename`` inside ``DATA_DIR``.

    Args:
        filename: File name (or relative path) to locate under the data directory.

    Returns:
        str: ``DATA_DIR`` and ``filename`` joined with ``os.path.join``.
    """
    location = os.path.join(DATA_DIR, filename)
    return location

In [27]:
# Load both splits with PassengerId as the index. PassengerId is also needed
# as an ordinary column for feature engineering, so mirror the index into a
# column of the same name on each frame.
train_df = (
    pd.read_csv(filepath("train.csv"), index_col="PassengerId")
    .assign(PassengerId=lambda frame: frame.index)
)
test_df = (
    pd.read_csv(filepath("test.csv"), index_col="PassengerId")
    .assign(PassengerId=lambda frame: frame.index)
)

In [28]:
# Number of rows in the training and test frames.
len(train_df),len(test_df)

(8693, 4277)

In [29]:
# Preview the first rows of the training data.
train_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,PassengerId
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001_01
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002_01
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003_01
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003_02
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004_01


In [33]:
# Distinct Destination values; unique() keeps NaN, so missing entries show up too.
train_df["Destination"].unique()

array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', nan], dtype=object)

### 個人的データ解釈
- PassengerId   :そのまま`[str]`
- HomePlanet    :おうちのある星(Europa,Earth,Mars,nan)`[str]`
- CryoSleep     :コールドスリープしてるかどうか`[bool]`
- Cabin         :"[A-Z]/[0-9]/[SorP]"`[str]`
- Destination   :目的地("TRAPPIST-1e","PSO J318.5-22","55 Cancri e",nan)`[str]`
- Age           :年齢`[int]`
- VIP           :VIPかどうか`[bool]`
- RoomService   :ルームサービスでつかった金額       `[int]`
- FoodCourt     :フードコートでつかった金額         `[int]`
- ShoppingMall  :ショッピングモールでつかった金額   `[int]`
- Spa           :スパで使った金額                  `[int]`
- VRDeck        :VRデッキでつかった金額  `[int]`
- Name          :名前 `[str]`
- Transported   :到着したかどうか   `[bool]`

### Initial Feature Engineering
### 初期の特徴量エンジニアリング
#### See Spaceship Titanic - Exploratory Data Analysis
> Note: All features extracted from `Cabin` will be engineered after missing values are imputed, but the function is created here.  
> 注: `Cabin`から抽出する特徴量はすべて欠損値を補完した後に作成するが，関数はここで定義しておく．

In [34]:
# Columns holding the amount a passenger spent on each onboard amenity.
# The names must match the CSV headers exactly (the header is "FoodCourt",
# with a capital F); a mismatch raises KeyError when the columns are selected.
expenditure_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

#### Idの分解
- PassengerIdを分解するとGroupIdが得られ，同じGroupIdを持つ人数からGroupSizeを求められる

In [35]:
def from_passengerId(df:pd.DataFrame) -> pd.DataFrame:
    """Add group-related features derived from ``PassengerId``.

    The frame is modified in place and the same object is returned.

    Args:
        df (pd.DataFrame): Frame with a string ``PassengerId`` column.

    Returns:
        pd.DataFrame: ``df`` with the ``GroupId``, ``GroupSize`` and ``Alone``
        columns added.
    """
    # The text before the first "_" of the id is used as the group identifier.
    df["GroupId"] = df["PassengerId"].str.split("_", expand=True)[0]

    # Number of rows in this frame that share each passenger's GroupId.
    members_per_group = df.groupby("GroupId")["GroupId"].transform("count")
    df["GroupSize"] = members_per_group

    # True for passengers whose group has exactly one member.
    df["Alone"] = df["GroupSize"].eq(1)

    return df

In [36]:
# Add the GroupId / GroupSize / Alone columns to each split.
# Group sizes are counted within each frame separately.
train_df = from_passengerId(train_df)
test_df  = from_passengerId(test_df)

In [37]:
# Preview the training data again to inspect the newly added group columns.
train_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,PassengerId,GroupId,GroupSize,Alone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001_01,1,1,True
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002_01,2,1,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003_01,3,2,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003_02,3,2,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004_01,4,1,True


### Presence of Missing Values
### 欠損値の存在
- 関数`missing_value_features()`は列のリストを受け取り，各列に欠損値があるかどうかを示す新しい列を追加する．さらに，nullを無視せずに計算したTotalExpense(すべての支出列の合計)が欠損しているかどうかを示す特徴量`TotalExpense_missing`も追加する．

In [None]:
def missing_value_features(df:pd.DataFrame,columns:list,expenditure_columns:list) ->pd.DataFrame:
    """Add boolean missing-value indicator columns to ``df``.

    For every name in ``columns`` a ``"<name>_missing"`` column is added that
    is True where the source value is NaN. A ``"TotalExpense_missing"`` column
    is added as well; it is True when at least one of the expenditure columns
    is NaN for that row. The frame is modified in place and returned.

    Args:
        df (pd.DataFrame): Frame containing ``columns`` and ``expenditure_columns``.
        columns (list): Column names that each get their own indicator.
        expenditure_columns (list): Numeric spending columns summed per row.

    Returns:
        pd.DataFrame: ``df`` with the indicator columns added.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    for column in columns:
        # Name the flag after the individual column, not after the whole list.
        df[f"{column}_missing"] = df[column].isna()

    # skipna=False makes the row total NaN as soon as any single expense is
    # NaN, so the flag marks rows whose total cannot be computed exactly.
    df["TotalExpense_missing"] = df[expenditure_columns].sum(axis=1,skipna=False).isna()
    return df