In [1]:
#データ処理用の基本パッケージ
import seaborn as sns
import pandas as pd
import numpy as np
#線形代入時に使用
from sklearn.linear_model import LinearRegression
#スケーリングに使用
from sklearn.preprocessing import RobustScaler
#モデルの推定に使用
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

##データの前処理

In [2]:
#使用データ(titanic)のダウンロード
dataset = sns.load_dataset('titanic')

In [3]:
#カラムの確認
dataset.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [4]:
#大雑把な記述統計
dataset.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
#欠損値の確認
dataset.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [6]:
#特徴量として用いるデータの抽出
features = dataset[['class', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked']]
#ターゲットとして用いるデータの抽出
targets = dataset.pop('survived')

In [7]:
#離散型変数の取り出しとダミーか
features_dummie = pd.get_dummies(features[['class','sex','embarked']],dummy_na = True)

In [8]:
#連続型変数の取り出し
features_continuous = features[['age','sibsp','parch','fare']]
#fareの対数変換(+1)
features_continuous['ln_fare_pulus1'] = np.log(features_continuous[['fare']]+1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_continuous['ln_fare_pulus1'] = np.log(features_continuous[['fare']]+1)


In [9]:
#特徴量の連続型変数と離散型変数の結合
features = pd.concat([features_continuous,features_dummie],axis = 1)

In [10]:
#年齢のダミー変数化
#12歳未満をchildrenとする
features['age_children'] = 0
features['age_children'].loc[features['age'] < 12] = 1

#60歳より大きいサンプルをelderlyとする
features['age_elderly'] = 0
features['age_elderly'] .loc[features['age'] > 60] = 1

##※ 12< age <= 60　はベースライン

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['age_children'].loc[features['age'] < 12] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['age_elderly'] .loc[features['age'] > 60] = 1


##欠損値代入=>年齢（age)の欠損値をいかに埋めるか？

In [11]:
#中央値代入
features['age_fill_median'] = features[['age']].fillna(features[['age']].median())

In [12]:
#ダミー変数化
features['age_na'] = 0
features['age_na'] .loc[np.isnan(features['age'])] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['age_na'] .loc[np.isnan(features['age'])] = 1


In [13]:
#線形代入法
#ageがnaではない年齢の取り出し（欠損値処理の学習モデル用）
age_lm_y = features[['age']].loc[~np.isnan(features['age'])]

#ageがnaではない代入モデルに用いる特徴量の取り出し（欠損値処理の学習モデル用）
age_lm_X = features[['sibsp','parch','ln_fare_pulus1','class_First',
                           'class_Second','sex_male','embarked_Q', 'embarked_S']].loc[~np.isnan(features['age'])]

#学習
lm_age = LinearRegression().fit(age_lm_X.values,age_lm_y)


#代入するためのデータ
features_age_lm_pred = features[['sibsp','parch','ln_fare_pulus1','class_First',
                           'class_Second','sex_male','embarked_Q', 'embarked_S']].values
#代入
features['age_fill_lm'] = lm_age.predict(features_age_lm_pred)

#0歳未満のデータは、0歳として数える
features['age_fill_lm'].loc[features['age_fill_lm']< 0] = 0

#元からnaでなかったデータについては、実測値に戻す
features['age_fill_lm'].loc[~np.isnan(features['age'])] = features['age']

#処理なし、ありの比較
features[['age','age_fill_lm']].describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['age_fill_lm'].loc[features['age_fill_lm']< 0] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['age_fill_lm'].loc[~np.isnan(features['age'])] = features['age']


Unnamed: 0,age,age_fill_lm
count,714.0,891.0
mean,29.699118,29.501006
std,14.526497,13.575731
min,0.42,0.0
25%,20.125,21.205594
50%,28.0,28.829223
75%,38.0,36.596058
max,80.0,80.0


In [14]:
#分析用データセットの作成（特徴量とターゲットの結合）
dataset_analyze_proceded = pd.concat([targets,features],axis = 1)
dataset_analyze_proceded.head(10)

Unnamed: 0,survived,age,sibsp,parch,fare,ln_fare_pulus1,class_First,class_Second,class_Third,class_nan,...,sex_nan,embarked_C,embarked_Q,embarked_S,embarked_nan,age_children,age_elderly,age_fill_median,age_na,age_fill_lm
0,0,22.0,1,0,7.25,2.110213,0,0,1,0,...,0,0,0,1,0,0,0,22.0,0,22.0
1,1,38.0,1,0,71.2833,4.280593,1,0,0,0,...,0,1,0,0,0,0,0,38.0,0,38.0
2,1,26.0,0,0,7.925,2.188856,0,0,1,0,...,0,0,0,1,0,0,0,26.0,0,26.0
3,1,35.0,1,0,53.1,3.990834,1,0,0,0,...,0,0,0,1,0,0,0,35.0,0,35.0
4,0,35.0,0,0,8.05,2.202765,0,0,1,0,...,0,0,0,1,0,0,0,35.0,0,35.0
5,0,,0,0,8.4583,2.246893,0,0,1,0,...,0,0,1,0,0,0,0,28.0,1,31.299458
6,0,54.0,0,0,51.8625,3.967694,1,0,0,0,...,0,0,0,1,0,0,0,54.0,0,54.0
7,0,2.0,3,1,21.075,3.094446,0,0,1,0,...,0,0,0,1,0,1,0,2.0,0,2.0
8,1,27.0,0,2,11.1333,2.495954,0,0,1,0,...,0,0,0,1,0,0,0,27.0,0,27.0
9,1,14.0,1,0,30.0708,3.436268,0,1,0,0,...,0,1,0,0,0,0,0,14.0,0,14.0


##代入手法による推定の精度の比較(タイタニック号生存者の予測における)

In [15]:
#訓練データとテストデータに分割:テストサイズ＝40%
X_train,X_test,y_train,y_test = train_test_split(dataset_analyze_proceded[['age', 'age_fill_median','age_fill_lm','sibsp','parch','ln_fare_pulus1','class_First', 'class_Second','class_nan','sex_female','embarked_Q', 'embarked_S','embarked_nan', 'age_children', 'age_elderly', 'age_na']],dataset_analyze_proceded['survived'],random_state = 0,
                                                test_size = 0.4)

In [16]:
#特徴量のスケーリング:ロバストスケーリングを使用
rb_scaler = RobustScaler()
rb_scaler.fit(X_train)
X_train.loc[:,'age':'age_na']= rb_scaler.transform(X_train)
X_test.loc[:,'age':'age_na'] = rb_scaler.transform(X_test)

In [17]:
#リストワイズ除去(na除去)
#特徴量の作成
X_train_narm = X_train[['age','sibsp', 'parch',
       'ln_fare_pulus1', 'class_First', 'class_Second', 'class_nan',
       'sex_female', 'embarked_Q', 'embarked_S', 'embarked_nan',
       ]].dropna()

X_test_narm = X_test[['age','sibsp', 'parch',
       'ln_fare_pulus1', 'class_First', 'class_Second', 'class_nan',
       'sex_female', 'embarked_Q', 'embarked_S', 'embarked_nan',
       ]].dropna()


#特徴量のindexの抽出
X_train_narm_index = X_train[['age','sibsp', 'parch',
       'ln_fare_pulus1', 'class_First', 'class_Second', 'class_nan',
       'sex_female', 'embarked_Q', 'embarked_S', 'embarked_nan',
       ]].dropna().index.values

X_test_narm_index = X_test[['age','sibsp', 'parch',
       'ln_fare_pulus1', 'class_First', 'class_Second', 'class_nan',
       'sex_female', 'embarked_Q', 'embarked_S', 'embarked_nan',
       ]].dropna().index.values

#ターゲットの作成
y_train_narm = y_train[X_train_narm_index]
y_test_narm = y_test[X_test_narm_index]

#学習
clf_narm = LogisticRegression(random_state=0).fit(X_train_narm,y_train_narm)
#結果
clf_narm.score(X_test_narm, y_test_narm)

0.8021201413427562

In [18]:
#ダミー化
X_train_fill_dummy = X_train[['sibsp', 'parch',
       'ln_fare_pulus1', 'class_First', 'class_Second', 'class_nan',
       'sex_female', 'embarked_Q', 'embarked_S', 'embarked_nan',
       'age_children', 'age_elderly', 'age_na']]

X_test_fill_dummy = X_test[['sibsp', 'parch',
       'ln_fare_pulus1', 'class_First', 'class_Second', 'class_nan',
       'sex_female', 'embarked_Q', 'embarked_S', 'embarked_nan',
       'age_children', 'age_elderly', 'age_na']]

#学習
clf_fill_dummy = LogisticRegression(random_state=0).fit(X_train_fill_dummy,y_train)
#結果
clf_fill_dummy.score(X_test_fill_dummy, y_test)

0.7899159663865546

In [19]:
#中央値代入
X_train_fill_median = X_train[['age_fill_median','sibsp', 'parch',
       'ln_fare_pulus1', 'class_First', 'class_Second', 'class_nan',
       'sex_female', 'embarked_Q', 'embarked_S', 'embarked_nan',
       ]]

X_test_fill_median = X_test[['age_fill_median','sibsp', 'parch',
       'ln_fare_pulus1', 'class_First', 'class_Second', 'class_nan',
       'sex_female', 'embarked_Q', 'embarked_S', 'embarked_nan',
       ]]


#学習
clf_fill_median = LogisticRegression(random_state=0).fit(X_train_fill_median,y_train)
#結果
clf_fill_median.score(X_test_fill_median, y_test)

0.7927170868347339

In [20]:
#線形代入
X_train_fill_lm = X_train[['age_fill_lm','sibsp', 'parch',
                           'ln_fare_pulus1', 'class_First', 'class_Second', 'class_nan',
                            'sex_female', 'embarked_Q', 'embarked_S', 'embarked_nan',
                            ]]

X_test_fill_lm = X_test[['age_fill_lm','sibsp', 'parch',
       'ln_fare_pulus1', 'class_First', 'class_Second', 'class_nan',
       'sex_female', 'embarked_Q', 'embarked_S', 'embarked_nan',
       ]]

#学習
clf_fill_lm = LogisticRegression(random_state=0).fit(X_train_fill_lm,y_train)
#結果
clf_fill_lm.score(X_test_fill_lm,y_test)

0.8011204481792717

##このデータセットと特徴量と手法での欠損値処理による精度違いはあまりなさそう。