In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample


In [5]:
# Load the Kaggle train and test CSVs (paths are relative to the notebook's working directory).
train_csv_path = '../data/kaggle/train.csv'
test_csv_path = '../data/kaggle/test.csv'
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [6]:
# Embarked distribution of the test set; the training set is resampled to match it.
test_category_ratios = test['Embarked'].value_counts(normalize=True)
print("テストデータのカテゴリ比率:")
print(test_category_ratios)

# [3] Match the training data's category proportions to the test data.
# NOTE: value_counts() skips missing values and the equality filter below never
# matches NaN, so training rows with a missing Embarked are not carried over.
balanced_parts = []

for category, ratio in test_category_ratios.items():
    # Target row count for this category: its test-set share of the original
    # training size, truncated towards zero by int().
    target_rows = int(ratio * len(train))

    # Training rows that belong to this category.
    subset = train[train['Embarked'] == category]

    if len(subset) != target_rows:
        # Too many rows -> draw without replacement (down-sample);
        # too few rows  -> draw with replacement (up-sample by duplicating rows).
        subset = resample(
            subset,
            replace=len(subset) < target_rows,
            n_samples=target_rows,
            random_state=42,
        )

    balanced_parts.append(subset)

# Stack the per-category pieces into a single DataFrame (original index labels are kept).
train_balanced = pd.concat(balanced_parts)


# [4] Check the category proportions of the adjusted training data.
train_category_ratios = train_balanced['Embarked'].value_counts(normalize=True)
print("調整後の学習データのカテゴリ比率:")
print(train_category_ratios)


テストデータのカテゴリ比率:
Embarked
S    0.645933
C    0.244019
Q    0.110048
Name: proportion, dtype: float64
調整後の学習データのカテゴリ比率:
Embarked
S    0.646067
C    0.243820
Q    0.110112
Name: proportion, dtype: float64


In [11]:
# Preprocess a passenger frame: flag missing Age, encode Sex/Embarked, impute Age/Fare.
def process(data):
    """Return a model-ready copy of ``data``; the input frame is not modified.

    Steps, in order:
      1. ``Miss``: 1 where ``Age`` is missing (taken before imputation), else 0.
      2. ``Sex``: 'female' -> 0, 'male' -> 1; any other value becomes NaN.
      3. ``Embarked``: missing values are filled with this frame's most frequent
         value, then one-hot encoded into integer ``Embarked_<port>`` columns.
      4. ``Name``, ``Ticket`` and ``Cabin`` are dropped.
      5. Missing ``Age`` / ``Fare`` are filled with this frame's own median.

    The one-hot step always emits ``Embarked_C``, ``Embarked_Q`` and
    ``Embarked_S`` (all zeros when a port does not occur in ``data``), so frames
    that are processed separately end up with the same dummy columns. Any other
    value present in ``Embarked`` still gets its own column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Age``, ``Sex``, ``Embarked``, ``Fare``, ``Name``,
        ``Ticket`` and ``Cabin``; every other column passes through unchanged.

    Returns
    -------
    pandas.DataFrame
        The processed copy.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    known_ports = ['C', 'Q', 'S']
    data_processed = data.copy()

    # Record Age missingness before Age itself is imputed further down.
    data_processed['Miss'] = data_processed['Age'].isnull().astype(int)
    data_processed['Sex'] = data_processed['Sex'].map({'female': 0, 'male': 1})

    # Fill missing ports with the most frequent port of this frame.
    embarked = data_processed['Embarked'].fillna(data_processed['Embarked'].mode()[0])
    # A fixed category list makes get_dummies emit a column for every known port
    # even when it is absent here; sorting keeps the same column order that
    # get_dummies produces for plain string columns.
    ports = sorted(set(known_ports) | set(embarked.unique()))
    data_processed['Embarked'] = pd.Categorical(embarked, categories=ports)
    data_processed = pd.get_dummies(data_processed, columns=['Embarked'], prefix='Embarked', dtype=int)

    data_processed = data_processed.drop(columns=['Name', 'Ticket', 'Cabin'])

    # NOTE: medians are computed from the frame being processed, so separately
    # processed frames are each imputed with their own statistics.
    data_processed['Age'] = data_processed['Age'].fillna(data_processed['Age'].median())
    data_processed['Fare'] = data_processed['Fare'].fillna(data_processed['Fare'].median())
    return data_processed

# Training features: preprocess the rebalanced frame, then drop the PassengerId column.
# NOTE: train_balanced is rebound to its processed form here, so re-running this
# cell on its own would feed already-processed data back into process().
train_balanced = process(train_balanced).drop(columns=['PassengerId'])
# Test features: same preprocessing, PassengerId is kept.
test_processed = process(test)

In [12]:
# Preview the first five rows of the processed training frame.
train_balanced.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Miss,Embarked_C,Embarked_Q,Embarked_S
872,0,1,1,33.0,0,0,5.0,0,0,0,1
816,0,3,0,23.0,0,0,7.925,0,0,0,1
824,0,3,1,2.0,4,1,39.6875,0,0,0,1
413,0,2,1,28.0,0,0,0.0,1,0,0,1
103,0,3,1,33.0,0,0,8.6542,0,0,0,1


In [14]:
# Write both processed frames to CSV without the index column.
# NOTE(review): 'resumple' looks like a misspelling of 'resample'; it is left
# as-is because it is an on-disk path that other code may read from.
for frame, csv_path in (
    (train_balanced, '../data/resumple/train.csv'),
    (test_processed, '../data/resumple/test.csv'),
):
    frame.to_csv(csv_path, index=False)

In [16]:
# Number of rows in the processed training frame.
train_balanced.shape[0]

890