# 나이 예측해보기

In [112]:
import pandas as pd

data = pd.read_csv("/home/woo/kuBig2025/ml_dl_python/data/titanic/train.csv")
print(data.info())
print(data.head()) 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                      

In [113]:
# 이름에서 'Mr.', 'Mrs.' 같은 호칭 추출
data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# 결과 확인
print(data[['Name', 'Title']].head(10))  # 상위 10개만 확인


                                                Name   Title
0                            Braund, Mr. Owen Harris      Mr
1  Cumings, Mrs. John Bradley (Florence Briggs Th...     Mrs
2                             Heikkinen, Miss. Laina    Miss
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)     Mrs
4                           Allen, Mr. William Henry      Mr
5                                   Moran, Mr. James      Mr
6                            McCarthy, Mr. Timothy J      Mr
7                     Palsson, Master. Gosta Leonard  Master
8  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)     Mrs
9                Nasser, Mrs. Nicholas (Adele Achem)     Mrs


In [114]:
# Title을 간소화
data['Title'] = data['Title'].replace(['Mlle', 'Ms'], 'Miss')
data['Title'] = data['Title'].replace(['Mme'], 'Mrs')

# 등장 빈도가 낮은 희귀 호칭들을 Rare로 묶기
rare_titles = [
    'Dr', 'Rev', 'Col', 'Major', 'Lady', 'Don', 'Dona', 'Jonkheer', 'Capt', 'Countess', 'Sir'
]
data['Title'] = data['Title'].replace(rare_titles, 'Rare')


In [115]:
print(data['Title'].value_counts())


Title
Mr        517
Miss      185
Mrs       126
Master     40
Rare       23
Name: count, dtype: int64


In [116]:
# Title이 'Rare'인 사람들만 추출
rare_data = data[data['Title'] == 'Rare']

# 이름과 원래 호칭을 함께 보기 위해 Name 컬럼 포함해서 출력
print(rare_data[['Name', 'Sex', 'Age', 'Title']])


                                                  Name     Sex   Age Title
30                            Uruchurtu, Don. Manuel E    male  40.0  Rare
149                  Byles, Rev. Thomas Roussel Davids    male  42.0  Rare
150                         Bateman, Rev. Robert James    male  51.0  Rare
245                        Minahan, Dr. William Edward    male  44.0  Rare
249                      Carter, Rev. Ernest Courtenay    male  54.0  Rare
317                               Moraweck, Dr. Ernest    male  54.0  Rare
398                                   Pain, Dr. Alfred    male  23.0  Rare
449                     Peuchen, Major. Arthur Godfrey    male  52.0  Rare
536                  Butt, Major. Archibald Willingham    male  45.0  Rare
556  Duff Gordon, Lady. (Lucille Christiana Sutherl...  female  48.0  Rare
599       Duff Gordon, Sir. Cosmo Edmund ("Mr Morgan")    male  49.0  Rare
626                     Kirkland, Rev. Charles Leonard    male  57.0  Rare
632                      

### 어린아이들의 호칭 확인 (15세 이하)

In [117]:
# 나이가 15세 미만인 데이터만 추출
children = data[data['Age'] < 15]

# 몇 명인지 확인
print("어린이 수:", len(children))


어린이 수: 78


In [118]:
# 어린이들의 Title 분포 보기
print(children['Title'].value_counts())


Title
Miss      38
Master    36
Mr         3
Mrs        1
Name: count, dtype: int64


In [119]:
# 이름, 성별, 나이, 호칭만 출력
print(children[['Name', 'Sex', 'Age', 'Title']])


                                        Name     Sex    Age   Title
7             Palsson, Master. Gosta Leonard    male   2.00  Master
9        Nasser, Mrs. Nicholas (Adele Achem)  female  14.00     Mrs
10           Sandstrom, Miss. Marguerite Rut  female   4.00    Miss
14      Vestrom, Miss. Hulda Amanda Adolfina  female  14.00    Miss
16                      Rice, Master. Eugene    male   2.00  Master
..                                       ...     ...    ...     ...
827                    Mallet, Master. Andre    male   1.00  Master
831          Richards, Master. George Sibley    male   0.83  Master
850  Andersson, Master. Sigvard Harald Elias    male   4.00  Master
852                  Boulos, Miss. Nourelain  female   9.00    Miss
869          Johnson, Master. Harold Theodor    male   4.00  Master

[78 rows x 4 columns]


In [120]:
# Title별 평균 나이
title_age_means = data.groupby('Title')['Age'].mean()
print(title_age_means)


Title
Master     4.574167
Miss      21.845638
Mr        32.368090
Mrs       35.788991
Rare      45.545455
Name: Age, dtype: float64


In [121]:
# 평균값으로 Age 결측값 채우기
data['Age'] = data.apply(
    lambda row: title_age_means[row['Title']] if pd.isnull(row['Age']) else row['Age'],
    axis=1
)


In [122]:
# Age 결측값 확인
print("남은 Age 결측값 수:", data['Age'].isnull().sum())


남은 Age 결측값 수: 0


In [123]:
print("컬럼 목록:", data.columns.tolist())


컬럼 목록: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title']


In [124]:
print(data.groupby('Title')['Age'].mean()) # 평균 나이 확인

Title
Master     4.574167
Miss      21.845638
Mr        32.368090
Mrs       35.788991
Rare      45.545455
Name: Age, dtype: float64


# 주요 변수별 생존률 확인

In [125]:
# Pclass별 생존률
print(data.groupby('Pclass')['Survived'].mean())

# Sex별 생존률
print(data.groupby('Sex')['Survived'].mean())

# Title별 생존률
print(data.groupby('Title')['Survived'].mean())

# Age 구간별 생존률 (예: 어린이 vs 성인)
data['IsChild'] = data['Age'] < 15
print(data.groupby('IsChild')['Survived'].mean())


Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64
Title
Master    0.575000
Miss      0.702703
Mr        0.156673
Mrs       0.793651
Rare      0.347826
Name: Survived, dtype: float64
IsChild
False    0.364648
True     0.573171
Name: Survived, dtype: float64


# train 데이터 준비

In [126]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 1. 결측값 처리
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

# Title별 평균 나이 미리 저장 (★중요)
title_age_means = data.groupby('Title')['Age'].mean().copy()

# 2. 인코딩
data = pd.get_dummies(data, columns=['Embarked', 'Title'], drop_first=True)

# 다시 train.csv 로드해서 Title별 평균 구하기
train_raw = pd.read_csv("/home/woo/kuBig2025/ml_dl_python/data/titanic/train.csv")
train_raw['Title'] = train_raw['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
train_raw['Title'] = train_raw['Title'].replace(['Mlle', 'Ms'], 'Miss')
train_raw['Title'] = train_raw['Title'].replace(['Mme'], 'Mrs')
rare_titles = ['Dr', 'Rev', 'Col', 'Major', 'Lady', 'Don', 'Dona', 'Jonkheer', 'Capt', 'Countess', 'Sir']
train_raw['Title'] = train_raw['Title'].replace(rare_titles, 'Rare')
title_age_means = train_raw.groupby('Title')['Age'].mean()


# 3. 피처 및 타겟 분리
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'] + \
           [col for col in data.columns if col.startswith('Embarked_') or col.startswith('Title_')]

X = data[features]
y = data['Survived']

# 4. 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [127]:
# 모델 생성
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# 모델 학습
model.fit(X_train, y_train)

# 예측
y_pred = model.predict(X_test)

# 정확도 출력
from sklearn.metrics import accuracy_score
print("XGBoost 정확도:", accuracy_score(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost 정확도: 0.8324022346368715


### test 데이터 전처리

In [131]:
# 1. 테스트 데이터 불러오기
test = pd.read_csv("/home/woo/kuBig2025/ml_dl_python/data/titanic/test.csv")

# 💡 PassengerId 따로 저장해두기
passenger_ids = test['PassengerId'].copy()

# 2. Title 컬럼 추출
test['Title'] = test['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# 3. Title 정리
test['Title'] = test['Title'].replace(['Mlle', 'Ms'], 'Miss')
test['Title'] = test['Title'].replace(['Mme'], 'Mrs')
rare_titles = ['Dr', 'Rev', 'Col', 'Major', 'Lady', 'Don', 'Dona', 'Jonkheer', 'Capt', 'Countess', 'Sir']
test['Title'] = test['Title'].replace(rare_titles, 'Rare')

# 4. Age 결측값을 Title별 평균으로 채우기
# (train 데이터에서 구한 평균과 동일한 기준을 사용해야 하므로 train 기준 사용!)

test['Age'] = test.apply(
    lambda row: title_age_means[row['Title']] if pd.isnull(row['Age']) else row['Age'],
    axis=1
)

# 5. Sex를 숫자로 변환
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})

# 6. Fare 결측값 처리
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

# 7. Embarked 결측값은 없지만 혹시 모르니 채움 (보통은 없음)
test['Embarked'] = test['Embarked'].fillna('S')

# 8. 범주형 변수 one-hot 인코딩
test = pd.get_dummies(test, columns=['Embarked', 'Title'], drop_first=True)

# 9. train의 feature 목록과 맞추기
# 누락된 컬럼이 있다면 추가하고 0으로 채움
for col in X.columns:
    if col not in test.columns:
        test[col] = 0

# 그리고 순서도 동일하게 맞추기
test = test[X.columns]

# 10. 예측
pred_test = model.predict(test)

# 11. 제출 파일 생성
submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': pred_test.astype(int)
})


# 12. 저장
submission.to_csv("/home/woo/kuBig2025/ml_dl_python/data/titanic/xgb_submission.csv", index=False)
print("🎉 캐글 제출 파일 생성 완료: xgb_submission.csv")


🎉 캐글 제출 파일 생성 완료: xgb_submission.csv
