In [7]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier         # 의사결정 분류모델
from sklearn.model_selection import train_test_split    # 학습데이터와 테스트데이터 분리 함수
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import preprocessing

In [8]:
# train, test 데이터 읽어오기
titanic_test_df = pd.read_csv("./titanic_test.csv")
titanic_train_df = pd.read_csv("./titanic_train.csv")

In [9]:
# 데이터 컬럼 수 확인
print("train 데이터 컬럼 수 :", titanic_train_df.shape ,"\n"
      , "test 데이터 컬럼 수 :", titanic_test_df.shape)
titanic_train_df.head()

train 데이터 컬럼 수 : (891, 12) 
 test 데이터 컬럼 수 : (418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# 전처리 전 train, test 데이터 NaN값 확인
titanic_train_df.isna().sum(),titanic_test_df.isna().sum()

(PassengerId      0
 Survived         0
 Pclass           0
 Name             0
 Sex              0
 Age            177
 SibSp            0
 Parch            0
 Ticket           0
 Fare             0
 Cabin          687
 Embarked         2
 dtype: int64,
 PassengerId      0
 Pclass           0
 Name             0
 Sex              0
 Age             86
 SibSp            0
 Parch            0
 Ticket           0
 Fare             1
 Cabin          327
 Embarked         0
 dtype: int64)

In [11]:
# 문자열 => 숫자형으로 인코딩 함수
def encode_feature(df=None):
    features = ["Sex","Cabin","Embarked"]
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

In [12]:
# 인자로 입력받은 DataFrame을 복사, Time 컬럼을 삭제하고 복사된 DataFrame을 반환하는 함수
def get_preprocessed_df(df=None):
    df_copy = df.copy() # 입력된 데이터프레임을 복사한것, 원본을 보호
    df_copy.drop(["Name", "Ticket"], axis=1, inplace = True) # Time 컬럼 삭제
    return df_copy

In [13]:
# 원본 데이터를 학습데이터와 테스트데이터를 분리하는 함수 선언
from sklearn.model_selection import train_test_split

def get_train_test_dataset(df=None):
    # 인코딩 함수 호출
    df_copy = encode_feature(df)
    # 입력된 df을 복사하고, Time을 삭제
    df_copy = get_preprocessed_df(df)
    # 데이터와 레이블 분리
    X_features = df_copy.drop("Survived", axis=1)  # 답 피처 제거
    y_labels = df_copy["Survived"] # 답만 추출
    X_train, X_test, y_train, y_test = train_test_split(
          X_features
        , y_labels
        , test_size= 0.2
        , random_state= 0
        , stratify= y_labels  
    )
    return X_train, X_test, y_train, y_test

In [14]:
titanic_test_df = encode_feature(titanic_test_df)
titanic_test_df = get_preprocessed_df(titanic_test_df)

In [15]:
X_train, X_test, y_train, y_test = get_train_test_dataset(titanic_train_df)

In [19]:
titanic_test_df.isna().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Fare            1
Cabin           0
Embarked        0
dtype: int64

In [22]:
print(titanic_test_df['Embarked'].mode()[0])

2


In [16]:
# lightGBM 모델 생성, 모델 학습, 예측
from lightgbm import LGBMClassifier
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average=False)
lgbm_clf.fit(X_train,y_train)
pred1 = lgbm_clf.predict(titanic_test_df)

In [17]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators= 100 , min_samples_leaf= 6 , max_depth= 16 , min_samples_split= 2 , random_state= 0)
rf_clf.fit(X_train, y_train)
pred2 = rf_clf.predict(titanic_test_df) 

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# 파일 생성
output = pd.DataFrame({'PassengerId': titanic_test_df.PassengerId, 'Survived': pred1})
output.to_csv('submission4.csv', index=False)

In [None]:
# 오차행렬
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    f1 = f1_score(y_test,pred)
    # ROC-AUC 추가
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    # ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))


In [None]:
# 모델 학습,예측,예측확률 함수
def get_model_train_test(model, ftr_train=None, ftr_test=None, tgt_train=None, tgt_test=None):
    model.fit(ftr_train,tgt_train)
    pred = model.predict(ftr_test)
    pred_proba = model.predict_proba(ftr_test)[:,1]
    get_clf_eval(tgt_test, pred, pred_proba)

In [None]:
# lightGBM 알고리즘 
from lightgbm import LGBMClassifier

print('### LightGBM 예측 성능 ###')
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average=False)
get_model_train_test(lgbm_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

print("### RandomForest 예측 성능 ###")
rf_clf = RandomForestClassifier(n_estimators= 100 , min_samples_leaf= 6 , max_depth= 16 , min_samples_split= 2 , random_state= 0)
get_model_train_test(rf_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)

In [None]:
# NaN 데이터 전처리
# titanic_train_df["Cabin"] = titanic_train_df["Pclass"].apply(lambda x : "C23" if x ==1 else ("F33" if x == 2 else "G6"))
# titanic_test_df["Cabin"] = titanic_test_df["Pclass"].apply(lambda x : "C23" if x ==1 else ("F33" if x == 2 else "G6"))
# titanic_train_df['Age'] = titanic_train_df['Age'].fillna(titanic_train_df['Age'].mean())
# titanic_test_df['Age'] = titanic_test_df['Age'].fillna(titanic_test_df['Age'].mean())
# titanic_train_df['Embarked'] = titanic_train_df['Embarked'].fillna('S')
# titanic_test_df['Embarked'] = titanic_test_df['Embarked'].fillna('S')
# titanic_test_df["Fare"] = titanic_test_df["Fare"].fillna(0)
# titanic_train_df.isna().sum()

In [None]:
# Name에서 Mr, Miss, Mrs, Master만 추출
name_separate = titanic_train_df["Name"].str.split(".").str[0]
name_division = name_separate.str.split(", ").str[1]
name_division.head()

In [None]:
titanic_train_df["name_division"] = name_division
titanic_train_df
# age_mean = titanic_train_df.groupby(["name_division"])
# # age_mean = titanic_train_df.groupby(["name_division"])["Age"].transform("mean")
# age_mean.head()
# titanic_train_df.groupby('name_division')['Age'].apply(lambda x: x.fillna(x.mean()))

In [None]:
print(titanic_train_df.groupby('index').agg(['sum','mean']))