In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
import warnings 
warnings.filterwarnings('ignore')
# pip install lightgbm
# pip install xgboost

In [2]:
train = pd.read_csv('pre_train.csv')
test = pd.read_csv('pre_test.csv')

## 분류모델

train, test(val) split

In [3]:
X = train.iloc[:,1:].values
y = train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=2022)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((668, 8), (223, 8), (668,), (223,))

- 예측을 위한 모델
- 결정 트리 / 랜덤 포레스트 / KNN / xgboost / lgbm 사용

In [4]:
# 기본 모델 

# dtree = DecisionTreeClassifier(random_state=2022)
# rfc = RandomForestClassifier(random_state=2022)
# knn = KNeighborsClassifier()
# xgb = XGBClassifier(random_state=2022, eval_metric='mlogloss')
# lgbm = LGBMClassifier(random_state=2022)

-----

In [5]:
# 최적 튜닝 모델 

dtree = DecisionTreeClassifier(random_state=2022, max_depth = 4, max_features = 5, max_leaf_nodes = 8, min_samples_leaf = 2, min_samples_split = 2)
rfc = RandomForestClassifier(random_state=2022, max_depth = 10, min_samples_leaf = 1, min_samples_split = 5, n_estimators = 130)
knn = KNeighborsClassifier(metric = 'manhattan', n_neighbors = 87, weights = 'distance')
xgb = XGBClassifier(random_state=2022, eval_metric='mlogloss')
lgbm = LGBMClassifier(random_state=2022, max_depth = 4, min_child_samples = 40, n_estimators = 100, num_leaves = 20, subsample = 0.5)


In [6]:
lgbm.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 4,
 'min_child_samples': 40,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 20,
 'objective': None,
 'random_state': 2022,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': 'warn',
 'subsample': 0.5,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [7]:
params = {
    'n_estimators': [100],                 
    'max_depth': [3],          
    'num_leaves': [2],      
    'min_child_samples': [20],     
    'subsample' :  [0.5,1]                 
}

In [8]:
grid_lgbm = GridSearchCV(lgbm, param_grid=params, scoring='accuracy', cv=10)
grid_lgbm.fit(X_train, y_train)

print(grid_lgbm.best_params_)
print(grid_lgbm.best_score_)

{'max_depth': 3, 'min_child_samples': 20, 'n_estimators': 100, 'num_leaves': 2, 'subsample': 0.5}
0.8157620985979195


In [9]:
# 파람스 검증
lgbm = LGBMClassifier(random_state=2022, max_depth = 3, min_child_samples = 20, n_estimators = 100, num_leaves = 10, subsample = 0.5)
lgbm.fit(X_train, y_train)
round(lgbm.score(X_train, y_train) * 100, 2)

86.98

LGBM 

-  순서:        그리드   / 예측성능 / 캐글
-  기본:        0.812754 / 89.37 / **77990**
-  튜닝1 :      0.820262 / 87.13 / 77990
-  튜닝2 :      0.829194 / 86.23 / **78708**
-  튜닝3 :      0.829194 / 86.98 / 76555
>
>

- lgbm1
- 'n_estimators': [100],                 
- 'max_depth': [10, 30, 50, 100],          
- 'num_leaves': [2, 10, 20, 31],      
- 'min_child_samples': [20, 40, 60],     
- 'subsample' :  [0.5, 1]   
> 'max_depth': 10, 'min_child_samples': 40, 'n_estimators': 100, 'num_leaves': 10, 'subsample': 0.5 >> 77990

- **lgbm 2**
- 'n_estimators': [100],                 
- 'max_depth': [2, 4, 6, 8, 10],          
- 'num_leaves': [2, 10, 20, 31],      
- 'min_child_samples': [20, 40, 60],     
- 'subsample' :  [0.5, 1]   
> **'max_depth': 4, 'min_child_samples': 40, 'n_estimators': 100, 'num_leaves': 20, 'subsample': 0.5 >> 78708**

- lgbm3
- 'n_estimators': [100],                 
- 'max_depth': [3, 5, 7, 9],          
- 'num_leaves': [2, 10, 20, 31],      
- 'min_child_samples': [20, 40, 60],     
- 'subsample' :  [0.5, 1]  
> 'max_depth': 3, 'min_child_samples': 20, 'n_estimators': 100, 'num_leaves': 10, 'subsample': 0.5 >> 76555

KNN
-  순서:        그리드   / 예측성능 / 캐글
-  기본:        0.779828 / 85.63 / **75837**
-  튜닝1 :      0.805269 / 82.63 / 72727
-  튜닝2 :      0.799389 / 91.47 / **77990**
> 

- knn1
- 'n_neighbors' : list(range(1,100)),                 
- 'weights' : ["uniform", 'distance'],                
- 'metric' : ['euclidean', 'manhattan','minkowski']
>  'metric': 'manhattan', 'n_neighbors': 16, 'weights': 'uniform' >> 72727

- knn2
- 'n_neighbors' : list(range(50,100)),                 
- 'weights' : ["uniform", 'distance'],                
- 'metric' : ['euclidean', 'manhattan','minkowski']
>  **'metric': 'manhattan', 'n_neighbors': 87, 'weights': 'distance' >> 77990**


dtree
- 'max_depth' : [2,3,4,5,6,7,8,9,10], 
- 'min_samples_leaf' : [2,3,4,5,6,7,8,9,10], 
- 'min_samples_split' :[2,3,4,5,6,7,8,9,10],  
- 'max_features' :[2,3,4,5,6,7,8,9,10],
- 'max_leaf_nodes':[2,3,4,5,6,7,8,9,10]
> max_depth = 4, max_features = 5, max_leaf_nodes = 8, min_samples_leaf = 2, min_samples_split = 2 >> 0.839733

---

## *최적 파라미터 적용한 모델 사용

In [10]:
# 최적 튜닝 모델 

dtree = DecisionTreeClassifier(random_state=2022, max_depth = 4, max_features = 5, max_leaf_nodes = 8, min_samples_leaf = 2, min_samples_split = 2)
rfc = RandomForestClassifier(random_state=2022, max_depth = 10, min_samples_leaf = 1, min_samples_split = 5, n_estimators = 130)
knn = KNeighborsClassifier(metric = 'manhattan', n_neighbors = 87, weights = 'distance')
xgb = XGBClassifier(random_state=2022, eval_metric='mlogloss')
lgbm = LGBMClassifier(random_state=2022, max_depth = 4, min_child_samples = 40, n_estimators = 100, num_leaves = 20, subsample = 0.5)

기본모델 캐글 score

In [11]:
# dtree : 76794
# rfc   : 78229
# knn   : 75837 
# xgb   : 78468 
# lgbm  : 77990 

최적 튜닝 모델의 캐글 score 

In [12]:
# dtree : 76794 > 78468
# rfc   : 78229 > 78468 
# knn   : 75837 > 77990
# xgb   : 78468 
# lgbm  : 77990 > 78708

In [13]:
# vs 에서의 모델별 성능 

model = [dtree, rfc, knn, xgb, lgbm]
ac = []

for i in model:
    i.fit(X_train, y_train)
    acc = round(i.score(X_train, y_train) * 100, 2)
    ac.append(acc)

print(f'dtree의 성능    : {ac[0]}') # 91.47
print(f'rfc의 성능      : {ac[1]}') # 91.47
print(f'knn의 성능      : {ac[2]}') # 85.63
print(f'xgb의 성능      : {ac[3]}') # 91.02
print(f'lgbm의 성능     : {ac[4]}') # 89.37

dtree의 성능    : 83.98
rfc의 성능      : 89.97
knn의 성능      : 91.47
xgb의 성능      : 91.02
lgbm의 성능     : 86.23


In [14]:
# train 전체학습 후 vs 모델 성능 


X = train.iloc[:,1:].values
y = train['Survived']
X_test = test.drop(['PassengerId'], axis=1)
X.shape, y.shape, X_test.shape

model = [dtree, rfc, knn, xgb, lgbm]
ac = []
pred = []

for i in model:
    i.fit(X, y)
    y_pred = i.predict(X_test)
    pred.append(y_pred)

    acc = round(i.score(X, y) * 100, 2)
    ac.append(acc)
    
print(f'dtree의 성능    : {ac[0]}') # 91.47 > 90.68
print(f'rfc의 성능      : {ac[1]}') # 91.47 > 90.68 > 89.23 > 90.68
print(f'knn의 성능      : {ac[2]}') # 91.47 > 90.68
print(f'xgb의 성능      : {ac[3]}') # 91.02 > 90.35 > 90.68
print(f'lgbm의 성능     : {ac[4]}') # 89.37 > 90.68

dtree의 성능    : 83.28
rfc의 성능      : 89.23
knn의 성능      : 90.68
xgb의 성능      : 90.35
lgbm의 성능     : 86.2


In [15]:
# 예측, test에 concat 

dtree = pred[0]
rfc = pred[1]
knn = pred[2]
xgb = pred[3]
lgbm = pred[4]

dtree = pd.DataFrame({'dtree': dtree})
rfc = pd.DataFrame({'rfc': rfc})
knn = pd.DataFrame({'knn': knn})
xgb = pd.DataFrame({'xgb': xgb})
lgbm = pd.DataFrame({'lgbm': lgbm})

dtree = pred[0]
rfc = pred[1]
knn = pred[2]
xgb = pred[3]
lgbm = pred[4]

dtree = pd.DataFrame({'dtree': dtree})
rfc = pd.DataFrame({'rfc': rfc})
knn = pd.DataFrame({'knn': knn})
xgb = pd.DataFrame({'xgb': xgb})
lgbm = pd.DataFrame({'lgbm': lgbm})

model = [dtree, rfc, knn, xgb, lgbm]

for i in model:
    test = pd.concat([test, i], axis=1)

In [16]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Fare,Embarked,Age_gr,F_nm,F_ox,Married,dtree,rfc,knn,xgb,lgbm
0,892,3,1,1,1,1,0,0,0,0,0,0,0,0
1,893,3,0,0,2,2,1,1,0,0,0,0,0,0
2,894,2,1,1,1,4,0,0,0,0,0,0,0,0
3,895,3,1,1,2,0,0,0,0,0,0,0,0,0
4,896,3,0,2,2,0,2,1,0,0,1,1,1,0


In [17]:
test['Survived'] = (test['dtree'] + test['rfc'] + test['knn'] + test['xgb'] + test['lgbm'])/5
test = test.drop(['Pclass', 'Sex', 'Fare', 'Embarked', 'Age_gr', 'F_nm', 'F_ox', 'Married', 'dtree', 'rfc', 'knn', 'xgb', 'lgbm'], axis=1)
test.head()

Unnamed: 0,PassengerId,Survived
0,892,0.0
1,893,0.0
2,894,0.0
3,895,0.0
4,896,0.6


In [18]:
test['Survived'].value_counts()

0.0    256
1.0     97
0.6     25
0.8     21
0.2     13
0.4      6
Name: Survived, dtype: int64

In [19]:
def ensemble_survived(x):
    if x < 0.8:
        return 0
    if x >= 0.8:
        return 1 

test['Survived'] = test['Survived'].apply(lambda x: ensemble_survived(x))
test['Survived'].value_counts()

0    300
1    118
Name: Survived, dtype: int64

In [20]:
test.to_csv('27_81.csv', index=False)

27_20 **(79186)**
- x <= 0.2 : 0
- x > 0.2 : 1

27_21 (78468)
- x < 0.2 : 0
- x >= 0.2 : 1

27_40 **(79665)**
- x <= 0.4 : 0
- x > 0.4 : 1

27_41 (79186)
- x < 0.4 : 0
- x >= 0.4 : 1

27_60 (77511)
- x <= 0.6 : 0
- x > 0.6 : 1

27_61 **(79665)**
- x < 0.6 : 0
- x >= 0.6 : 1

27_80 (77272)
- x <= 0.8 : 0
- x > 0.8 : 1

27_81 (77511)
- x < 0.8 : 0
- x >= 0.8 : 1