In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

In [2]:
X=np.load('titanic_X_train.npy')
y=np.load('titanic_y_train.npy')

In [3]:
X.shape,y.shape

((889, 27), (889,))

In [11]:
c1=DecisionTreeClassifier(random_state=1,max_depth=4)
c2=LogisticRegression(random_state=1)
c3=GaussianNB()
eclf=VotingClassifier(estimators=[('rf',c1),('lr',c2),('gnb',c3)],voting='hard')

In [12]:
from sklearn.model_selection import cross_val_score
cross_val_score(c1,X,y,cv=5).mean()

0.8223068621849807

In [13]:
cross_val_score(c2,X,y,cv=5).mean()

0.8290420872214816

In [14]:
cross_val_score(c3,X,y,cv=5).mean()

0.4600139655938551

In [15]:
cross_val_score(eclf,X,y,cv=5).mean()

0.8222941661905668

In [16]:
eclf1=VotingClassifier(estimators=[('rf',c1),('lr',c2)],voting='hard')
cross_val_score(eclf1,X,y,cv=5).mean()

0.8301783787215135

<b><h3>하이퍼파라미터 튜닝

In [17]:
c1=DecisionTreeClassifier(random_state=1,max_depth=4)
c2=LogisticRegression(random_state=1)
eclf1=VotingClassifier(estimators=[('dt',c1),('lr',c2)],voting='hard')

In [None]:
#실행 X 의사결정트리 파라미터값 표현
DecisionTreeClassifier(
*, criterion="gini", 
splitter="best", 
max_depth=None, 
min_samples_split=2, 
min_samples_leaf=1, 
min_weight_fraction_leaf=0, 
max_features=None, 
random_state=None,
max_leaf_nodes=None, 
min_impurity_decrease=0, 
class_weight=None, 
ccp_alpha=0)

In [None]:
#실행 X 로지스틱 입력 파라미터값 표현
LogisticRegression(
penalty='l2', *, 
dual=False, 
tol=0.0001, 
C=1.0, 
fit_intercept=True, 
intercept_scaling=1, 
class_weight=None, 
random_state=None, 
solver='lbfgs', 
max_iter=100, 
multi_class='auto', 
verbose=0, 
warm_start=False, 
n_jobs=None, 
l1_ratio=None)

In [18]:
pr={
    'lr__solver':['liblinear'],'lr__C':[0.1,5.0,7.0,10.0,15.0,20.0,100.0],
    'dt__criterion':['gini','entropy'],
    'dt__max_depth':[10,8,7,6,5,4,3,2],
    'dt__min_samples_leaf':[1,2,3,4,5,6,7,8,9]
}

In [19]:
from sklearn.model_selection import GridSearchCV
gr=GridSearchCV(eclf1,param_grid=pr,cv=5)
gr.fit(X,y)
gr.best_score_

0.8425569732749316

In [20]:
gr.best_params_

{'dt__criterion': 'gini',
 'dt__max_depth': 10,
 'dt__min_samples_leaf': 5,
 'lr__C': 5.0,
 'lr__solver': 'liblinear'}

In [21]:
c1=DecisionTreeClassifier(criterion='gini',random_state=1,max_depth=10,min_samples_leaf=5)
c2=LogisticRegression(solver='liblinear',C=5.0,random_state=1)
end_e=VotingClassifier(estimators=[('dt',c1),('lr',c2)],voting='hard')
cross_val_score(end_e,X,y,cv=5).mean()

0.8425569732749316

<b><h3> data 로드

In [22]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

In [25]:
X=np.load('titanic_X_train.npy')
y=np.load('titanic_y_train.npy')

In [31]:
eclf=BaggingClassifier(LogisticRegression(random_state=1),oob_score=True)
cross_val_score(eclf,X,y,cv=5).mean()

  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]


0.8189106836792991

In [32]:
eclf=BaggingClassifier(DecisionTreeClassifier(random_state=1),oob_score=True,random_state=42)
cross_val_score(eclf,X,y,cv=5).mean()

  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]


0.8290674792103092

In [34]:
from sklearn.model_selection import cross_val_score
eclf=BaggingClassifier(LogisticRegression(random_state=1),oob_score=True)
cross_val_score(eclf,X,y,cv=5).mean()

  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]


0.824535009204596

In [None]:
#Bagging 파라미터값 입력X
BaggingClassifier(
base_estimator=None, 
n_estimators=10, *, 
max_samples=1, 
max_features=1, 
bootstrap=True, 
bootstrap_features=False, 
oob_score=False, 
warm_start=False, 
n_jobs=None, 
random_state=None, 
verbose=0)

In [36]:
pr={
    'n_estimators':[10,20,30,40,50,55],
    'max_samples':[0.5,0.6,0.7,0.8,0.9,1],
}
from sklearn.model_selection import GridSearchCV
gr=GridSearchCV(eclf,pr,cv=5)
gr.fit(X,y)
gr.best_score_

  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB scores. "

0.8324128737383356

In [37]:
gr.best_params_

{'max_samples': 0.9, 'n_estimators': 20}

In [38]:
gr.best_estimator_.oob_score_

0.8200224971878515

In [None]:
gr.best_estimator_.predict(X)

In [41]:
from sklearn.ensemble import RandomForestClassifier
en=RandomForestClassifier(n_estimators=10,max_features=2,oob_score=True) #n_estimators : 만들갯수 설정 
cross_val_score(en,X,y,cv=5).mean()



0.7919507395416746

In [None]:
pr={
    'n_estimators':[10,20,30,40,50,55],
    'max_features':[1,2,3,4,5,6,10,15,len(X[0])]
}
gr=GridSearchCV(en,pr,cv=5)
gr.fit(X,y)
gr.best_score_

In [46]:
gr.best_params_

{'max_features': 15, 'n_estimators': 40}

In [47]:
gr.best_estimator_.oob_score_

0.8188976377952756

<b><h3>부스팅

In [48]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier #분류니깐 의사결정트리
X=np.load('titanic_X_train.npy')
y=np.load('titanic_y_train.npy')

In [51]:
from sklearn.ensemble import AdaBoostClassifier
ecf=AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=500)

In [54]:
from sklearn.model_selection import cross_val_score
cross_val_score(ecf,X,y,cv=5).mean()

0.7874500095219958

In [53]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=500)
cross_val_score(rf,X,y,cv=5).mean()

0.8031930425950613

In [None]:
#AdaBoost 파라미터 실행 X
AdaBoostClassifier(
base_estimator=None, *, #base가 전달받은 이름
n_estimators=50, 
learning_rate=1, 
algorithm="SAMME.R", 
random_state=None)

In [None]:
pr={
    'n_estimators':[10],
    'base_estimator__max_depth':[2]
}

<b><h3>AdaBoost 의사결정트리(깊이2)

In [None]:
X,y

In [60]:
X.shape

(889, 27)

In [56]:
ecf=AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=500)
cross_val_score(ecf,X,y,cv=5).mean()

0.784072875007935

In [58]:
ecf.fit(X,y)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                   n_estimators=500)

In [62]:
len(ecf.feature_importances_)

27

In [76]:
ecf=AdaBoostClassifier(random_state=42)
cross_val_score(ecf,X,y,cv=5).mean()

0.8133307941344506

<b><h3>RandomForest(깊이2)

In [63]:
rf=RandomForestClassifier(max_depth=2, n_estimators=1000,oob_score=True) #배깅알고리즘이라서 oob 쓸수있다
cross_val_score(rf,X,y,cv=5).mean()

0.7829048435218688

In [75]:
rf=RandomForestClassifier(random_state=42) #배깅알고리즘이라서 oob 쓸수있다
cross_val_score(rf,X,y,cv=5).mean()

0.7953342220529422

In [64]:
rf.fit(X,y)

RandomForestClassifier(max_depth=2, n_estimators=1000, oob_score=True)

In [65]:
rf.oob_score_

0.7907761529808774

In [66]:
from sklearn.ensemble import GradientBoostingClassifier
gdb=GradientBoostingClassifier()
cross_val_score(gdb,X,y,cv=5).mean()

0.8245730971878371

In [67]:
from sklearn.ensemble import GradientBoostingClassifier
gdb=GradientBoostingClassifier(learning_rate=0.2,n_estimators=500,random_state=42)
cross_val_score(gdb,X,y,cv=5).mean()

0.8144226496540341

In [68]:
gdb=GradientBoostingClassifier(random_state=42)
cross_val_score(gdb,X,y,cv=5).mean()

0.8245730971878371

In [74]:
from sklearn.ensemble import GradientBoostingClassifier
gdb=GradientBoostingClassifier(random_state=42)
cross_val_score(gdb,X,y,cv=5).mean()

0.8245730971878371

<b><h3>GradientBoosting

In [69]:
from sklearn.ensemble import GradientBoostingClassifier
gdb=GradientBoostingClassifier(learning_rate=0.2,n_estimators=500,random_state=42)
cross_val_score(gdb,X,y,cv=5).mean()

0.8144226496540341

In [73]:
from sklearn.ensemble import GradientBoostingClassifier
gdb=GradientBoostingClassifier(random_state=42)
cross_val_score(gdb,X,y,cv=5).mean()

0.8245730971878371

<b><h3>HistGradientBoosting

In [71]:
from sklearn.ensemble import HistGradientBoostingClassifier
hgd=HistGradientBoostingClassifier(random_state=42)
cross_val_score(hgd,X,y,cv=5).mean()

0.8357963562496031

<b><h3>ExtraTrees

In [72]:
from sklearn.ensemble import ExtraTreesClassifier
et=ExtraTreesClassifier(random_state=42)
cross_val_score(et,X,y,cv=5).mean()

0.8065638291119152

<b><h3>XGBoost

In [77]:
from xgboost import XGBClassifier
xg=XGBClassifier(tree_method='hist',random_state=42)
cross_val_score(xg,X,y,cv=5).mean()

0.8268012442074525

<b><h3>LightGBM

In [78]:
from lightgbm import LGBMClassifier
lg=LGBMClassifier(random_state=42)
cross_val_score(lg,X,y,cv=5).mean()

0.831321018218752