In [1]:
# Survivors on the Titanic
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset: read the train and test CSV files (paths are relative to
# the current working directory) into two DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)


In [2]:
#Prepare data
# Build the model inputs from the raw train/test frames. Every step derives a
# new object instead of mutating train_data in place, so this cell can be
# re-run on its own without failing on an already-removed 'Survived' column.
y_full = train_data['Survived']
train_features = train_data.drop(columns='Survived')
# Number of labelled rows; marks the train/test boundary after the concat below
# (replaces the previously hard-coded 891).
n_train = len(train_features)

# Stack train and test so both receive exactly the same column handling.
full_data = pd.concat([train_features, test_data])

# remove the features that are not used by the models below
# NOTE(review): 'PassengerId' is not in this list; if the CSVs follow the usual
# Kaggle Titanic schema it remains in the feature matrix -- confirm that this
# is intended.
drop_columns = ['Name','Age','Ticket','Cabin','SibSp','Parch','Embarked']
full_data = full_data.drop(columns=drop_columns)

# One-hot encode 'Sex', then replace any remaining missing values with 0.0.
full_data = pd.get_dummies(full_data, columns=['Sex'])
full_data = full_data.fillna(value=0.0)

# Split the stacked matrix back into its labelled and unlabelled parts.
feature_matrix = full_data.values
X_train_full = feature_matrix[:n_train]
X_test = feature_matrix[n_train:]
assert len(X_train_full) == len(y_full), "feature/label row counts differ"

# scale data: the scaler is fitted on all labelled rows (before the validation
# split is taken) and then applied unchanged to the test rows.
scaler = MinMaxScaler()
X_train_full = scaler.fit_transform(X_train_full)
X_test = scaler.transform(X_test)

# divide the labelled data into train and validation parts
state = 12
test_size = 0.30
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_full, test_size=test_size, random_state=state
)


In [3]:
#Build Gradient Boosting classifier - learning rate optimization
# Fit one classifier per candidate learning rate, keeping every other
# hyper-parameter fixed, and print the accuracy on the training and the
# validation split for each candidate.
lr_list = [0.05, 0.075 , 0.1 , 0.25 , 0.5 , 0.75 , 1 ]
for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=2,
        max_depth=2,
        random_state=0,
    )
    gb_clf.fit(X_train, y_train)

    train_acc = gb_clf.score(X_train, y_train)
    val_acc = gb_clf.score(X_val, y_val)

    print("learning rate : ",learning_rate)
    print("accuracy (training): {0:.3f}".format(train_acc))
    print("accuracy (validation): {0:.3f}".format(val_acc))

learning rate :  0.05
accuracy (training): 0.801
accuracy (validation): 0.731
learning rate :  0.075
accuracy (training): 0.814
accuracy (validation): 0.731
learning rate :  0.1
accuracy (training): 0.812
accuracy (validation): 0.724
learning rate :  0.25
accuracy (training): 0.835
accuracy (validation): 0.750
learning rate :  0.5
accuracy (training): 0.864
accuracy (validation): 0.772
learning rate :  0.75
accuracy (training): 0.875
accuracy (validation): 0.754
learning rate :  1
accuracy (training): 0.875
accuracy (validation): 0.739


In [5]:
#Prediction
# Refit a gradient boosting classifier with learning_rate=0.5 (all other
# settings as in the sweep cell) and evaluate its predictions on the
# validation split.
gb_clf2 = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.5,
    max_features=2,
    max_depth=2,
    random_state=0,
)
gb_clf2.fit(X_train, y_train)
Predictions = gb_clf2.predict(X_val)

# Compute both summaries first, then print each under its heading.
val_confusion = confusion_matrix(y_val, Predictions)
val_report = classification_report(y_val, Predictions)
print('Conffusion Matrix:', val_confusion, sep='\n')
print('Classification Report:', val_report, sep='\n')

Conffusion Matrix:
[[142  19]
 [ 42  65]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.88      0.82       161
           1       0.77      0.61      0.68       107

    accuracy                           0.77       268
   macro avg       0.77      0.74      0.75       268
weighted avg       0.77      0.77      0.77       268



In [None]:
# Optional one-off setup: uncomment the line below to install xgboost if it is
# missing from the environment. It is left commented out, so a normal run of
# the notebook performs no installation.
#!pip install xgboost



You should consider upgrading via the 'C:\Users\ASUS\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [8]:
#Toy example 1(XGBoost)
# Baseline: an XGBoost classifier with default settings, fitted on the training
# split and scored on the validation split.

from xgboost import XGBClassifier

xgb_clf = XGBClassifier().fit(X_train, y_train)
score = xgb_clf.score(X_val, y_val)
print(score)

0.7388059701492538


In [11]:
#Toy example 2a
#k-fold CV of xgboost model
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, cross_val_score

# Load the data: columns 0-7 are used as features, column 8 as the target.
dataset = loadtxt('diabetes.csv', delimiter=',')
X, Y = dataset[:, :8], dataset[:, 8]

# Cross-validate a default XGBoost classifier with 10 shuffled folds
# (fixed seed so the fold assignment is repeatable).
model = XGBClassifier()
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)

# Report mean and standard deviation of the per-fold scores as percentages
# (the label inside the message is Vietnamese for "standard deviation").
mean_pct = results.mean() * 100
std_pct = results.std() * 100
print("Accuracy: %.2f%% ( độ lệch chuẩn: %.2f%%)" % (mean_pct, std_pct))

Accuracy: 72.66% ( độ lệch chuẩn: 4.55%)


In [15]:
#stratified k-fold CV of xgboost model
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Load the data and split it: columns 0-7 are used as features, column 8 as
# the target.
dataset = loadtxt('diabetes.csv', delimiter=',')
X, Y = dataset[:, :8], dataset[:, 8]

# Cross-validate a default XGBoost classifier with 10 shuffled, stratified
# folds (fixed seed so the fold assignment is repeatable).
model = XGBClassifier()
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)

# Report mean and standard deviation of the per-fold scores as percentages.
mean_pct = results.mean() * 100
std_pct = results.std() * 100
print("Accuracy: %.2f%% (%.2f%%)" % (mean_pct, std_pct))

Accuracy: 73.57% (3.40%)
