In [39]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
import pickle
import matplotlib.pyplot as plt
import numpy as np

# Reference year used to turn the birth year column into an age.
current_year = 2025

# Feature columns fed to the model, in the order the model will see them.
selected_columns = [
    # numeric features
    'Tuoi',
    'So_Lan_Mang_Thai',
    'So_Lan_Sinh_Con',
    # history features
    'Tien_Su_Tranh_Thai',
    'Tien_Su_Gia_Dinh',
    'Tien_Su_MangThai',
    # demographic features
    'Nghe_Nghiep',
    'Trinh_Do_Hoc_Van',
    'Tinh_Trang_hon_nhan',
]

# Read the raw dataset and derive the age column ('Tuoi') from 'Nam_Sinh'.
df = pd.read_csv('Benh_nhom1.csv')
birth_year = df['Nam_Sinh']
df['Tuoi'] = current_year - birth_year

# Replace each categorical column with integer codes, keeping one fitted
# LabelEncoder per column so the same mapping can be reapplied later.
categorical_cols = [
    'Nghe_Nghiep',
    'Trinh_Do_Hoc_Van',
    'Tinh_Trang_hon_nhan',
    'Tien_Su_Tranh_Thai',
]
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    # fit_transform both learns the classes and overwrites the column in df.
    df[col] = le.fit_transform(df[col])

# Build the feature matrix / target, split, then scale numerical features.
#
# The scaler is fitted on the TRAINING rows only and merely applied to the
# validation rows. Fitting it on the whole dataframe before splitting would
# let validation-set statistics leak into preprocessing (and into the scaler
# that is persisted for later use).
numerical_cols = ['Tuoi', 'So_Lan_Mang_Thai', 'So_Lan_Sinh_Con']

# Prepare features and target. Copy so that scaling below never writes back
# into df; df (and X) keep the unscaled numerical values.
X = df[selected_columns].copy()
y = df['CoBenh'].map({'Y': 1, 'N': 0})

# .map() silently turns any label other than 'Y'/'N' into NaN; fail loudly
# here instead of passing NaN targets on to the classifier.
unmapped = y.isna()
if unmapped.any():
    bad_values = df.loc[unmapped, 'CoBenh'].unique().tolist()
    raise ValueError(
        f"Column 'CoBenh' must contain only 'Y' or 'N'; found: {bad_values!r}"
    )

# Split data into training and validation sets (same rows as before: the
# split depends only on the number of rows and random_state).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below do not target
# views of X.
X_train = X_train.copy()
X_val = X_val.copy()

# Scale numerical features: learn mean/std from the training split only.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])

# Hyper-parameters for the gradient-boosted classifier, collected in one
# place so they are easy to inspect or tweak.
xgb_params = {
    'n_estimators': 40,
    'max_depth': 3,
    'learning_rate': 0.05,
    'reg_lambda': 1.0,
    'reg_alpha': 0.5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    # Both metrics are recorded for every entry of eval_set at each round.
    'eval_metric': ["logloss", "error"],
}
model = XGBClassifier(**xgb_params)

# Order matters: the first entry (training split) is read back below as
# 'validation_0' and the second (validation split) as 'validation_1'.
eval_set = [(X_train, y_train), (X_val, y_val)]
model.fit(X_train, y_train, eval_set=eval_set)
# Per-round metric history recorded during fit: 'validation_0' is the
# training split and 'validation_1' the validation split (see eval_set).
results = model.evals_result()
train_history = results['validation_0']
val_history = results['validation_1']

train_loss, val_loss = train_history['logloss'], val_history['logloss']
train_error, val_error = train_history['error'], val_history['error']
epochs = range(len(train_loss))

# 'error' is the misclassification rate, so accuracy is its complement.
train_accuracy = [1 - err for err in train_error]
val_accuracy = [1 - err for err in val_error]

# Final metrics of the fitted model on the validation split.
y_pred = model.predict(X_val)
val_proba = model.predict_proba(X_val)
final_accuracy = accuracy_score(y_val, y_pred)
final_log_loss = log_loss(y_val, val_proba)

print(f"Độ chính xác: {final_accuracy:.4f}")
print(f"Log Loss: {final_log_loss:.4f}")

# Draw the loss and accuracy curves side by side and write them to disk.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(10, 5))

# (axes, train series, train label, validation series, validation label,
#  title, y-axis label) for each panel, left to right.
panels = [
    (
        loss_ax,
        train_loss, 'Train Loss',
        val_loss, 'Validation Loss',
        'Loss trong quá trình huấn luyện và validation',
        'Log Loss',
    ),
    (
        acc_ax,
        train_accuracy, 'Train Accuracy',
        val_accuracy, 'Validation Accuracy',
        'Độ chính xác trong quá trình huấn luyện và validation',
        'Accuracy',
    ),
]
for ax, train_curve, train_label, val_curve, val_label, title, y_label in panels:
    ax.plot(epochs, train_curve, label=train_label)
    ax.plot(epochs, val_curve, label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True)

fig.tight_layout()

# Save the plot, then release the figure.
fig.savefig('training_metrics.png')
plt.close(fig)

# Persist the trained booster through XGBoost's own serializer.
# NOTE(review): XGBoost documents '.json' / '.ubj' as the supported model
# file extensions; '.model' may trigger a format warning depending on the
# installed version -- confirm before changing, since loaders depend on
# this file name.
model.save_model('my_trained_model.model')

# Persist the fitted preprocessors so the same scaling and label encoding
# can be reapplied at prediction time.
preprocessor_files = (
    ('scaler.pkl', scaler),
    ('encoders.pkl', label_encoders),
)
for path, artifact in preprocessor_files:
    with open(path, 'wb') as fh:
        pickle.dump(artifact, fh)
    

[0]	validation_0-logloss:0.60489	validation_0-error:0.29717	validation_1-logloss:0.61412	validation_1-error:0.30818
[1]	validation_0-logloss:0.60146	validation_0-error:0.29717	validation_1-logloss:0.61122	validation_1-error:0.30818
[2]	validation_0-logloss:0.59826	validation_0-error:0.29717	validation_1-logloss:0.60769	validation_1-error:0.30818
[3]	validation_0-logloss:0.59559	validation_0-error:0.29717	validation_1-logloss:0.60504	validation_1-error:0.30818
[4]	validation_0-logloss:0.59231	validation_0-error:0.29717	validation_1-logloss:0.60356	validation_1-error:0.30818
[5]	validation_0-logloss:0.58957	validation_0-error:0.29717	validation_1-logloss:0.60173	validation_1-error:0.30818
[6]	validation_0-logloss:0.58769	validation_0-error:0.29717	validation_1-logloss:0.60132	validation_1-error:0.30818
[7]	validation_0-logloss:0.58495	validation_0-error:0.29717	validation_1-logloss:0.59943	validation_1-error:0.30818
[8]	validation_0-logloss:0.58281	validation_0-error:0.29717	validation_1

  self.get_booster().save_model(fname)
