# Best Practice: Regression & Classification (Exam-ready Template)

## 1. Regression (XGBoost)
- ใช้ XGBoostRegressor (ต้องติดตั้ง xgboost ก่อน)
- เหมาะกับข้อมูล tabular, มี feature เยอะ, non-linear
- มี cross-validation, scaling, hyperparameter tuning

In [1]:
# 1.1 Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [2]:
# 1.2 Load your data
# df = pd.read_csv('your_data.csv')
# X = df.drop('target_column', axis=1)
# y = df['target_column']
# For demo, create synthetic data
np.random.seed(42)
X = pd.DataFrame(np.random.randn(200, 5), columns=[f'feat{i}' for i in range(5)])
y = X['feat0'] * 10 + X['feat1'] * -5 + np.random.randn(200) * 2

In [3]:
# 1.3 Train-test split & scaling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
# 1.4 Hyperparameter tuning (GridSearchCV)
params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.05, 0.1]
}
xgb = XGBRegressor(random_state=42)
grid = GridSearchCV(xgb, params, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train_scaled, y_train)
print('Best params:', grid.best_params_)

Best params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}


In [5]:
# 1.5 Evaluate
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test_scaled)
print('R2:', r2_score(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('MAE:', mean_absolute_error(y_test, y_pred))

R2: 0.9335973741810724
RMSE: 2.3658122645789597
MAE: 1.768578361722242


---
## 2. Classification (XGBoost or RandomForest)
- ใช้ XGBoostClassifier (ดีที่สุด) หรือ RandomForestClassifier
- มี cross-validation, scaling, hyperparameter tuning

In [6]:
# 2.1 Import libraries
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [7]:
# 2.2 Load your data
# df = pd.read_csv('your_data.csv')
# X = df.drop('target_column', axis=1)
# y = df['target_column']
# For demo, create synthetic data
np.random.seed(42)
X = pd.DataFrame(np.random.randn(200, 5), columns=[f'feat{i}' for i in range(5)])
y = (X['feat0'] + X['feat1'] > 0).astype(int)

In [8]:
# 2.3 Train-test split & scaling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# 2.4 Hyperparameter tuning (GridSearchCV)
params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.05, 0.1]
}
xgbc = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
grid = GridSearchCV(xgbc, params, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train_scaled, y_train)
print('Best params:', grid.best_params_)

Best params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [13]:
# 2.5 Evaluate
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test_scaled)
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.95
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        22
           1       0.94      0.94      0.94        18

    accuracy                           0.95        40
   macro avg       0.95      0.95      0.95        40
weighted avg       0.95      0.95      0.95        40

Confusion matrix:
[[21  1]
 [ 1 17]]


---
## Notes
- เปลี่ยนชื่อไฟล์, column, และ target ให้ตรงกับข้อมูลจริง
- ถ้าไม่มี xgboost ให้ใช้ RandomForestClassifier (API คล้ายกัน)
- สามารถใช้ cross_val_score() หรือ KFold ได้เหมือนกัน
- อย่าลืม scaling ก่อนเทรน
- สามารถ copy/paste แล้วแก้ไขได้ทันที

---
## (Optional) KFold Cross-Validation Example
ใช้สำหรับประเมินโมเดลแบบ cross-validation (ถ้าอยากโชว์ best practice)

In [14]:
# XGBoost Regression with KFold
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import numpy as np
# df = pd.read_csv('your_data.csv')
# X = df.drop('target_column', axis=1)
# y = df['target_column']
model = make_pipeline(StandardScaler(), XGBRegressor(random_state=42))
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='r2')
print('KFold R2 scores:', scores)
print('Mean R2:', np.mean(scores))

KFold R2 scores: [0.7952739  0.75502878 0.72546905 0.75290143 0.87693453]
Mean R2: 0.7811215400695801


In [15]:
# XGBoost Classification with KFold
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import numpy as np
# df = pd.read_csv('your_data.csv')
# X = df.drop('target_column', axis=1)
# y = df['target_column']
model = make_pipeline(StandardScaler(), XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
print('KFold accuracy scores:', scores)
print('Mean accuracy:', np.mean(scores))

KFold accuracy scores: [0.925 0.95  0.95  0.9   0.975]
Mean accuracy: 0.9400000000000001


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


---
## ตัวอย่างการนำโมเดล Regression ไปใช้กับข้อมูลใหม่
Predict เงินเดือนพนักงานใหม่จากโมเดลที่เทรนไว้

In [None]:
# ตัวอย่างการใช้โมเดล Regression ทำนายข้อมูลใหม่
new = pd.DataFrame({
    'YearsExperience': [6.5],
    'EducationLevel': [2],
    'PerformanceScore': [82]
})
# ถ้าใช้ PolynomialFeatures ต้อง transform ก่อน
# new_poly = poly.transform(new)
# pred_salary = model.predict(new_poly)
# print(f"Predicted Salary: {pred_salary[0]:.2f} ฿")

# ถ้าใช้ XGBoost หรือ LinearRegression ธรรมดา
# pred_salary = model.predict(new)
# print(f"Predicted Salary: {pred_salary[0]:.2f} ฿")

---
## ตัวอย่างการนำโมเดล Classification ไปใช้กับข้อมูลใหม่
Predict โรคหัวใจจากข้อมูลผู้ป่วยใหม่

In [None]:
# ตัวอย่างการใช้โมเดล Classification ทำนายข้อมูลใหม่
new_patient = pd.DataFrame({
    'Age': [54],
    'Sex': [1],
    'ChestPain': [2],
    'RestingBP': [145],
    'Cholesterol': [265],
    'FastingBS': [1],
    'MaxHR': [138]
})
# ถ้าใช้ StandardScaler ต้อง transform ก่อน
# new_scaled = scaler.transform(new_patient)
# pred = model.predict(new_scaled)
# print(f"Predicted: {pred[0]}")

# ถ้าใช้ RandomForest หรือ XGBoost ธรรมดา
# pred = model.predict(new_patient)
# print(f"Predicted: {pred[0]}")