### 1. Import and data


In [1]:
import sklearn
import pandas as pd

SEED = 42

In [2]:
# Read data: rxnach/student-stress-factors-a-comprehensive-analysis
data_path = "./StressLevelDataset.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,anxiety_level,self_esteem,mental_health_history,depression,headache,blood_pressure,sleep_quality,breathing_problem,noise_level,living_conditions,...,basic_needs,academic_performance,study_load,teacher_student_relationship,future_career_concerns,social_support,peer_pressure,extracurricular_activities,bullying,stress_level
0,14,20,0,11,2,1,2,4,2,3,...,2,3,2,3,3,2,3,3,2,1
1,15,8,1,15,5,3,1,4,3,1,...,2,1,4,1,5,1,4,5,5,2
2,12,18,1,14,2,1,2,2,2,2,...,2,2,3,3,2,2,3,2,2,1
3,16,12,1,15,4,3,1,3,4,2,...,2,2,4,1,4,1,4,4,5,2
4,16,28,0,7,2,3,5,1,3,2,...,3,4,3,1,2,1,5,0,5,1


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

X = df.drop(columns=["stress_level"])
y = df["stress_level"]

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

### 2. Analyse models in comparison with XGBoost


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier, GradientBoostingClassifier,
)
import xgboost as xgb

# Models
logistic_model = LogisticRegression(
    penalty="l2",
    max_iter=100,
    verbose=False,
)
decision_tree_model = DecisionTreeClassifier()

random_forest_model = RandomForestClassifier(n_estimators=250)
ada_boost_model = AdaBoostClassifier(n_estimators=250)  # thử k=5 trước
gradient_boost_model = GradientBoostingClassifier(n_estimators=250)
xg_boost_model = xgb.XGBClassifier(
    n_estimators = 250
)

# Model dict
models = {
    "Logistic Regression": logistic_model,
    "Decision Tree": decision_tree_model,
    "Random Forest": random_forest_model,
    "Ada Boost": ada_boost_model,
    "Gradient Boost": gradient_boost_model,
    "XGBoost": xg_boost_model,
}

In [5]:
import time

# Evaluating models
# TODO: More rigorous stuff here (latency, throughput, FLOPs)
for model_name, model in models.items():
    start = time.time()
    model.fit(X_train, y_train)
    end = time.time()
    print(f"{model_name} took {round((end - start)*1000, 2)} miliseconds to train.")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression took 62.87 miliseconds to train.
Decision Tree took 4.39 miliseconds to train.
Random Forest took 428.27 miliseconds to train.
Ada Boost took 507.25 miliseconds to train.
Gradient Boost took 1503.77 miliseconds to train.
XGBoost took 496.62 miliseconds to train.


In [75]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


def validation_and_visualize(
    model, model_name: str,
    X_test, y_test,
    visualized: bool = True,
):
    y_pred = model.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))

    # Confusion Matrix
    if visualized:
        print("\nClassification Report:\n", classification_report(y_test, y_pred))
        
        plt.figure(figsize=(6,4))
        sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues")
        plt.xlabel("Dự đoán")
        plt.ylabel("Thực tế")
        plt.title(f"Confusion Matrix - {model_name}")
        plt.show()

In [None]:
for model_name, model in models.items():
    validation_and_visualize(
        model, model_name,
        X_test = X_test, y_test = y_test,
        visualized=False
    )

Accuracy: 0.8818181818181818
Accuracy: 0.8863636363636364
Accuracy: 0.8727272727272727
Accuracy: 0.8772727272727273
Accuracy: 0.8818181818181818
Accuracy: 0.8681818181818182


### 3. In-depth stuff of xgboost lib


In [None]:
# Main stuff here
'''
Official Doc: https://xgboost.readthedocs.io
Content:
    - Matrix dataset setting
    - Param list / dict
    - Training
    - Inferencing and prediction
    - Plotting importance and stuff
    - Dumping the model and its feature map into txt file
    - Save / load a model
'''
import xgboost
import numpy as np

In [None]:
### Get the data matrix (look up doc for supported data types)
try:
    # Load from file
    dtrain = xgb.DMatrix('train.buffer')
    dtest = xgboost.DMatrix('test.buffer')
except:
    dtrain = xgboost.DMatrix(X_train, label=y_train)
    dtrain.save_binary('train.buffer')
    dtest = xgboost.DMatrix(X_test, label=y_test)
    dtest.save_binary('test.buffer')

In [87]:
### Parameter list
param = {
    'max_depth': 2,
    
    'eta': 1, # Learning rate
    'lambda': 0.2, # L2 regularizer rate
    'alpha': 0.2, # L1 regularizer rate (sparse repr.)

    'objective': 'multi:softprob', # Objective could be 'binary:logistic', 'multi:softmax' but would not work well with AUC
    'num_class': 3,
    'eval_metric': ['mlogloss', 'merror', 'auc'], # logloss is only for binary
    'nthread': 4,
}

# Validation set (code from doc)
evallist = [(dtrain, 'train'), (dtest, 'eval')]

In [88]:
### Train model
num_round = 10
bst = xgb.train(
    param,
    dtrain,
    num_round,
    evallist,
    early_stopping_rounds=10,
)

[0]	train-mlogloss:0.36031	train-merror:0.10000	train-auc:0.98706	eval-mlogloss:0.38590	eval-merror:0.12273	eval-auc:0.97487
[1]	train-mlogloss:0.24637	train-merror:0.09091	train-auc:0.98971	eval-mlogloss:0.27581	eval-merror:0.12727	eval-auc:0.98555
[2]	train-mlogloss:0.20571	train-merror:0.08295	train-auc:0.99071	eval-mlogloss:0.26070	eval-merror:0.12273	eval-auc:0.98306
[3]	train-mlogloss:0.18537	train-merror:0.07841	train-auc:0.99159	eval-mlogloss:0.25807	eval-merror:0.12727	eval-auc:0.98271
[4]	train-mlogloss:0.16623	train-merror:0.06591	train-auc:0.99329	eval-mlogloss:0.26412	eval-merror:0.13636	eval-auc:0.98213
[5]	train-mlogloss:0.15318	train-merror:0.05568	train-auc:0.99482	eval-mlogloss:0.26209	eval-merror:0.12727	eval-auc:0.98261
[6]	train-mlogloss:0.14671	train-merror:0.04318	train-auc:0.99531	eval-mlogloss:0.25589	eval-merror:0.11818	eval-auc:0.98313
[7]	train-mlogloss:0.13660	train-merror:0.05000	train-auc:0.99613	eval-mlogloss:0.25698	eval-merror:0.10909	eval-auc:0.98375




In [None]:
### Save and load models
# Save model
bst.save_model('xgb_0001.json') # could be .model, .json, etc.

# Get model
bst.load_model('xgb_0001.json')

In [None]:
### Validation
# Use predict() method with DMatrix (converted to supported type)
y_pred = bst.predict(dtest).argmax(axis = 1)

# Accuracy and Confusion Matrix
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

plt.figure(figsize=(6,4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues")
plt.xlabel("Dự đoán")
plt.ylabel("Thực tế")
plt.title(f"Confusion Matrix - {model_name}")
plt.show()

### 4. Other stuff (subsampling, summarizing, plot error bar, etc.)


In [54]:
# Source: https://vtitech.vn/xgboost-bai-14-tuning-subsample/
'''
Content:
    - XGBoost model
    - Subsampling and K-fold with Grid Search
    - Summarizing results
    - Plot error bar (currently unavailable in this notebook)
'''
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot

# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y) # y chung

# grid search
grid_model = XGBClassifier()
subsample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
param_grid = dict(subsample=subsample)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(grid_model, param_grid, scoring="accuracy", n_jobs=1, cv=kfold, verbose=1)
grid_result = grid_search.fit(X, label_encoded_y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# plot
pyplot.errorbar(subsample, means, yerr=stds)
pyplot.title("XGBoost subsample vs accuracy")
pyplot.xlabel('subsample')
pyplot.ylabel('Accuracy')
pyplot.savefig('subsample.png')

Fitting 10 folds for each of 9 candidates, totalling 90 fits
Best: 0.886364 using {'subsample': 0.7}
0.884545 (0.027589) with: {'subsample': 0.1}
0.884545 (0.024747) with: {'subsample': 0.2}
0.875455 (0.014113) with: {'subsample': 0.3}
0.881818 (0.022268) with: {'subsample': 0.4}
0.880000 (0.025324) with: {'subsample': 0.5}
0.873636 (0.033141) with: {'subsample': 0.6}
0.886364 (0.025793) with: {'subsample': 0.7}
0.880909 (0.023514) with: {'subsample': 0.8}
0.885455 (0.023070) with: {'subsample': 1.0}
