In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

In [None]:
# Load the raw dataset from the CSV file.
df = pd.read_csv('datasets/student_mental_health_dataset.csv')

# Remove duplicate rows and rows with missing values.
# The explicit .copy() makes df_clean an independent frame, so the column
# assignments below are unambiguous writes to df_clean and cannot trigger
# pandas' SettingWithCopy (chained-assignment) warning.
df_clean = df.drop_duplicates().dropna().copy()

# Encode the categorical columns as integer codes.
# One encoder is kept per column so the fitted label <-> code mapping of each
# column stays available after this cell.
# NOTE(review): LabelEncoder numbers labels in sorted order; if Diet_Quality or
# Mental_Health_Status are ordinal, those codes may not follow the intended
# ranking -- confirm against the dataset.
le_gender = LabelEncoder()
le_diet = LabelEncoder()
le_mhs = LabelEncoder()

df_clean['Gender'] = le_gender.fit_transform(df_clean['Gender'])
df_clean['Diet_Quality'] = le_diet.fit_transform(df_clean['Diet_Quality'])
df_clean['Mental_Health_Status'] = le_mhs.fit_transform(df_clean['Mental_Health_Status'])

In [4]:
# Build the engineered features on a new frame; assign() returns a copy, so
# df_clean itself is left unchanged.
#   Sleep_Deficit       : 8 minus Sleep_Hours (presumably an 8-hour reference -- confirm)
#   Screen_Study_Ratio  : Screen_Time_Hours divided by Study_Hours
#   Activity_Efficiency : Physical_Activity_Minutes divided by Sleep_Hours
# The 1e-3 added to each denominator keeps it non-zero when the hours value is 0.
df_fe = df_clean.assign(
    Sleep_Deficit=lambda d: 8 - d['Sleep_Hours'],
    Screen_Study_Ratio=lambda d: d['Screen_Time_Hours'] / (d['Study_Hours'] + 1e-3),
    Activity_Efficiency=lambda d: d['Physical_Activity_Minutes'] / (d['Sleep_Hours'] + 1e-3),
)

# Select target and features: Stress_Level is the target, every other column
# of df_fe is used as a feature.
y = df_fe['Stress_Level']
X = df_fe.drop(columns=['Stress_Level'])

In [6]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Fit an XGBoost regressor (library defaults apart from the seed) on the
# training split; fit() returns the fitted estimator, which is bound to `xgb`.
xgb = XGBRegressor(random_state=42).fit(X_train, y_train)

# Predict the target for the held-out test rows.
y_pred = xgb.predict(X_test)

In [8]:
# Score the test-set predictions: mean absolute error, root mean squared error
# (square root of the MSE) and the coefficient of determination.
mae, rmse, r2 = (
    mean_absolute_error(y_test, y_pred),
    np.sqrt(mean_squared_error(y_test, y_pred)),
    r2_score(y_test, y_pred),
)

# One metric per line, rounded to two decimals.
print(
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)

MAE: 3.05
RMSE: 3.60
R² Score: -0.48


In [9]:
# 5-fold cross-validated R² of the regressor on the full feature matrix and
# target (X, y), i.e. including the rows held out in the test split.
cv_scores = cross_val_score(estimator=xgb, X=X, y=y, scoring='r2', cv=5)

# Report the per-fold scores and their mean.
print("R² Score tiap fold:", cv_scores)
print("Rata-rata R² Score CV:", cv_scores.mean())

R² Score tiap fold: [-0.28378391 -0.32915747 -0.27696657 -0.49552488 -0.18188798]
Rata-rata R² Score CV: -0.3134641647338867
