In [4]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Read the exported CSV into a DataFrame.
data_path = r"D:\1.UNI 5th Semester\6. Advance DataBase\ADVANCE_DB PROJECT - Copy (3)\ds_project\ruthapp\ml\Ruth_OLAPcsv.csv"
df = pd.read_csv(data_path)

# The regression target is the 'grade' column. The feature matrix is every
# other column except 'name' and 'stu_id', which are treated as irrelevant
# for modelling.
y = df['grade']
X = df.drop(['grade', 'name', 'stu_id'], axis="columns")

# Columns that get one-hot encoded, and the complementary list of all
# remaining feature columns (in their original order).
# NOTE: numerical_cols is not referenced again in this cell; the remaining
# columns reach the model through remainder='passthrough' below.
categorical_cols = ['gender', 'type_name', 'course_name']
numerical_cols = [name for name in X.columns if name not in categorical_cols]

# Preprocessing: one-hot encode the categorical columns (categories not seen
# during fit are ignored instead of raising) and forward every other column
# unchanged.
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough',
)

# Full modelling pipeline: the column preprocessing step followed by an
# XGBoost regressor trained with a squared-error objective.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    (
        'model',
        XGBRegressor(
            objective='reg:squarederror',
            # Boosting configuration.
            n_estimators=100,
            max_depth=4,
            learning_rate=0.1,
            # Fixed seed so repeated runs are reproducible.
            random_state=42,
        ),
    ),
])

# Hold out 20% of the rows for evaluation; the seed keeps the split stable
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit on the training portion, then predict the held-out rows
# (fit() hands back the fitted pipeline, so the calls can be chained).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report hold-out metrics.
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

# Persist the whole fitted pipeline (preprocessing + model) with joblib.
model_path = r"D:\1.UNI 5th Semester\6. Advance DataBase\ADVANCE_DB PROJECT - Copy (3)\ds_project\ruthapp\ml\student_effectiveness_xgb.pkl"
joblib.dump(pipeline, model_path)
print(f"Model saved to {model_path}")


MSE: 219.8004913330078
R2 Score: -0.04997825622558594
Model saved to D:\1.UNI 5th Semester\6. Advance DataBase\ADVANCE_DB PROJECT - Copy (3)\ds_project\ruthapp\ml\student_effectiveness_xgb.pkl
