In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# 1. Load the dataset and separate the target column ('grade') from the features.
df = pd.read_csv(
    "D:/1.UNI 5th Semester/6. Advance DataBase/ADVANCE_DB PROJECT/ds_project/mellyapp/ml/mellyapp_dataset_OLAP.csv"
)
y = df['grade']
X = df.drop(columns=['grade'])

# 2. Hold out 20% of the rows for the final evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Pipeline: mean-impute missing values, standardize, then fit the XGBoost regressor.
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', XGBRegressor(objective='reg:squarederror', random_state=42)),
])

# 4. Grid search (optional -> the defaults could be used directly to keep things simple).
#    Every list below holds a single value, so exactly one candidate is cross-validated.
param_grid = {
    'model__n_estimators': [100],
    'model__max_depth': [3],
    'model__learning_rate': [0.1],
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid.fit(X_train, y_train)

# 5. Evaluation: root-mean-squared error on the held-out test split.
y_pred = grid.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"XGBoost RMSE: {rmse:.2f}")

# Saving the fitted pipeline is currently disabled.
# NOTE(review): the snippet below relies on `__file__`, which is normally not
# defined when code runs as a notebook cell — confirm before re-enabling.
# import os

# BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# model_path = os.path.join(BASE_DIR, 'xgboost_model.pkl')

# joblib.dump(grid.best_estimator_, model_path)



XGBoost RMSE: 15.21


In [4]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# NOTE(review): this cell repeats the same load/split/fit/evaluate steps as the
# In [1] cell; consider keeping only one of them.

# 1. Load the dataset and separate the target column ('grade') from the features.
df = pd.read_csv(
    "D:/1.UNI 5th Semester/6. Advance DataBase/ADVANCE_DB PROJECT/ds_project/mellyapp/ml/mellyapp_dataset_OLAP.csv"
)
y = df['grade']
X = df.drop(columns=['grade'])

# 2. Hold out 20% of the rows for the final evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Pipeline: mean-impute missing values, standardize, then fit the XGBoost regressor.
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', XGBRegressor(objective='reg:squarederror', random_state=42)),
])

# 4. Grid search (optional -> the defaults could be used directly to keep things simple).
#    Every list below holds a single value, so exactly one candidate is cross-validated.
param_grid = {
    'model__n_estimators': [100],
    'model__max_depth': [3],
    'model__learning_rate': [0.1],
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid.fit(X_train, y_train)

# 5. Evaluation: root-mean-squared error on the held-out test split.
y_pred = grid.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"XGBoost RMSE: {rmse:.2f}")

# Saving the fitted pipeline is currently disabled.
# NOTE(review): the snippet below relies on `__file__`, which is normally not
# defined when code runs as a notebook cell — confirm before re-enabling.
# import os

# BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# model_path = os.path.join(BASE_DIR, 'xgboost_model.pkl')

# joblib.dump(grid.best_estimator_, model_path)



XGBoost RMSE: 15.21


In [6]:
# train_xgboost_model.py

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
import joblib

# Load data
# NOTE(review): absolute, machine-specific path — consider reading it from a
# configurable data directory so the notebook runs on other machines.
df = pd.read_csv("D:/1.UNI 5th Semester/6. Advance DataBase/ADVANCE_DB PROJECT/ds_project/mellyapp/ml/OLAP_FINAL.csv")

# Step 1: Data Cleaning
# Drop rows with missing values, then exact duplicate rows. Written as one
# chained expression instead of `inplace=True` mutation.
df = df.dropna().drop_duplicates()

# Step 2: EDA (Quick)
print("=== Data Summary ===")
print(df.describe())
print("\n=== Correlation with Grade ===")
print(df.corr(numeric_only=True)["grade"].sort_values(ascending=False))

# Step 3: Feature and Target Separation
# Exclude the target ("grade") and the "stu_id" / "course_id" columns from the features.
X = df.drop(columns=["grade", "stu_id", "course_id"])
y = df["grade"]

# Step 4: Preprocessing
# "number" matches every numeric dtype, not only int64/float64. ColumnTransformer
# drops any column that is not listed in a transformer (remainder="drop" is the
# default), so a numeric column with another dtype (e.g. int32, float32) would
# otherwise be silently excluded from the model.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns: mean-impute, then standardize.
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler())
        ]), numeric_features),

        # Categorical columns: fill with the most frequent value, then one-hot
        # encode; categories unseen during fit are ignored at predict time.
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore"))
        ]), categorical_features)
    ]
)

# Step 5: Model Pipeline
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42))
])

# Step 6: Train-Test Split (20% held out, seeded)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train Model
model.fit(X_train, y_train)

# Step 8: Evaluation on the held-out split
y_pred = model.predict(X_test)
print(f"R^2 Score: {r2_score(y_test, y_pred):.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")

# # Step 9: Save Model
# joblib.dump(model, "xgboost_model.pkl")
# print("Model saved as xgboost_model.pkl")


=== Data Summary ===
           stu_id   course_id      gender         age  total_activities  \
count  479.000000  479.000000  479.000000  479.000000        479.000000   
mean    50.678497    3.004175    0.396660   21.640919          2.791232   
std     28.764245    1.418638    0.489716    2.160039          0.551004   
min      1.000000    1.000000    0.000000   18.000000          1.000000   
25%     26.000000    2.000000    0.000000   20.000000          3.000000   
50%     51.000000    3.000000    0.000000   22.000000          3.000000   
75%     75.000000    4.000000    1.000000   23.000000          3.000000   
max    100.000000    5.000000    1.000000   25.000000          3.000000   

       total_duration_minutes  quiz_count  individual_assignment_count  \
count              479.000000  479.000000                   479.000000   
mean                90.394572    0.352818                     1.768267   
std                 34.920364    0.775371                     1.377705   
min    