In [23]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import pandas as pd 
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

df = pd.read_csv('D:/1.UNI 5th Semester/6. Advance DataBase/ADVANCE_DB PROJECT/ds_project/mellyapp/ml/melly_dataset_dropoutcsv.csv')
# 1. Fitur dan target
X = df.drop(columns=['dropout', 'grade'])  # Buang target dan grade
y = df['dropout']

# 2. Kolom kategorik dan numerik
categorical_features = ['gender']
numerical_features = ['age', 'total_activities_done', 'total_duration_minutes']

# 3. Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first'), categorical_features),
    ('num', StandardScaler(), numerical_features)
])

# 4. Pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', XGBClassifier(random_state=42))
])

# 5. Split dan training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
pipeline.fit(X_train, y_train)

# 6. Evaluasi
y_pred = pipeline.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 7. Cross-validation
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy (5-fold): {scores.mean():.2f}")


Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.92      0.87        79
           1       0.50      0.29      0.36        21

    accuracy                           0.79       100
   macro avg       0.66      0.60      0.62       100
weighted avg       0.76      0.79      0.77       100

Confusion Matrix:
 [[73  6]
 [15  6]]
Cross-Validation Accuracy (5-fold): 0.64


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier

# 1. Load CSV
df = pd.read_csv('D:/1.UNI 5th Semester/6. Advance DataBase/ADVANCE_DB PROJECT/ds_project/mellyapp/ml/late_dataset.csv')

# 2. Buat label gagal atau tidak (fail_flag)
df['fail_flag'] = df['grade'].apply(lambda x: 1 if x < 60 else 0)

# 3. Definisikan fitur dan target
X = df.drop(columns=['stu_id', 'grade', 'fail_flag'])
y = df['fail_flag']

# 4. Preprocessing
categorical_features = ['gender']
numerical_features = ['age', 'total_activities_done', 'total_available_activities',
                      'completion_ratio', 'avg_duration_minutes', 'num_late_submissions']

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first'), categorical_features),
    ('num', StandardScaler(), numerical_features)
])

# 5. Pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', XGBClassifier(random_state=42))
])

# 6. Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 7. Train model
pipeline.fit(X_train, y_train)

# 8. Evaluasi
y_pred = pipeline.predict(X_test)

print("=== Classification Report ===")
print(classification_report(y_test, y_pred))
print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

# 9. Cross-validation
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f"\n=== Cross-Validation Accuracy (5-fold): {cv_scores.mean():.2f}")


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.82      0.98      0.89        82
           1       0.00      0.00      0.00        18

    accuracy                           0.80       100
   macro avg       0.41      0.49      0.44       100
weighted avg       0.67      0.80      0.73       100


=== Confusion Matrix ===
[[80  2]
 [18  0]]

=== Cross-Validation Accuracy (5-fold): 0.67


In [12]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import pandas as pd 
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# 1. Load data
df = pd.read_csv('D:/1.UNI 5th Semester/6. Advance DataBase/ADVANCE_DB PROJECT/ds_project/mellyapp/ml/melly_dataset_dropoutcsv.csv')

# 2. Fitur dan target
X = df.drop(columns=['dropout'])  # Buang target dan grade
y = df['dropout']

# 3. Cek distribusi kelas untuk hitung scale_pos_weight
neg, pos = np.bincount(y)
scale = neg / pos
print(f"Jumlah kelas 0 (tidak dropout): {neg}")
print(f"Jumlah kelas 1 (dropout): {pos}")
print(f"scale_pos_weight: {scale:.2f}")

# 4. Kolom kategorik dan numerik
categorical_features = ['gender']
numerical_features = ['age', 'total_activities_done', 'total_duration_minutes']

# 5. Preprocessor
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first'), categorical_features),
    ('num', StandardScaler(), numerical_features)
])

# 6. Pipeline dengan XGBoost + scale_pos_weight
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', XGBClassifier(random_state=42, scale_pos_weight=scale))
])

# 7. Split dan training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
pipeline.fit(X_train, y_train)

# 8. Evaluasi
y_pred = pipeline.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 9. Cross-validation
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy (5-fold): {scores.mean():.2f}")

Jumlah kelas 0 (tidak dropout): 397
Jumlah kelas 1 (dropout): 103
scale_pos_weight: 3.85

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.48      0.59        79
           1       0.20      0.48      0.28        21

    accuracy                           0.48       100
   macro avg       0.49      0.48      0.44       100
weighted avg       0.65      0.48      0.53       100

Confusion Matrix:
 [[38 41]
 [11 10]]
Cross-Validation Accuracy (5-fold): 0.55


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 1. Load dataset
df = pd.read_csv('D:/1.UNI 5th Semester/6. Advance DataBase/ADVANCE_DB PROJECT/ds_project/mellyapp/ml/dropout_dataset.csv')

# 2. Ubah gender menjadi numerik
df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})

# 3. Pisahkan fitur dan target
X = df.drop(columns=['dropout'])  # fitur
y = df['dropout']                 # target dropout

# 4. Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 5. Buat pipeline model
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', XGBClassifier(random_state=42))
])

# 6. Training
pipeline.fit(X_train, y_train)

# 7. Evaluasi
y_pred = pipeline.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 8. Cross-validation
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy (5-fold): {scores.mean():.2f}")


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        79
           1       1.00      1.00      1.00        21

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

Confusion Matrix:
 [[79  0]
 [ 0 21]]
Cross-Validation Accuracy (5-fold): 1.00


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 1. Load dataset
df = pd.read_csv('D:/1.UNI 5th Semester/6. Advance DataBase/ADVANCE_DB PROJECT/ds_project/mellyapp/ml/dropout_dataset.csv')

# 2. Ubah gender ke numerik
df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})

# 3. Definisi fitur dan target
X = df.drop(columns=['dropout', 'grade'])  # grade dibuang biar nggak leakage
y = df['dropout']

# 4. Hitung scale_pos_weight untuk handling imbalance
jumlah_0 = (y == 0).sum()
jumlah_1 = (y == 1).sum()
scale_pos_weight = jumlah_0 / jumlah_1
print(f"Jumlah kelas 0 (tidak dropout): {jumlah_0}")
print(f"Jumlah kelas 1 (dropout): {jumlah_1}")
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

# 5. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 6. Pipeline model
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', XGBClassifier(
        random_state=42,
        scale_pos_weight=scale_pos_weight,
        use_label_encoder=False,
        eval_metric='logloss'
    ))
])

# 7. Training
pipeline.fit(X_train, y_train)

# 8. Evaluasi
y_pred = pipeline.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 9. Cross-validation
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy (5-fold): {scores.mean():.2f}")

Jumlah kelas 0 (tidak dropout): 397
Jumlah kelas 1 (dropout): 103
scale_pos_weight: 3.85


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.70      0.72        79
           1       0.11      0.14      0.12        21

    accuracy                           0.58       100
   macro avg       0.43      0.42      0.42       100
weighted avg       0.62      0.58      0.60       100

Confusion Matrix:
 [[55 24]
 [18  3]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Cross-Validation Accuracy (5-fold): 0.41


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [19]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# 1. Load Data
df = pd.read_csv("D:/1.UNI 5th Semester/6. Advance DataBase/ADVANCE_DB PROJECT/ds_project/mellyapp/ml/mellyapp_dataset_OLAP.csv")
X = df.drop('grade', axis=1)
y = df['grade']

# 2. Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Pipeline
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', XGBRegressor(objective='reg:squarederror', random_state=42))
])

# 4. Grid Search (opsional → bisa langsung pakai default jika mau simple)
param_grid = {
    'model__n_estimators': [100],
    'model__max_depth': [3],
    'model__learning_rate': [0.1]
}

grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

# 5. Evaluasi
y_pred = grid.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"XGBoost RMSE: {rmse:.2f}")


XGBoost RMSE: 15.21
