In [57]:
# load data
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import tree
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

In [32]:
train_df["Avg_Academic_Score"] = train_df[["Midterm_Score", "Final_Score", "Assignments_Avg", "Quizzes_Avg", "Projects_Score"]].mean(axis=1)
test_df["Avg_Academic_Score"] = test_df[["Midterm_Score", "Final_Score", "Assignments_Avg", "Quizzes_Avg", "Projects_Score"]].mean(axis=1)

In [33]:
train_df["Effective_Study_Hours"] = train_df["Study_Hours_per_Week"] / (train_df["Stress_Level (1-10)"] + 1)
test_df["Effective_Study_Hours"] = test_df["Study_Hours_per_Week"] / (test_df["Stress_Level (1-10)"] + 1)

In [58]:
train_df["Attendance (%)"].fillna(train_df["Attendance (%)"].median(), inplace=True)
train_df["Assignments_Avg"].fillna(train_df["Assignments_Avg"].median(), inplace=True)
train_df["Parent_Education_Level"].fillna("None", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df["Attendance (%)"].fillna(train_df["Attendance (%)"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df["Assignments_Avg"].fillna(train_df["Assignments_Avg"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method 

In [59]:
test_df["Attendance (%)"].fillna(test_df["Attendance (%)"].median(), inplace=True)
test_df["Assignments_Avg"].fillna(test_df["Assignments_Avg"].median(), inplace=True)
test_df["Parent_Education_Level"].fillna("None", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df["Attendance (%)"].fillna(test_df["Attendance (%)"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df["Assignments_Avg"].fillna(test_df["Assignments_Avg"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will

In [60]:
test_ids = test_df["Student_ID"]

In [61]:
train_df.drop(columns=["Student_ID","First_Name", "Last_Name", "Email"], inplace=True)

In [62]:
test_df.drop(columns=["Student_ID", "First_Name", "Last_Name", "Email"], inplace=True)

In [63]:
train_df = pd.get_dummies(train_df, columns=["Gender", "Department", "Extracurricular_Activities", "Internet_Access_at_Home"], drop_first=True)
test_df = pd.get_dummies(test_df, columns=["Gender", "Department", "Extracurricular_Activities", "Internet_Access_at_Home"], drop_first=True)

In [64]:
# referensi chat gpt
from sklearn.preprocessing import OrdinalEncoder

ordinal_cols = ["Parent_Education_Level", "Family_Income_Level"]
ordinal_mappings = [
    ["None", "High School", "Bachelor's", "Master's", "PhD"],  # Urutan pendidikan orang tua
    ["Low", "Medium", "High"]  # Urutan tingkat pendapatan keluarga
]

encoder = OrdinalEncoder(categories=ordinal_mappings)
train_df[ordinal_cols] = encoder.fit_transform(train_df[ordinal_cols])


In [65]:
# Tentukan kolom yang ada di test_df
ordinal_cols_test = ["Parent_Education_Level", "Family_Income_Level"]
ordinal_mappings_test = [
    ["None", "High School", "Bachelor's", "Master's", "PhD"],  # Parent Education Level
    ["Low", "Medium", "High"]  # Family Income Level
]

encoder_test = OrdinalEncoder(categories=ordinal_mappings_test)
test_df[ordinal_cols_test] = encoder_test.fit_transform(test_df[ordinal_cols_test])

In [66]:
X_train = train_df.drop(columns=["Grade"])  # Semua fitur kecuali Grade
y_train = train_df["Grade"]  # Label

X_test = test_df  # Semua fitur dari test_df

In [43]:
# Drop kolom yang tidak diperlukan
drop_columns = ["Grade"]
X = train_df.drop(columns=drop_columns)  
y = train_df["Grade"]

# Split data: 80% train, 20% validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)  


In [67]:
def classification_metrics(prediction, y_test):
  accuracy = accuracy_score(y_test, prediction)
  f1 = f1_score(y_test, prediction, average="macro")
  recall = recall_score(y_test, prediction, average="macro")
  precision = precision_score(y_test, prediction, average="macro")

  print('Accuracy: ' + str(accuracy))
  print('F1 Score: ' + str(f1))
  print('Recall Score: ' + str(recall))
  print('Precision Score: ' + str(precision))


In [46]:
rf_model = RandomForestClassifier(n_estimators=10, random_state=42)

# Train model
rf_model.fit(X_train, y_train)

# Prediksi pada validation set
y_val_pred_rf = rf_model.predict(X_val)

# Evaluasi model
classification_metrics(y_val_pred_rf, y_val)


Accuracy: 0.30793650793650795
F1 Score: 0.30290770403485745
Recall Score: 0.3094641526506281
Precision Score: 0.30734475509174564


In [69]:
print("Decision Tree Hyperparameter Tuning:")
print("-" * 40)

# Define parameter grid for Decision Tree
dt_param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Create GridSearchCV object
dt_grid = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid=dt_param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Fit the grid search
dt_grid.fit(X_train, y_train)

# Print best parameters and score
print(f"Best parameters: {dt_grid.best_params_}")
print(f"Best cross-validation score: {dt_grid.best_score_:.4f}")

# Get best model
best_dt = dt_grid.best_estimator_

# # Evaluate on validation set
# print("\nBest Decision Tree performance on validation set:")
# y_val_pred_best_dt = best_dt.predict(X_val)
# dt_best_metrics = classification_metrics(y_val_pred_best_dt, y_val)


Decision Tree Hyperparameter Tuning:
----------------------------------------
Fitting 5 folds for each of 90 candidates, totalling 450 fits
Best parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best cross-validation score: 0.3181


In [70]:
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Create GridSearchCV object
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Fit the grid search
rf_grid.fit(X_train, y_train)

# Print best parameters and score
print(f"Best parameters: {rf_grid.best_params_}")
print(f"Best cross-validation score: {rf_grid.best_score_:.4f}")

# Get best model
best_rf = rf_grid.best_estimator_

# print("\nBest Random Forest performance on validation set:")
# y_val_pred_best_rf = best_rf.predict(X_val)
# rf_best_metrics = classification_metrics(y_val_pred_best_rf, y_val)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Best cross-validation score: 0.3333


In [101]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder

# 🔹 Load dataset
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

# 🔹 Feature Engineering
train_df["Avg_Academic_Score"] = train_df[["Midterm_Score", "Final_Score", "Assignments_Avg", "Quizzes_Avg", "Projects_Score"]].mean(axis=1)
test_df["Avg_Academic_Score"] = test_df[["Midterm_Score", "Final_Score", "Assignments_Avg", "Quizzes_Avg", "Projects_Score"]].mean(axis=1)

train_df["Effective_Study_Hours"] = train_df["Study_Hours_per_Week"] / (train_df["Stress_Level (1-10)"] + 1)
test_df["Effective_Study_Hours"] = test_df["Study_Hours_per_Week"] / (test_df["Stress_Level (1-10)"] + 1)

train_df["Attendance_Study_Ratio"] = train_df["Attendance (%)"] / (train_df["Study_Hours_per_Week"] + 1)
test_df["Attendance_Study_Ratio"] = test_df["Attendance (%)"] / (test_df["Study_Hours_per_Week"] + 1)

# 🔹 Handle Missing Values (Numerik)
for col in train_df.select_dtypes(include=["int64", "float64"]).columns:
    train_df[col].fillna(train_df[col].mean(), inplace=True)
    test_df[col].fillna(test_df[col].mean(), inplace=True)

# 🔹 Drop ID & Irrelevant Columns
drop_columns = ["Student_ID", "First_Name", "Last_Name", "Email"]
train_df.drop(columns=drop_columns, inplace=True)
test_df.drop(columns=drop_columns, inplace=True)

# 🔹 One-Hot Encoding (Kategori Tanpa Urutan)
categorical_cols = ["Gender", "Department", "Extracurricular_Activities", "Internet_Access_at_Home"]
train_df = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)
test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

# 🔹 Ordinal Encoding (Kategori dengan Urutan)
ordinal_cols = ["Parent_Education_Level", "Family_Income_Level"]
ordinal_mappings = [
    ["None", "High School", "Bachelor's", "Master's", "PhD"],
    ["Low", "Medium", "High"]
]

for col in ordinal_cols:
    train_df[col].fillna("None", inplace=True)
    test_df[col].fillna("None", inplace=True)

encoder = OrdinalEncoder(categories=ordinal_mappings)
train_df[ordinal_cols] = encoder.fit_transform(train_df[ordinal_cols])
test_df[ordinal_cols] = encoder.transform(test_df[ordinal_cols])

# 🔹 Pastikan Semua Kolom Sama di Test
missing_cols = set(train_df.columns) - set(test_df.columns) - {"Grade"}
for col in missing_cols:
    test_df[col] = 0  

# 🔹 Split Data
X = train_df.drop(columns=["Grade"])
y = train_df["Grade"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 🔹 Hyperparameter Tuning untuk Decision Tree
param_grid = {
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy"]
}

dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_dt = grid_search.best_estimator_

# 🔹 Evaluasi Akurasi
val_acc = best_dt.score(X_val, y_val)
print(f"Validation Accuracy: {val_acc:.5f}")

# 🔹 Prediksi Test Set
y_pred_test = best_dt.predict(test_df)

# 🔹 Simpan Output
# submission = pd.DataFrame({"Student_ID": test_df.index, "Grade": y_pred_test})
# submission.to_csv("submission.csv", index=False)
# print("✅ Prediksi berhasil disimpan!")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(train_df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(test_df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

Validation Accuracy: 0.30952


In [102]:
# Pastikan test_df sudah tersedia
X_test = test_df  # Drop fitur yang tidak dibutuhkan

# Melakukan prediksi dengan masing-masing model
# y_pred_best_rf = best_rf.predict(X_test)
y_pred_best_dt = best_dt.predict(X_test)
y_pred_best_rf = rf.predict(X_test)


# Load sample submission untuk format yang benar
submission_format = pd.read_csv("sample_submission.csv")

# Fungsi untuk membuat submission file
def create_submission(filename, student_ids, predictions):
    submission = submission_format.copy()  # Salin template
    submission["Student_ID"] = student_ids
    submission["Grade"] = predictions
    submission.to_csv(filename, index=False)
    print(f"Submission saved: {filename}")

# Simpan ke CSV masing-masing model
create_submission("ScudettoCianoSyam_best_rf2.csv", test_ids, y_pred_best_rf)
create_submission("ScudettoCianoSyam_best_dt.csv", test_ids, y_pred_best_dt)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Attendance_Study_Ratio
- Avg_Academic_Score
- Department_CS
- Department_Engineering
- Department_Mathematics
- ...
Feature names seen at fit time, yet now missing:
- Department
- Extracurricular_Activities
- First_Name
- Gender
- Internet_Access_at_Home
- ...


In [76]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder

# Load dataset
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

# Feature Engineering
train_df["Avg_Academic_Score"] = train_df[["Midterm_Score", "Final_Score", "Assignments_Avg", "Quizzes_Avg", "Projects_Score"]].mean(axis=1)
test_df["Avg_Academic_Score"] = test_df[["Midterm_Score", "Final_Score", "Assignments_Avg", "Quizzes_Avg", "Projects_Score"]].mean(axis=1)

train_df["Effective_Study_Hours"] = train_df["Study_Hours_per_Week"] / (train_df["Stress_Level (1-10)"] + 1)
test_df["Effective_Study_Hours"] = test_df["Study_Hours_per_Week"] / (test_df["Stress_Level (1-10)"] + 1)

train_df["Parent_Education_Level"].fillna("None", inplace=True)
test_df["Parent_Education_Level"].fillna("None", inplace=True)

# Drop unnecessary columns
test_ids = test_df["Student_ID"]
drop_columns = ["Student_ID", "First_Name", "Last_Name", "Email"]
train_df.drop(columns=drop_columns, inplace=True)
test_df.drop(columns=drop_columns, inplace=True)

# Encoding categorical features
categorical_cols = ["Gender", "Department", "Extracurricular_Activities", "Internet_Access_at_Home"]
train_df = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)
test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

ordinal_cols = ["Parent_Education_Level", "Family_Income_Level"]
ordinal_mappings = [
    ["None", "High School", "Bachelor's", "Master's", "PhD"],
    ["Low", "Medium", "High"]
]

encoder = OrdinalEncoder(categories=ordinal_mappings)
train_df[ordinal_cols] = encoder.fit_transform(train_df[ordinal_cols])
test_df[ordinal_cols] = encoder.transform(test_df[ordinal_cols])

# Ensure test_df has same columns as train_df
missing_cols = set(train_df.columns) - set(test_df.columns) - {"Grade"}
for col in missing_cols:
    test_df[col] = 0

# Split train dataset into train and validation set
X = train_df.drop(columns=["Grade"])
y = train_df["Grade"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define hyperparameters for GridSearch
params = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'n_estimators': [50, 100, 200]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42),
                           params, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Train final model with best parameters
best_rf.fit(X_train, y_train)

# Predict on validation set
y_val_pred = best_rf.predict(X_val)
accuracy = (y_val_pred == y_val).mean()
print(f"Validation Accuracy: {accuracy:.4f}")

# Predict on test set
y_test_pred = best_rf.predict(test_df)

# # Save predictions
# submission = pd.DataFrame({"Student_ID": test_df.index, "Grade": y_test_pred})
# submission.to_csv("submission.csv", index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df["Parent_Education_Level"].fillna("None", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df["Parent_Education_Level"].fillna("None", inplace=True)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Validation Accuracy: 0.3619


In [84]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# 🔹 Load Data
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

# 🔹 Feature Engineering
train_df["Avg_Academic_Score"] = train_df[["Midterm_Score", "Final_Score", "Assignments_Avg", "Quizzes_Avg", "Projects_Score"]].mean(axis=1)
test_df["Avg_Academic_Score"] = test_df[["Midterm_Score", "Final_Score", "Assignments_Avg", "Quizzes_Avg", "Projects_Score"]].mean(axis=1)

train_df["Effective_Study_Hours"] = train_df["Study_Hours_per_Week"] / (train_df["Stress_Level (1-10)"] + 1)
test_df["Effective_Study_Hours"] = test_df["Study_Hours_per_Week"] / (test_df["Stress_Level (1-10)"] + 1)

train_df["Attendance_Academic_Score"] = train_df["Attendance (%)"] * train_df["Avg_Academic_Score"]
test_df["Attendance_Academic_Score"] = test_df["Attendance (%)"] * test_df["Avg_Academic_Score"]

# 🔹 Isi missing values untuk numerik dengan mean
for col in train_df.select_dtypes(include=['int64', 'float64']).columns:
    train_df[col].fillna(train_df[col].mean(), inplace=True)
    test_df[col].fillna(test_df[col].mean(), inplace=True)

# 🔹 Isi missing values untuk kategorikal dengan mode (nilai yang paling sering muncul)
# for col in train_df.select_dtypes(include=['object']).columns:
#     train_df[col].fillna(train_df[col].mode()[0], inplace=True)
#     test_df[col].fillna(test_df[col].mode()[0], inplace=True)

train_df["Parent_Education_Level"].fillna("None", inplace=True)
test_df["Parent_Education_Level"].fillna("None", inplace=True)

# 🔹 Drop Unused Columns
drop_columns = ["Student_ID", "First_Name", "Last_Name", "Email"]
train_df.drop(columns=drop_columns, inplace=True)
test_df.drop(columns=drop_columns, inplace=True)

# 🔹 Encoding Categorical Features
ordinal_cols = ["Parent_Education_Level", "Family_Income_Level"]
ordinal_mappings = [
    ["None", "High School", "Bachelor's", "Master's", "PhD"],
    ["Low", "Medium", "High"]
]
encoder = OrdinalEncoder(categories=ordinal_mappings)
train_df[ordinal_cols] = encoder.fit_transform(train_df[ordinal_cols])
test_df[ordinal_cols] = encoder.transform(test_df[ordinal_cols])

categorical_cols = ["Gender", "Department", "Extracurricular_Activities", "Internet_Access_at_Home"]
train_df = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)
test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

# 🔹 Ensure Test Data Matches Train Columns
missing_cols = set(train_df.columns) - set(test_df.columns) - {"Grade"}
for col in missing_cols:
    test_df[col] = 0

# 🔹 Scaling Numeric Features
scaler = StandardScaler()
num_cols = ["Study_Hours_per_Week", "Avg_Academic_Score", "Effective_Study_Hours", "Attendance (%)", "Attendance_Academic_Score"]
train_df[num_cols] = scaler.fit_transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

# 🔹 Define Features & Target
X = train_df.drop(columns=["Grade"])
y = train_df["Grade"]

# 🔹 Handle Imbalanced Data (SMOTE)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 🔹 Train-Test Split (80% Train, 20% Validation)
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# 🔹 Hyperparameter Tuning (Random Forest)
rf = RandomForestClassifier(random_state=42)
param_grid = {
    "n_estimators": [200, 300, 400],
    "max_depth": [10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

grid_search = GridSearchCV(rf, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_

# 🔹 Evaluasi Model pada Validation Set
y_val_pred = best_rf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# 🔹 Prediksi pada Test Set
y_test_pred = best_rf.predict(test_df)

# 🔹 Simpan Hasil Prediksi
# submission = pd.DataFrame({"Student_ID": test_df.index, "Predicted_Grade": y_test_pred})
# submission.to_csv("submission.csv", index=False)
# print("Prediksi test_df disimpan dalam submission.csv")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(train_df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(test_df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

Validation Accuracy: 0.3773


In [88]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder

# 🔹 Load dataset
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

# 🔹 Feature Engineering
train_df["Avg_Academic_Score"] = train_df[["Midterm_Score", "Final_Score", "Assignments_Avg", "Quizzes_Avg", "Projects_Score"]].mean(axis=1)
test_df["Avg_Academic_Score"] = test_df[["Midterm_Score", "Final_Score", "Assignments_Avg", "Quizzes_Avg", "Projects_Score"]].mean(axis=1)

train_df["Effective_Study_Hours"] = train_df["Study_Hours_per_Week"] / (train_df["Stress_Level (1-10)"] + 1)
test_df["Effective_Study_Hours"] = test_df["Study_Hours_per_Week"] / (test_df["Stress_Level (1-10)"] + 1)

train_df["Attendance_Study_Ratio"] = train_df["Attendance (%)"] / (train_df["Study_Hours_per_Week"] + 1)
test_df["Attendance_Study_Ratio"] = test_df["Attendance (%)"] / (test_df["Study_Hours_per_Week"] + 1)

# 🔹 Handle Missing Values (Hanya Numerik)
for col in train_df.select_dtypes(include=['int64', 'float64']).columns:
    train_df[col].fillna(train_df[col].mean(), inplace=True)
    test_df[col].fillna(test_df[col].mean(), inplace=True)

# 🔹 Drop ID & Irrelevant Columns
drop_columns = ["Student_ID", "First_Name", "Last_Name", "Email"]
train_df.drop(columns=drop_columns, inplace=True)
test_df.drop(columns=drop_columns, inplace=True)

# 🔹 One-Hot Encoding (Kategori Tanpa Urutan)
categorical_cols = ["Gender", "Department", "Extracurricular_Activities", "Internet_Access_at_Home"]
train_df = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)
test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

# 🔹 Ordinal Encoding (Kategori dengan Urutan)
ordinal_cols = ["Parent_Education_Level", "Family_Income_Level"]
ordinal_mappings = [
    ["None", "High School", "Bachelor's", "Master's", "PhD"],
    ["Low", "Medium", "High"]
]

# 🔹 **Solusi Error: Isi NaN di Kolom Ordinal Sebelum Encoding**
for col in ordinal_cols:
    train_df[col].fillna("None", inplace=True)
    test_df[col].fillna("None", inplace=True)

encoder = OrdinalEncoder(categories=ordinal_mappings)
train_df[ordinal_cols] = encoder.fit_transform(train_df[ordinal_cols])
test_df[ordinal_cols] = encoder.transform(test_df[ordinal_cols])

# 🔹 Pastikan Semua Kolom Sama di Test
missing_cols = set(train_df.columns) - set(test_df.columns) - {"Grade"}
for col in missing_cols:
    test_df[col] = 0  

# 🔹 Split Data
X = train_df.drop(columns=["Grade"])
y = train_df["Grade"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 🔹 Model Random Forest dengan Hyperparameter Tuning
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_

# 🔹 Evaluasi Akurasi
val_acc = best_rf.score(X_val, y_val)
print(f"Validation Accuracy: {val_acc:.5f}")

# 🔹 Prediksi Test Set
y_pred_test = best_rf.predict(test_df)

# 🔹 Simpan Output
# submission = pd.DataFrame({"Student_ID": test_df.index, "Grade": y_pred_test})
# submission.to_csv("submission.csv", index=False)
# print("✅ Prediksi berhasil disimpan!")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(train_df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(test_df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

Validation Accuracy: 0.35238


In [99]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# 🔹 Load dataset
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

# 🔹 Simpan Student_ID dari test set (agar tidak hilang)
student_id = df_test["Student_ID"]

# 🔹 Handle missing values (menggunakan Mean & Mode)
def fill_null_mean(df, col):
    df[col] = df[col].fillna(df[col].mean())

def fill_null_mode(df, col):
    df[col] = df[col].fillna(df[col].mode()[0])

fill_null_mean(df_train, "Attendance (%)")
fill_null_mean(df_train, "Assignments_Avg")
fill_null_mode(df_train, "Parent_Education_Level")

fill_null_mean(df_test, "Attendance (%)")
fill_null_mean(df_test, "Assignments_Avg")
fill_null_mode(df_test, "Parent_Education_Level")

# 🔹 Drop kolom yang tidak berguna sebelum encoding
df_train.drop(columns=["Email", "Student_ID"], inplace=True)
df_test.drop(columns=["Email", "Student_ID"], inplace=True)

# 🔹 Encode semua kolom kategori kecuali 'Grade'
categorical_columns = df_train.select_dtypes(include=["object"]).columns
categorical_columns = categorical_columns[categorical_columns != "Grade"]  # Hindari mengubah Grade
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])  # Encode di train

    # Transform di test, tapi hindari error jika ada kategori baru
    df_test[col] = df_test[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else np.nan)

    label_encoders[col] = le

# 🔹 Split data menjadi train & validation set
X = df_train.drop(columns=["Grade"])
y = df_train["Grade"]  # Biarkan Grade tetap A/B/C
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 🔹 Model Random Forest dengan Hyperparameter Optimal
rf = RandomForestClassifier(n_estimators=300, max_depth=12, min_samples_split=5, random_state=42)
rf.fit(X_train, y_train)

# 🔹 Evaluasi Model (Akurasi Validation)
val_acc = rf.score(X_val, y_val)
print(f"Validation Accuracy: {val_acc:.5f}")

# 🔹 Prediksi Test Set
y_pred_test = rf.predict(df_test)

# 🔹 Simpan Output ke CSV
# df_submission = pd.DataFrame({"Student_ID": student_id, "Grade": y_pred_test})
# df_submission.to_csv("submission.csv", index=False)
# print("✅ Prediksi berhasil disimpan!")


Validation Accuracy: 0.37143


In [103]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder

# 🔹 Load dataset
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

# 🔹 Simpan Student_ID dari test set
student_id = test_df["Student_ID"].copy()

# 🔹 Feature Engineering
train_df["Avg_Academic_Score"] = train_df[["Midterm_Score", "Final_Score", "Assignments_Avg", "Quizzes_Avg", "Projects_Score"]].mean(axis=1)
test_df["Avg_Academic_Score"] = test_df[["Midterm_Score", "Final_Score", "Assignments_Avg", "Quizzes_Avg", "Projects_Score"]].mean(axis=1)

train_df["Effective_Study_Hours"] = train_df["Study_Hours_per_Week"] / (train_df["Stress_Level (1-10)"] + 1)
test_df["Effective_Study_Hours"] = test_df["Study_Hours_per_Week"] / (test_df["Stress_Level (1-10)"] + 1)

train_df["Attendance_Study_Ratio"] = train_df["Attendance (%)"] / (train_df["Study_Hours_per_Week"] + 1)
test_df["Attendance_Study_Ratio"] = test_df["Attendance (%)"] / (test_df["Study_Hours_per_Week"] + 1)

# 🔹 Handle Missing Values (Numerik dengan Mean)
for col in train_df.select_dtypes(include=['int64', 'float64']).columns:
    train_df[col].fillna(train_df[col].mean(), inplace=True)
    test_df[col].fillna(test_df[col].mean(), inplace=True)

# 🔹 Drop ID & Irrelevant Columns
drop_columns = ["Student_ID", "First_Name", "Last_Name", "Email"]
train_df.drop(columns=drop_columns, inplace=True)
test_df.drop(columns=drop_columns, inplace=True)

# 🔹 One-Hot Encoding (Kategori Tanpa Urutan)
categorical_cols = ["Gender", "Department", "Extracurricular_Activities", "Internet_Access_at_Home"]
train_df = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)
test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

# 🔹 Ordinal Encoding (Kategori dengan Urutan)
ordinal_cols = ["Parent_Education_Level", "Family_Income_Level"]
ordinal_mappings = [
    ["None", "High School", "Bachelor's", "Master's", "PhD"],
    ["Low", "Medium", "High"]
]

# Isi NaN sebelum encoding
for col in ordinal_cols:
    train_df[col].fillna("None", inplace=True)
    test_df[col].fillna("None", inplace=True)

encoder = OrdinalEncoder(categories=ordinal_mappings)
train_df[ordinal_cols] = encoder.fit_transform(train_df[ordinal_cols])
test_df[ordinal_cols] = encoder.transform(test_df[ordinal_cols])

# 🔹 Pastikan Semua Kolom Sama di Test
missing_cols = set(train_df.columns) - set(test_df.columns) - {"Grade"}
for col in missing_cols:
    test_df[col] = 0  

# 🔹 Split Data
X = train_df.drop(columns=["Grade"])
y = train_df["Grade"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 🔹 Model Decision Tree dengan Hyperparameter Tuning
param_grid = {
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

dtc = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dtc, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_dtc = grid_search.best_estimator_

# 🔹 Evaluasi Akurasi
val_acc = best_dtc.score(X_val, y_val)
print(f"Validation Accuracy: {val_acc:.5f}")

# 🔹 Prediksi Test Set
y_pred_test = best_dtc.predict(test_df)

# 🔹 Simpan Output ke CSV
df_submission = pd.DataFrame({"Student_ID": student_id, "Grade": y_pred_test})
df_submission.to_csv("sample_submission.csv", index=False)
print("✅ Prediksi berhasil disimpan!")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(train_df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(test_df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

Validation Accuracy: 0.30952
✅ Prediksi berhasil disimpan!


In [109]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load dataset
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

# Method handle null
def fill_null_mean(df):
    return df.fillna(df.mean())

def fill_null_mode(df):
    return df.fillna(df.mode()[0])

# Handle null for train_data.csv
df_train['Attendance (%)'] = fill_null_mean(df_train['Attendance (%)'])
df_train['Assignments_Avg'] = fill_null_mean(df_train['Assignments_Avg'])
df_train['Parent_Education_Level'] = fill_null_mode(df_train['Parent_Education_Level'])

# Handle null for test_data.csv
df_test['Attendance (%)'] = fill_null_mean(df_test['Attendance (%)'])
df_test['Assignments_Avg'] = fill_null_mean(df_test['Assignments_Avg'])
df_test['Parent_Education_Level'] = fill_null_mode(df_test['Parent_Education_Level'])

# Drop unique columns
df_train.drop(columns=['Email'], inplace=True)
df_test.drop(columns=['Email'], inplace=True)
student_id = df_test['Student_ID']
df_test.drop(columns=['Student_ID'], inplace=True)

# Encode categorical columns
categorical_columns = ['First_Name', 'Last_Name', 'Gender', 'Department', 'Extracurricular_Activities', 'Internet_Access_at_Home', 'Parent_Education_Level', 'Family_Income_Level']
label_encoders = {}
le = LabelEncoder()

for col in categorical_columns:
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.transform(df_test[col])
    label_encoders[col] = le

# Splitting train and test from dataset train_data.csv
X = df_train.drop(columns=['Student_ID', 'Grade'])
y = df_train['Grade']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Standardize numerical columns
numerical_columns = ['Attendance (%)', 'Assignments_Avg']
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])
df_test[numerical_columns] = scaler.transform(df_test[numerical_columns])

# Hyperparameter tuning for Random Forest Classifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rfc = RandomForestClassifier(random_state=42)
clf = RandomizedSearchCV(estimator=rfc, param_distributions=param_grid, n_iter=20, cv=5, random_state=42)
clf.fit(X_train, y_train)
print("Best parameters:", clf.best_params_)

# Train Random Forest Classifier with best parameters
rfc_best = RandomForestClassifier(
    n_estimators=clf.best_params_['n_estimators'],
    max_depth=clf.best_params_['max_depth'],
    min_samples_split=clf.best_params_['min_samples_split'],
    min_samples_leaf=clf.best_params_['min_samples_leaf'],
    random_state=42
)
rfc_best.fit(X_train, y_train)

# Predict test data
prediction_kag = rfc_best.predict(df_test)

# Save results to CSV
df_submission = pd.DataFrame()
df_submission['Student_ID'] = student_id
df_submission['Grade'] = prediction_kag
df_submission.to_csv("submit4.csv", index=False)

print("✅ Submission file saved as sample_submission.csv")


Best parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 20}
✅ Submission file saved as sample_submission.csv


In [110]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# Load dataset
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

# Method handle null
def fill_null_mean(df):
    return df.fillna(df.mean())

def fill_null_median(df):
    return df.fillna(df.median())

def fill_null_mode(df):
    return df.fillna(df.mode()[0])

# Handle null for train_data.csv
df_train['Attendance (%)'] = fill_null_mean(df_train['Attendance (%)'])
df_train['Assignments_Avg'] = fill_null_mean(df_train['Assignments_Avg'])
df_train['Parent_Education_Level'] = fill_null_mode(df_train['Parent_Education_Level'])

# Handle null for test_data.csv
df_test['Attendance (%)'] = fill_null_mean(df_test['Attendance (%)'])
df_test['Assignments_Avg'] = fill_null_mean(df_test['Assignments_Avg'])
df_test['Parent_Education_Level'] = fill_null_mode(df_test['Parent_Education_Level'])

# Drop unique columns
df_train.drop(columns=['Email'], inplace=True)
df_test.drop(columns=['Email'], inplace=True)
student_id = df_test['Student_ID']
df_test.drop(columns=['Student_ID'], inplace=True)

# Encode categorical columns
categorical_columns = ['First_Name', 'Last_Name', 'Gender', 'Department', 'Extracurricular_Activities', 'Internet_Access_at_Home', 'Parent_Education_Level', 'Family_Income_Level']
label_encoders = {}
le = LabelEncoder()

for col in categorical_columns:
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.transform(df_test[col])
    label_encoders[col] = le

# Splitting train and test from dataset train_data.csv
X = df_train.drop(columns=['Student_ID', 'Grade'])
y = df_train['Grade']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Compute class weights to handle imbalance
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
class_weight_dict = {cls: weight for cls, weight in zip(np.unique(y_train), class_weights)}

# Hyperparameter tuning for Decision Tree Classifier
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [5, 10, 20, 50],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': [None, 'sqrt', 'log2']
}

dtc_kag = DecisionTreeClassifier(class_weight=class_weight_dict, random_state=42)
clf_kag = RandomizedSearchCV(estimator=dtc_kag, param_distributions=param_grid, cv=5, n_iter=20, random_state=42)
clf_kag.fit(X_train, y_train)
print("Best parameters:", clf_kag.best_params_)

# Train Decision Tree Classifier with best parameters
dtc_kag1 = DecisionTreeClassifier(
    criterion=clf_kag.best_params_['criterion'],
    max_depth=clf_kag.best_params_['max_depth'],
    min_samples_split=clf_kag.best_params_['min_samples_split'],
    min_samples_leaf=clf_kag.best_params_['min_samples_leaf'],
    max_features=clf_kag.best_params_['max_features'],
    class_weight=class_weight_dict,
    random_state=42
)
dtc_kag1.fit(X_train, y_train)

# Predict test data
prediction_kag = dtc_kag1.predict(df_test)

# Save results to CSV
df_submission = pd.DataFrame()
df_submission['Student_ID'] = student_id
df_submission['Grade'] = prediction_kag
df_submission.to_csv("submit5.csv", index=False)

print("✅ Submission file saved as sample_submission.csv")


Best parameters: {'min_samples_split': 50, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 5, 'criterion': 'gini'}
✅ Submission file saved as sample_submission.csv


In [112]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Load dataset
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

# Handle missing values using mean for numerical and mode for categorical
def fill_null_mean(df, cols):
    for col in cols:
        df[col] = df[col].fillna(df[col].mean())
    return df

def fill_null_mode(df, cols):
    for col in cols:
        df[col] = df[col].fillna(df[col].mode()[0])
    return df

num_cols = ['Attendance (%)', 'Assignments_Avg']
cat_cols = ['Parent_Education_Level']

df_train = fill_null_mean(df_train, num_cols)
df_train = fill_null_mode(df_train, cat_cols)

df_test = fill_null_mean(df_test, num_cols)
df_test = fill_null_mode(df_test, cat_cols)

# Drop unnecessary columns
drop_cols = ['Email', 'Student_ID']
student_id = df_test['Student_ID']
df_train.drop(columns=drop_cols, inplace=True, errors='ignore')
df_test.drop(columns=drop_cols, inplace=True, errors='ignore')

# Encode categorical columns
categorical_columns = ['First_Name', 'Last_Name', 'Gender', 'Department', 'Extracurricular_Activities', 'Internet_Access_at_Home', 'Parent_Education_Level', 'Family_Income_Level']
le = LabelEncoder()
for col in categorical_columns:
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.transform(df_test[col])

# Split dataset
X = df_train.drop(columns=['Grade'])
y = df_train['Grade']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)
clf = RandomizedSearchCV(rf, param_distributions=param_grid, n_iter=10, cv=5, random_state=42)
clf.fit(X_train, y_train)
print("Best parameters:", clf.best_params_)

# Train Random Forest with best parameters
rf_best = RandomForestClassifier(**clf.best_params_, random_state=42)
rf_best.fit(X_train, y_train)

# Predict test data
prediction = rf_best.predict(df_test)

# Save results to CSV
df_submission = pd.DataFrame({'Student_ID': student_id, 'Grade': prediction})
df_submission.to_csv("submit6.csv", index=False)
print("✅ Submission file saved as sample_submission.csv")


Best parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 5}
✅ Submission file saved as sample_submission.csv


In [114]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer

# Load dataset
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

# Handle missing values with SimpleImputer
imputer_mean = SimpleImputer(strategy="mean")
imputer_mode = SimpleImputer(strategy="most_frequent")

numerical_cols = ['Attendance (%)', 'Assignments_Avg']
categorical_cols = ['Parent_Education_Level']

df_train[numerical_cols] = imputer_mean.fit_transform(df_train[numerical_cols])
df_test[numerical_cols] = imputer_mean.transform(df_test[numerical_cols])
df_train[categorical_cols] = imputer_mode.fit_transform(df_train[categorical_cols])
df_test[categorical_cols] = imputer_mode.transform(df_test[categorical_cols])

# Drop unique columns
df_train.drop(columns=['Email'], inplace=True)
df_test.drop(columns=['Email'], inplace=True)
student_id = df_test['Student_ID']
df_test.drop(columns=['Student_ID'], inplace=True)

# Encode categorical columns
categorical_columns = ['First_Name', 'Last_Name', 'Gender', 'Department', 'Extracurricular_Activities', 'Internet_Access_at_Home', 'Parent_Education_Level', 'Family_Income_Level']
label_encoders = {}
le = LabelEncoder()

for col in categorical_columns:
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.transform(df_test[col])
    label_encoders[col] = le

# Feature Selection using Random Forest Importance
X = df_train.drop(columns=['Student_ID', 'Grade'])
y = df_train['Grade']
rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selector.fit(X, y)
feature_importances = pd.Series(rf_selector.feature_importances_, index=X.columns)
selected_features = feature_importances.nlargest(10).index.tolist()
X = X[selected_features]
df_test = df_test[selected_features]

# Standardize numerical features
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
df_test[numerical_cols] = scaler.transform(df_test[numerical_cols])

# Splitting train and test from dataset train_data.csv
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for Decision Tree Classifier
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [5, 10, 20],
    'max_depth': [None, 5, 10]
}

dtc = DecisionTreeClassifier()
clf_dtc = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=5)
clf_dtc.fit(X_train, y_train)
print("Best parameters for Decision Tree:", clf_dtc.best_params_)

dtc_best = DecisionTreeClassifier(**clf_dtc.best_params_, random_state=42)

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [5, 10]
}

rf = RandomForestClassifier(random_state=42)
clf_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5)
clf_rf.fit(X_train, y_train)
print("Best parameters for Random Forest:", clf_rf.best_params_)

rf_best = RandomForestClassifier(**clf_rf.best_params_, random_state=42)

# Voting Classifier
voting_clf = VotingClassifier(estimators=[('dt', dtc_best), ('rf', rf_best)], voting='hard')
voting_clf.fit(X_train, y_train)

# Predict test data
prediction = voting_clf.predict(df_test)

# Save results to CSV
df_submission = pd.DataFrame()
df_submission['Student_ID'] = student_id
df_submission['Grade'] = prediction
df_submission.to_csv("submit7.csv", index=False)

print("✅ Submission file saved as sample_submission.csv")


Best parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 5}
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
✅ Submission file saved as sample_submission.csv


In [119]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Load dataset
df_train = pd.read_csv("df_sample.csv")
df_test = pd.read_csv("test_data.csv")

# Handle missing values
def fill_null_mean(df, column):
    return df[column].fillna(df[column].mean())

def fill_null_mode(df, column):
    return df[column].fillna(df[column].mode()[0])

numerical_cols = ['Attendance (%)', 'Assignments_Avg']
categorical_cols = ['Parent_Education_Level']

for col in numerical_cols:
    df_train[col] = fill_null_mean(df_train, col)
    df_test[col] = fill_null_mean(df_test, col)

for col in categorical_cols:
    df_train[col] = fill_null_mode(df_train, col)
    df_test[col] = fill_null_mode(df_test, col)

# Drop unnecessary columns
df_train.drop(columns=['Email'], inplace=True, errors='ignore')
df_test.drop(columns=['Email'], inplace=True, errors='ignore')

student_id = df_test['Student_ID']
df_test.drop(columns=['Student_ID'], inplace=True, errors='ignore')

# Encode categorical columns
categorical_columns = ['First_Name', 'Last_Name', 'Gender', 'Department', 'Extracurricular_Activities', 'Internet_Access_at_Home', 'Parent_Education_Level', 'Family_Income_Level']
label_encoders = {}
le = LabelEncoder()

for col in categorical_columns:
    if col in df_train.columns:
        df_train[col] = le.fit_transform(df_train[col])
        df_test[col] = le.transform(df_test[col])
        label_encoders[col] = le

# Split dataset
X = df_train.drop(columns=['Student_ID', 'Grade'], errors='ignore')
y = df_train['Grade']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Hyperparameter tuning
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10]
}

dtc = DecisionTreeClassifier(random_state=42)
clf = GridSearchCV(dtc, param_grid, cv=5, scoring='accuracy')
clf.fit(X_train, y_train)

print("Best parameters:", clf.best_params_)

# Train model with best parameters
dtc_best = DecisionTreeClassifier(**clf.best_params_, random_state=42)
dtc_best.fit(X_train, y_train)

# Predict test data
prediction = dtc_best.predict(df_test)

# Save results
df_submission = pd.DataFrame({'Student_ID': student_id, 'Grade': prediction})
df_submission.to_csv("/mnt/data/sample_submission.csv", index=False)

print("✅ Submission file saved as sample_submission.csv")


KeyError: 'Grade'

In [120]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load dataset
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

# Handle missing values
imputer_mean = SimpleImputer(strategy="mean")
imputer_mode = SimpleImputer(strategy="most_frequent")

numerical_cols = ['Attendance (%)', 'Assignments_Avg']
categorical_ordinal = ['Parent_Education_Level']  # Contoh ordinal
categorical_ohe = ['Gender', 'Department', 'Extracurricular_Activities', 'Internet_Access_at_Home', 'Family_Income_Level']

# Impute missing values
df_train[numerical_cols] = imputer_mean.fit_transform(df_train[numerical_cols])
df_test[numerical_cols] = imputer_mean.transform(df_test[numerical_cols])
df_train[categorical_ordinal] = imputer_mode.fit_transform(df_train[categorical_ordinal])
df_test[categorical_ordinal] = imputer_mode.transform(df_test[categorical_ordinal])

# Drop unique columns
df_train.drop(columns=['Email'], inplace=True)
df_test.drop(columns=['Email'], inplace=True)
student_id = df_test['Student_ID']
df_test.drop(columns=['Student_ID'], inplace=True)

define_ordinal_mapping = [['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate']]

define_transformer = ColumnTransformer([
    ('ordinal', OrdinalEncoder(categories=define_ordinal_mapping), categorical_ordinal),
    ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_ohe)
], remainder='passthrough')

# Prepare features & target
X = df_train.drop(columns=['Student_ID', 'Grade'])
y = df_train['Grade']
X_test_final = df_test

# Apply transformation
X_transformed = define_transformer.fit_transform(X)
X_test_transformed = define_transformer.transform(X_test_final)

# Split data
X_train, X_val, y_train, y_val = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Train Decision Tree Classifier
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [5, 10, 20],
    'max_depth': [None, 5, 10]
}

dtc = DecisionTreeClassifier()
clf_dtc = GridSearchCV(dtc, param_grid, cv=5)
clf_dtc.fit(X_train, y_train)
print("Best Decision Tree parameters:", clf_dtc.best_params_)

# Train best Decision Tree model
best_dtc = DecisionTreeClassifier(**clf_dtc.best_params_, random_state=42)
best_dtc.fit(X_train, y_train)

# Predict test data
prediction = best_dtc.predict(X_test_transformed)

# Save results to CSV
df_submission = pd.DataFrame({'Student_ID': student_id, 'Grade': prediction})
df_submission.to_csv("sample_submission.csv", index=False)

print("✅ Submission file saved as sample_submission.csv")

ValueError: Found unknown categories ["Master's", 'PhD', "Bachelor's"] in column 0 during fit

In [125]:
df_train = pd.read_csv("train_data.csv")

for col in df_train.select_dtypes(include=['object']).columns:
    print(f"Unique values in {col}:\n", df_train[col].value_counts(), "\n")

Unique values in Student_ID:
 Student_ID
S1406    1
S3727    1
S5914    1
S2175    1
S1470    1
        ..
S4679    1
S5217    1
S3838    1
S4152    1
S3664    1
Name: count, Length: 1050, dtype: int64 

Unique values in First_Name:
 First_Name
Ahmed    144
John     138
Emma     135
Sara     134
Omar     129
Ali      129
Liam     122
Maria    119
Name: count, dtype: int64 

Unique values in Last_Name:
 Last_Name
Jones       204
Davis       195
Brown       176
Smith       162
Williams    158
Johnson     155
Name: count, dtype: int64 

Unique values in Email:
 Email
student406@university.com     1
student2727@university.com    1
student4914@university.com    1
student1175@university.com    1
student470@university.com     1
                             ..
student3679@university.com    1
student4217@university.com    1
student2838@university.com    1
student3152@university.com    1
student2664@university.com    1
Name: count, Length: 1050, dtype: int64 

Unique values in Gender:
 Gender
Ma

In [126]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# === 1. Load Dataset ===
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

# === 2. Drop Unnecessary Columns ===
df_train.drop(columns=['Student_ID', 'First_Name', 'Last_Name', 'Email'], inplace=True)
df_test.drop(columns=['Student_ID', 'First_Name', 'Last_Name', 'Email'], inplace=True)

# === 3. Handle Missing Values ===
imputer_mean = SimpleImputer(strategy="mean")  # Untuk nilai numerik
imputer_mode = SimpleImputer(strategy="most_frequent")  # Untuk nilai kategorikal

numerical_cols = ['Attendance (%)', 'Assignments_Avg']
categorical_cols = ['Gender', 'Department', 'Extracurricular_Activities', 'Internet_Access_at_Home', 'Family_Income_Level']

df_train[numerical_cols] = imputer_mean.fit_transform(df_train[numerical_cols])
df_test[numerical_cols] = imputer_mean.transform(df_test[numerical_cols])

df_train[categorical_cols] = imputer_mode.fit_transform(df_train[categorical_cols])
df_test[categorical_cols] = imputer_mode.transform(df_test[categorical_cols])

# === 4. Encode Categorical Features ===
edu_order = ["High School", "Bachelor's", "Master's", "PhD"]  # Urutan tingkat pendidikan
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols),  # Standardize numerical data
    ('edu', OrdinalEncoder(categories=[edu_order], handle_unknown="use_encoded_value", unknown_value=-1), ['Parent_Education_Level']),  # Ordinal Encode
    ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_cols)  # One-Hot Encode lainnya
])

# === 5. Prepare Data for Training ===
X = df_train.drop(columns=['Grade'])
y = df_train['Grade']

# === 6. Split Data ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# === 7. Build Decision Tree Model with Pipeline ===
dt_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(random_state=42))
])

# === 8. Hyperparameter Tuning ===
param_grid = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [None, 5, 10, 15],
    'model__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(dt_pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# === 9. Best Model ===
best_dt_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# === 10. Predict Test Set ===
y_pred = best_dt_model.predict(df_test)

# === 11. Save Predictions ===
df_submission = pd.DataFrame()
df_submission['Student_ID'] = pd.read_csv("test_data.csv")['Student_ID']  # Ambil Student_ID kembali
df_submission['Grade'] = y_pred
df_submission.to_csv("submission_decision_tree.csv", index=False)

print("✅ Submission file saved as submission_decision_tree.csv")


Best parameters: {'model__criterion': 'entropy', 'model__max_depth': 10, 'model__min_samples_split': 5}
✅ Submission file saved as submission_decision_tree.csv


In [129]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Load data
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

# Function to handle missing values
def fill_null_mean(df, col):
    df[col] = df[col].fillna(df[col].mean())

def fill_null_mode(df, col):
    df[col] = df[col].fillna(df[col].mode()[0])

# Handling missing values in training & testing data
fill_null_mean(df_train, 'Attendance (%)')
fill_null_mean(df_train, 'Assignments_Avg')
fill_null_mode(df_train, 'Parent_Education_Level')

fill_null_mean(df_test, 'Attendance (%)')
fill_null_mean(df_test, 'Assignments_Avg')
fill_null_mode(df_test, 'Parent_Education_Level')

# Drop unique identifier columns
df_train.drop(columns=['Email', 'Student_ID'], inplace=True)
student_id = df_test['Student_ID']
df_test.drop(columns=['Email', 'Student_ID'], inplace=True)

# Encode categorical variables
categorical_columns = ['First_Name', 'Last_Name', 'Gender', 'Department', 
                       'Extracurricular_Activities', 'Internet_Access_at_Home', 
                       'Parent_Education_Level', 'Family_Income_Level']

label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.transform(df_test[col])
    label_encoders[col] = le

# Splitting dataset
X = df_train.drop(columns=['Grade'])
y = df_train['Grade']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Hyperparameter tuning for Decision Tree
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [10, 20, 50],
    'max_depth': [None, 5, 10]
}

clf = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
clf.fit(X_train, y_train)

# Best model
best_params = clf.best_params_
dtc = DecisionTreeClassifier(**best_params, random_state=42)
dtc.fit(X_train, y_train)

# Predict on test data
predictions = dtc.predict(df_test)

# Save submission file
df_submission = pd.DataFrame({'Student_ID': student_id, 'Grade': predictions})
df_submission.to_csv("submit888.csv", index=False)


In [131]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

# Load data
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

# Save student IDs for submission
student_id = df_test['Student_ID']

# Function to preprocess data
def preprocess_data(df):
    # Create a copy to avoid modifying the original dataframe
    df_processed = df.copy()
    
    # Handle missing values in numerical columns with median
    for col in df_processed.select_dtypes(include=['float64', 'int64']).columns:
        if df_processed[col].isnull().sum() > 0:
            df_processed[col] = df_processed[col].fillna(df_processed[col].median())
    
    # For categorical columns, fill with most frequent value
    for col in df_processed.select_dtypes(include=['object']).columns:
        if col != 'Student_ID' and df_processed[col].isnull().sum() > 0:
            df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0])
    
    return df_processed

# Preprocess the data
df_train_processed = preprocess_data(df_train)
df_test_processed = preprocess_data(df_test)

# Identify target variable
target_column = 'Grade'  # Target is now the Grade column
y_train = df_train_processed[target_column]

# Define features (excluding Student_ID, First_Name, Last_Name, Email and the target)
exclude_cols = ['Student_ID', 'First_Name', 'Last_Name', 'Email', target_column]
feature_cols = [col for col in df_train_processed.columns if col not in exclude_cols]

X_train = df_train_processed[feature_cols]
X_test = df_test_processed[feature_cols]

# Identify numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# Create preprocessing pipelines
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', pd.get_dummies)
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create a pipeline with preprocessing and the decision tree model
# Using a classifier instead of a regressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(random_state=42))
])

# Simple hyperparameter tuning - keeping the model simple as requested
param_grid = {
    'model__max_depth': [3, 5, 7],
    'model__min_samples_split': [5, 10]
}

# Find the best parameters
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions
prediction = best_model.predict(X_test)

# Print the best parameters
print("Best parameters:", grid_search.best_params_)

# Save results to CSV
df_submission = pd.DataFrame()
df_submission['Student_ID'] = student_id
df_submission['Grade'] = prediction
df_submission.to_csv("submit7.csv", index=False)
print("✅ Submission file saved as submit99.csv")

ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/envs/main-ds/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/envs/main-ds/lib/python3.9/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/miniconda3/envs/main-ds/lib/python3.9/site-packages/sklearn/pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "/opt/miniconda3/envs/main-ds/lib/python3.9/site-packages/sklearn/pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/opt/miniconda3/envs/main-ds/lib/python3.9/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "/opt/miniconda3/envs/main-ds/lib/python3.9/site-packages/sklearn/pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/opt/miniconda3/envs/main-ds/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/opt/miniconda3/envs/main-ds/lib/python3.9/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/miniconda3/envs/main-ds/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 990, in fit_transform
    self._validate_transformers()
  File "/opt/miniconda3/envs/main-ds/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 530, in _validate_transformers
    raise TypeError(
TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. 'Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', <function get_dummies at 0x12e9b9e50>)])' (type <class 'sklearn.pipeline.Pipeline'>) doesn't.


In [133]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

# Load data
print("Loading data...")
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

# Save student IDs for submission
student_id = df_test['Student_ID']

# Function to preprocess data
def preprocess_data(df):
    # Create a copy to avoid modifying the original dataframe
    df_processed = df.copy()
    
    # Handle missing values in numerical columns with median
    for col in df_processed.select_dtypes(include=['float64', 'int64']).columns:
        if df_processed[col].isnull().sum() > 0:
            df_processed[col] = df_processed[col].fillna(df_processed[col].median())
    
    # For categorical columns, fill with most frequent value
    for col in df_processed.select_dtypes(include=['object']).columns:
        if col != 'Student_ID' and col != 'Email' and df_processed[col].isnull().sum() > 0:
            df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0])
    
    return df_processed

# Preprocess the data
print("Preprocessing data...")
df_train_processed = preprocess_data(df_train)
df_test_processed = preprocess_data(df_test)

# Identify target variable
target_column = 'Grade'
y_train = df_train_processed[target_column]

# Define features (excluding Student_ID, First_Name, Last_Name, Email and the target)
exclude_cols = ['Student_ID', 'First_Name', 'Last_Name', 'Email', target_column]
feature_cols = [col for col in df_train_processed.columns if col not in exclude_cols]

X_train = df_train_processed[feature_cols]
X_test = df_test_processed[feature_cols]

# Identify numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

print(f"Using {len(numerical_cols)} numerical features and {len(categorical_cols)} categorical features")

# Create preprocessing pipelines
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create a simple decision tree model - keeping it shallow to avoid overfitting
# Based on your comment about complex models performing worse
model = DecisionTreeClassifier(
    max_depth=3,              # Shallow tree to avoid overfitting
    min_samples_split=10,     # Require more samples to split to prevent overfitting
    min_samples_leaf=5,       # Don't create tiny leaf nodes
    random_state=42           # For reproducibility
)

# Create a pipeline with preprocessing and the decision tree model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Fit the model directly without grid search to keep it simple
print("Training the model...")
pipeline.fit(X_train, y_train)

# Make predictions
print("Making predictions...")
prediction = pipeline.predict(X_test)

# Save results to CSV
df_submission = pd.DataFrame()
df_submission['Student_ID'] = student_id
df_submission['Grade'] = prediction
df_submission.to_csv("submit10.csv", index=False)
print("✅ Submission file saved as submit7.csv")

# Optional: Print model characteristics
from sklearn.tree import export_text
tree_rules = export_text(pipeline.named_steps['model'], 
                        feature_names=pipeline.named_steps['preprocessor'].get_feature_names_out().tolist())
print("\nDecision Tree Rules (limited to depth 3):")
print(tree_rules[:1000])  # Print part of the tree to avoid overwhelming output

Loading data...
Preprocessing data...
Using 12 numerical features and 6 categorical features
Training the model...
Making predictions...
✅ Submission file saved as submit7.csv

Decision Tree Rules (limited to depth 3):
|--- num__Attendance (%) <= 0.85
|   |--- num__Attendance (%) <= -0.93
|   |   |--- num__Total_Score <= 0.56
|   |   |   |--- class: D
|   |   |--- num__Total_Score >  0.56
|   |   |   |--- class: C
|   |--- num__Attendance (%) >  -0.93
|   |   |--- cat__Parent_Education_Level_PhD <= 0.50
|   |   |   |--- class: B
|   |   |--- cat__Parent_Education_Level_PhD >  0.50
|   |   |   |--- class: C
|--- num__Attendance (%) >  0.85
|   |--- num__Total_Score <= 1.17
|   |   |--- num__Stress_Level (1-10) <= -1.05
|   |   |   |--- class: A
|   |   |--- num__Stress_Level (1-10) >  -1.05
|   |   |   |--- class: A
|   |--- num__Total_Score >  1.17
|   |   |--- num__Assignments_Avg <= 0.54
|   |   |   |--- class: B
|   |   |--- num__Assignments_Avg >  0.54
|   |   |   |--- class: A



In [135]:
pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading SQLAlchemy-2.0.38-cp39-cp39-macosx_11_0_arm64.whl.metadata (9.6 kB)
Collecting tqdm (from optuna)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
Downloading alembic-1.14.1-py3-none-any.whl (233 kB)
Downloading SQLAlchemy-2.0.38-cp39-cp39-macosx_11_0_arm64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading tqdm-4.67.1-py3-none

In [137]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load dataset
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

# Handle missing values
def fill_null_mode(df):
    mode = df.mode().iloc[0]
    return df.fillna(mode)

df_train['Attendance (%)'] = df_train['Attendance (%)'].fillna(df_train['Attendance (%)'].mean())
df_train['Assignments_Avg'] = df_train['Assignments_Avg'].fillna(df_train['Assignments_Avg'].mean())
df_train['Parent_Education_Level'] = fill_null_mode(df_train['Parent_Education_Level'])

df_test['Attendance (%)'] = df_test['Attendance (%)'].fillna(df_test['Attendance (%)'].mean())
df_test['Assignments_Avg'] = df_test['Assignments_Avg'].fillna(df_test['Assignments_Avg'].mean())
df_test['Parent_Education_Level'] = fill_null_mode(df_test['Parent_Education_Level'])

# Drop unnecessary columns
df_train.drop(columns=['Email', 'Student_ID'], inplace=True)
student_id = df_test['Student_ID']
df_test.drop(columns=['Email', 'Student_ID'], inplace=True)

# Encode categorical features
categorical_columns = ['First_Name', 'Last_Name', 'Gender', 'Department', 'Extracurricular_Activities', 'Internet_Access_at_Home', 'Parent_Education_Level', 'Family_Income_Level']
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.transform(df_test[col])
    label_encoders[col] = le

# Splitting data
X = df_train.drop(columns=['Grade'])
y = df_train['Grade']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define Optuna objective function
def objective(trial):
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss'])
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 100)
    
    model = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42
    )
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    
    return accuracy

# Run Optuna optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Get best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train final model with best parameters
final_model = DecisionTreeClassifier(**best_params, random_state=42)
final_model.fit(X, y)

# Predict on test data
prediction = final_model.predict(df_test)

# Save results
df_submission = pd.DataFrame({'Student_ID': student_id, 'Grade': prediction})
df_submission.to_csv("submitlast.csv", index=False)


[I 2025-03-02 21:57:58,862] A new study created in memory with name: no-name-f6aadd10-d8b1-4315-838c-6b6c9b15995d
[I 2025-03-02 21:57:58,874] Trial 0 finished with value: 0.3 and parameters: {'criterion': 'log_loss', 'max_depth': 10, 'min_samples_split': 17}. Best is trial 0 with value: 0.3.
[I 2025-03-02 21:57:58,882] Trial 1 finished with value: 0.3238095238095238 and parameters: {'criterion': 'entropy', 'max_depth': 8, 'min_samples_split': 38}. Best is trial 1 with value: 0.3238095238095238.
[I 2025-03-02 21:57:58,890] Trial 2 finished with value: 0.37142857142857144 and parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 35}. Best is trial 2 with value: 0.37142857142857144.
[I 2025-03-02 21:57:58,899] Trial 3 finished with value: 0.32857142857142857 and parameters: {'criterion': 'log_loss', 'max_depth': 13, 'min_samples_split': 47}. Best is trial 2 with value: 0.37142857142857144.
[I 2025-03-02 21:57:58,905] Trial 4 finished with value: 0.319047619047619 and p

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 17}


In [139]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

# Handle missing values
def fill_null_mode(df):
    mode = df.mode().iloc[0]
    return df.fillna(mode)

df_train['Attendance (%)'] = df_train['Attendance (%)'].fillna(df_train['Attendance (%)'].mean())
df_train['Assignments_Avg'] = df_train['Assignments_Avg'].fillna(df_train['Assignments_Avg'].mean())
df_train['Parent_Education_Level'] = fill_null_mode(df_train['Parent_Education_Level'])

df_test['Attendance (%)'] = df_test['Attendance (%)'].fillna(df_test['Attendance (%)'].mean())
df_test['Assignments_Avg'] = df_test['Assignments_Avg'].fillna(df_test['Assignments_Avg'].mean())
df_test['Parent_Education_Level'] = fill_null_mode(df_test['Parent_Education_Level'])

# Drop unnecessary columns
df_train.drop(columns=['First_Name', 'Last_Name', 'Email', 'Student_ID'], inplace=True)
student_id = df_test['Student_ID']
df_test.drop(columns=['First_Name', 'Last_Name', 'Email', 'Student_ID'], inplace=True)

# Encode categorical features
categorical_columns = ['Gender', 'Department', 'Extracurricular_Activities', 'Internet_Access_at_Home', 'Parent_Education_Level', 'Family_Income_Level']
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.transform(df_test[col])
    label_encoders[col] = le

# Splitting data
X = df_train.drop(columns=['Grade'])
y = df_train['Grade']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define Optuna objective function
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        random_state=42
    )
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    
    return accuracy

# Run Optuna optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Get best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train final model with best parameters
final_model = RandomForestClassifier(**best_params, random_state=42)
final_model.fit(X, y)

# Predict on test data
prediction = final_model.predict(df_test)

# Save results
df_submission = pd.DataFrame({'Student_ID': student_id, 'Grade': prediction})
df_submission.to_csv("sample_submission.csv", index=False)


[I 2025-03-02 22:06:35,215] A new study created in memory with name: no-name-5f4c36f2-ef8a-4694-a53c-9064329774ea
[I 2025-03-02 22:06:35,614] Trial 0 finished with value: 0.35714285714285715 and parameters: {'n_estimators': 252, 'max_depth': 48, 'min_samples_split': 4, 'min_samples_leaf': 4, 'bootstrap': False}. Best is trial 0 with value: 0.35714285714285715.
[I 2025-03-02 22:06:35,772] Trial 1 finished with value: 0.3476190476190476 and parameters: {'n_estimators': 156, 'max_depth': 25, 'min_samples_split': 17, 'min_samples_leaf': 5, 'bootstrap': True}. Best is trial 0 with value: 0.35714285714285715.
[I 2025-03-02 22:06:36,069] Trial 2 finished with value: 0.3333333333333333 and parameters: {'n_estimators': 232, 'max_depth': 26, 'min_samples_split': 3, 'min_samples_leaf': 1, 'bootstrap': True}. Best is trial 0 with value: 0.35714285714285715.
[I 2025-03-02 22:06:36,350] Trial 3 finished with value: 0.3952380952380952 and parameters: {'n_estimators': 191, 'max_depth': 43, 'min_sample

Best Hyperparameters: {'n_estimators': 200, 'max_depth': 21, 'min_samples_split': 12, 'min_samples_leaf': 9, 'bootstrap': False}
