In [12]:
import sys, os

# Make the project-local `src` package importable from this notebook.
# The project root is taken to be the parent of the current working directory,
# so this assumes the kernel runs from a first-level subfolder such as /notebooks.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the insert: re-running this cell must not keep stacking duplicate
# entries onto sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.io_utils import load_df, save_df
from src.config import COLS, ENG, SELECTED_FEATURES, SEED
from src.features import add_engineered_features
from src.preprocessing import encode_target, select_features
import pandas as pd
from sklearn.model_selection import train_test_split

def _in_project(relative_path):
    """Return `relative_path` joined onto PROJECT_ROOT."""
    return os.path.join(PROJECT_ROOT, relative_path)


# Input: the raw dataset.
RAW_PATH = _in_project("data/raw/graduation_dataset.csv")

# Outputs: the selected-feature table and the train/test split files.
MODELING_PATH = _in_project("data/processed/modeling.csv")
X_TRAIN_PATH = _in_project("data/processed/X_train.csv")
Y_TRAIN_PATH = _in_project("data/processed/y_train.csv")
X_TEST_PATH = _in_project("data/processed/X_test.csv")
Y_TEST_PATH = _in_project("data/processed/y_test.csv")



In [13]:
# Read the raw file and feed it straight into encode_target, which returns the
# frame plus a second object kept as `label_encoder`
# (presumably the fitted target encoder; confirm in src.preprocessing).
df, label_encoder = encode_target(load_df(RAW_PATH))

# Preview the first rows of the encoded frame.
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,0
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,2
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,0
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,2
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,2


In [14]:
df = add_engineered_features(df)

# Sanity peek: summary statistics for four of the engineered columns,
# looked up by their keys in the ENG config mapping.
engineered_cols = [
    ENG[key]
    for key in ("AVG_GRADE", "TOTAL_APPROVED", "TOTAL_ENROLLED", "APPROVAL_RATE")
]
df[engineered_cols].describe()


Unnamed: 0,average_grade,total_approved_units,total_enrolled_units,overall_approval_rate
count,4424.0,4424.0,4424.0,4424.0
mean,10.435514,9.142405,12.502712,0.679076
std,4.81853,5.960612,4.608821,0.361945
min,0.0,0.0,0.0,0.0
25%,11.0,5.0,10.0,0.5
50%,12.25,10.0,12.0,0.833333
75%,13.266667,12.0,14.0,1.0
max,18.283654,43.0,46.0,1.0


In [15]:
df_model = select_features(df)

# Fail fast if any cell of the modeling frame is missing.
has_missing = df_model.isna().any().any()
assert not has_missing, "NaNs found after processing"

df_model.head()

Unnamed: 0,average_grade,total_approved_units,overall_approval_rate,Tuition fees up to date,Debtor,Scholarship holder,Age at enrollment,Gender,Marital status,Unemployment rate,Inflation rate,GDP,International,Target
0,0.0,0.0,0.0,1,0,0,20,1,1,10.8,1.4,1.74,0,0
1,13.833333,12.0,1.0,0,0,0,19,1,1,13.9,-0.3,0.79,0,2
2,0.0,0.0,0.0,0,0,0,19,1,1,10.8,1.4,1.74,0,0
3,12.914286,11.0,0.916667,1,0,0,20,0,1,9.4,-0.8,-3.12,0,2
4,12.666667,11.0,0.916667,1,0,0,45,0,2,13.9,-0.3,0.79,0,2


In [16]:
# Hand the modeling frame to save_df with MODELING_PATH as the destination,
# then echo that destination so the output shows where the file went.
save_df(df_model, MODELING_PATH)
print("Saved:", MODELING_PATH)

Saved: /home/glinux/Projects/Skole/AnvendtData/data/processed/modeling.csv


In [17]:
# Separate the predictors from the label column named in the shared config.
target_col = COLS["TARGET"]
X = df_model.drop(columns=target_col)
y = df_model[target_col]

# 80/20 hold-out split, stratified on the label so both parts keep the same
# class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

# Cheap invariants: every row ends up in exactly one part, and features and
# labels stay aligned within each part.
assert len(X_train) + len(X_test) == len(X), "Split lost or duplicated rows"
assert X_train.index.equals(y_train.index), "Train features/labels misaligned"
assert X_test.index.equals(y_test.index), "Test features/labels misaligned"

# Persist each part. The label frames take their column name from the config
# key used above rather than a separately hard-coded literal.
save_df(X_train, X_TRAIN_PATH)
save_df(y_train.to_frame(target_col), Y_TRAIN_PATH)
save_df(X_test, X_TEST_PATH)
save_df(y_test.to_frame(target_col), Y_TEST_PATH)

print("Saved splits:")
print(X_TRAIN_PATH, Y_TRAIN_PATH, X_TEST_PATH, Y_TEST_PATH)


Saved splits:
/home/glinux/Projects/Skole/AnvendtData/data/processed/X_train.csv /home/glinux/Projects/Skole/AnvendtData/data/processed/y_train.csv /home/glinux/Projects/Skole/AnvendtData/data/processed/X_test.csv /home/glinux/Projects/Skole/AnvendtData/data/processed/y_test.csv
