# Building the risk model joblib artifact

## Preprocess + Create X and y

In [4]:
# preprocess_training_data.py

import pandas as pd
import sys
import os

# Load subject model handlers directly
# Each subject folder is appended to sys.path (as an absolute path resolved
# from the current working directory) immediately before that subject's
# handler module is imported, presumably because the handler files live
# inside those folders. Keep each append/import pair together: the import
# relies on the path entry added just above it.
sys.path.append(os.path.abspath("de_model"))
from de_handler import DEModelHandler
sys.path.append(os.path.abspath("fsd_model"))
from fsd_handler import FSDModelHandler
sys.path.append(os.path.abspath("math3_model"))
from math3_handler import Math3ModelHandler
sys.path.append(os.path.abspath("python_model"))
from python_handler import PythonModelHandler

# Load full dataset
df = pd.read_csv("../dataset/train_dataset.csv")

# Subject-level predictions
# One entry per subject: (handler class, model file, predicted column).
# The order matters: every handler receives the frame as it stands at that
# point, i.e. including the predicted columns added by earlier entries.
subject_predictors = [
    (DEModelHandler, "de_model.joblib", "Predicted DE Theory"),
    (FSDModelHandler, "fsd_model.joblib", "Predicted FSD Theory"),
    (Math3ModelHandler, "math3_model.joblib", "Predicted Math-3 Theory"),
    (PythonModelHandler, "python_model.joblib", "Predicted Python Theory"),
]
for handler_cls, model_file, predicted_col in subject_predictors:
    # The result is indexed by the predicted column name, so it is expected
    # to expose that column; only that column is copied back onto df.
    subject_result = handler_cls().predict_from_model(df, model_file, "df")
    df[predicted_col] = subject_result[predicted_col]

# Create labels (y)
# A student is flagged (label 1) when their percentile within this dataset
# falls by more than 10 points from Sem 2 to Sem 3.
sem3_cols = ["Math-3 Theory", "DE Theory", "FSD Theory", "Python Theory"]
sem2_label_cols = [
    "Math-2 Theory",
    "Data Structures using Java Theory",
    "DBMS Theory",
    "Fundamental of Electronics and Electrical Theory",
    "Java-2 Theory",
]

# NOTE(review): sum(axis=1) / 4 counts a missing Sem 3 mark as 0, whereas the
# Sem 2 mean() below skips missing marks -- confirm the asymmetry is intended.
sem3_total = df[sem3_cols].sum(axis=1)
df["Sem 3 Percentage"] = sem3_total / 4
df["Sem 2 Percentage"] = df[sem2_label_cols].mean(axis=1)

# rank(pct=True) returns a 0-1 fraction; scale it to a 0-100 percentile.
sem2_rank_fraction = df["Sem 2 Percentage"].rank(pct=True)
df["Sem 2 Percentile"] = sem2_rank_fraction * 100
sem3_rank_fraction = df["Sem 3 Percentage"].rank(pct=True)
df["Sem 3 Percentile"] = sem3_rank_fraction * 100

# Positive drop = the student ranked lower in Sem 3 than in Sem 2.
percentile_gap = df["Sem 2 Percentile"] - df["Sem 3 Percentile"]
df["Percentile Drop"] = percentile_gap.round(2)
df["Risk Flag"] = df["Percentile Drop"].gt(10)
y = df["Risk Flag"].astype(int)

# Feature engineering
sem1_cols = [
    "Math-1 Theory",
    "Physics Theory",
    "Java-1 Theory",
    "Software Engineering Theory",
]
sem2_cols = [
    "Math-2 Theory",
    "Data Structures using Java Theory",
    "DBMS Theory",
    "Fundamental of Electronics and Electrical Theory",
    "Java-2 Theory",
]

# Per-semester average theory mark, rounded to two decimals.
for percentage_col, subject_cols in (
    ("Sem 1 Percentage", sem1_cols),
    ("Sem 2 Percentage", sem2_cols),
):
    df[percentage_col] = df[subject_cols].mean(axis=1).round(2)

# Rename the division columns to sections and keep only the first character
# of each value.
section_renames = {"Div-1": "Section-1", "Div-2": "Section-2", "Div-3": "Section-3"}
df = df.rename(columns=section_renames)
for sec in section_renames.values():
    df[sec] = df[sec].str[0]

# Percentile (0-100) of each student within this dataset, per semester.
for percentage_col, percentile_col in (
    ("Sem 1 Percentage", "Sem 1 Percentile"),
    ("Sem 2 Percentage", "Sem 2 Percentile"),
):
    df[percentile_col] = df[percentage_col].rank(pct=True) * 100

# Remove identifier, mentor and roll columns, the actual Sem 3 marks, and the
# columns the label was derived from, so none of them are written out as
# features. errors="ignore" tolerates listed columns that are not present.
columns_to_remove = [
    "Student ID", "Mentor-1", "Mentor-2", "Mentor-3", "Roll-2", "Roll-3",
    "Math-3 Theory", "DE Theory", "DE Practical", "FSD Theory", "FSD Practical",
    "Python Theory", "Python Practical", "Communication Theory", "Law Theory",
    "Sem 3 Percentage", "Sem 3 Percentile", "Percentile Drop", "Risk Flag",
]
df = df.drop(columns=columns_to_remove, errors="ignore")


# Predicted Sem 3 features, built from the subject-level predictions:
# average predicted mark, its percentile within this dataset, and the
# resulting drop relative to the Sem 2 percentile.
predicted_subject_cols = [
    "Predicted Math-3 Theory",
    "Predicted DE Theory",
    "Predicted FSD Theory",
    "Predicted Python Theory",
]
predicted_percentage = df[predicted_subject_cols].mean(axis=1).round(2)
df["Predicted Sem 3 Percentage"] = predicted_percentage
df["Predicted Sem 3 Percentile"] = predicted_percentage.rank(pct=True) * 100

predicted_gap = df["Sem 2 Percentile"] - df["Predicted Sem 3 Percentile"]
df["Predicted Percentile Drop"] = predicted_gap.round(2)

# Save for joblib training
# Both files are written without the index, so features and labels are
# matched purely by row position when they are read back.
for frame, out_path in ((df, "X_for_training.csv"), (y, "y_for_training.csv")):
    frame.to_csv(out_path, index=False)


In [1]:
# cell2_train_risk_model.py

import pandas as pd
import numpy as np
import joblib
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import warnings

# Silence every warning for the rest of the session (library deprecation
# notices included).
warnings.filterwarnings("ignore")

# Load training data
X = pd.read_csv("X_for_training.csv")
# squeeze("columns") collapses the single label column into a Series without
# also collapsing the row axis; a bare squeeze() would turn a one-row file
# into a scalar instead of a length-1 Series.
y = pd.read_csv("y_for_training.csv").squeeze("columns")

# Column separation
# Text columns are one-hot encoded later; int64/float64 columns are scaled.
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# Preprocessing
# Numeric columns are standardised; text columns are one-hot encoded, with
# categories unseen during fit ignored at transform time. Columns in neither
# list are dropped (ColumnTransformer's default remainder).
numeric_scaler = StandardScaler()
category_encoder = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_scaler, num_cols),
        ("cat", category_encoder, cat_cols),
    ]
)

# Feature selector
# A random forest scores the transformed features and at most 25 are kept.
# NOTE(review): with SelectFromModel's default threshold, max_features=25 is
# only an upper bound; fewer features may survive. Confirm that is intended.
importance_estimator = RandomForestClassifier(random_state=42)
feature_selector = SelectFromModel(
    estimator=importance_estimator,
    max_features=25,
)

# Classifier
# Hyperparameters are gathered in one mapping so they can be read at a glance.
xgb_params = {
    "n_estimators": 400,
    "max_depth": 5,
    "learning_rate": 0.03,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "use_label_encoder": False,
    "eval_metric": "logloss",
    "random_state": 42,
}
xgb_model = XGBClassifier(**xgb_params)

# Full pipeline with SMOTE
# Step order: encode/scale -> oversample with SMOTE -> select features ->
# classify.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("feature_selection", feature_selector),
    ("classifier", xgb_model),
]
pipeline = ImbPipeline(steps=pipeline_steps)

# Fit on full data (no hold-out split is made here)
pipeline.fit(X, y)

# Save the model (no logs)
# The fitted pipeline is stored under the "model" key of a dict.
model_artifact = {"model": pipeline}
joblib.dump(model_artifact, "risk_model.joblib")


['risk_model.joblib']