In [None]:
import pickle

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Configuration: everything a reader might want to tune lives here.
DATA_PATH = 'dropout_risk_data.csv'   # input CSV holding the features and the target
MODEL_PATH = "dropout_pipeline.pkl"   # destination of the pickled, fitted pipeline
TARGET = "Dropout_Risk"               # label column
SEED = 42                             # one seed for both the split and the forest

data = pd.read_csv(DATA_PATH)

# Split data
# Every non-target column is kept in X; 20% of the rows are held out for evaluation.
X = data.drop(columns=TARGET)
y = data[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Preprocessing pipeline
# Numeric columns are standardised with StandardScaler.
numeric_features = ["Attendance_Rate", "Grades", "Distance_to_School"]
numeric_transformer = StandardScaler()

# Categorical columns are one-hot encoded; drop="first" omits the first level
# of each feature so the dummy columns are not perfectly collinear.
# NOTE(review): handle_unknown is left at its default, so a category that was
# not present in X_train is expected to raise at transform time - confirm
# this is the desired behaviour for inference.
categorical_features = [
    "Socioeconomic_Status",
    "Parent_Education_Level",
    "School_Resources",
    "Behavioral_Issues",
]
categorical_transformer = OneHotEncoder(drop="first")

# Only the columns listed above reach the classifier: `remainder` is not set,
# and ColumnTransformer's documented default is to drop unlisted columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Full pipeline with classifier
# Bundling preprocessing with the model means the scaler/encoder are fitted on
# the training split only and are saved together with the forest.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=SEED)),
    ]
)

# Train the pipeline
pipeline.fit(X_train, y_train)

# Evaluate the model on the held-out split.
# classification_report returns a multi-line string, so print() is the right
# way to render it.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the pipeline
# Written with the standard-library pickle module so it can be read back with
# pickle.load; this requires `pickle` to be imported before the cell runs.
with open(MODEL_PATH, "wb") as file:
    pickle.dump(pipeline, file)


In [5]:
import pickle

# Read the pickled pipeline back from disk and report what kind of object
# was restored. Only unpickle files you created yourself: pickle.load can
# execute arbitrary code embedded in the file.
with open("dropout_pipeline.pkl", "rb") as pkl_in:
    pipeline = pickle.load(pkl_in)

restored_type = type(pipeline)
print(restored_type)


<class 'sklearn.pipeline.Pipeline'>
