<a href="https://colab.research.google.com/github/Trappyke/Student-Enrollment-Prediction/blob/main/Student_Enrollment_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb
import joblib

def load_and_preprocess_data(filepath):
    """
    Load the student dataset from ``filepath`` and prepare it for modelling.

    Processing steps, in order:
      1. Read the CSV located at ``filepath``.
      2. Drop every row that contains at least one missing value.
      3. Replace the 'socioeconomic_status' column with integer codes
         produced by a LabelEncoder fitted on that column.
      4. Add a binary 'high_gpa' column (1 when GPA >= 3.0, otherwise 0).

    Args:
        filepath: Path of the CSV file to read; forwarded to pandas.read_csv.

    Returns:
        pandas.DataFrame: The cleaned data with the encoded
        'socioeconomic_status' column and the extra 'high_gpa' column.

    Raises:
        FileNotFoundError: If no file exists at ``filepath``.
        KeyError: If the 'socioeconomic_status' or 'GPA' column is missing.
    """
    # Read from the path supplied by the caller rather than a fixed location,
    # so the function works regardless of where the dataset is stored.
    data = pd.read_csv(filepath)

    # Drop rows with missing values so the encoder and the models never see NaN.
    data = data.dropna()

    # Encode the categorical 'socioeconomic_status' column as integers.
    # NOTE: the fitted encoder is local and is not returned, so callers that
    # build new samples by hand must supply already-encoded values.
    label_encoder = LabelEncoder()
    data['socioeconomic_status'] = label_encoder.fit_transform(data['socioeconomic_status'])

    # Feature engineering: flag students whose GPA is at least 3.0.
    data['high_gpa'] = (data['GPA'] >= 3.0).astype(int)

    return data

def train_enrollment_model(X_train, y_train):
    """
    Fit and return the enrollment classifier.

    Builds a RandomForestClassifier with 100 trees and a fixed seed (42),
    fits it on the given training features and labels, and returns the
    fitted estimator.
    """
    forest_settings = {'n_estimators': 100, 'random_state': 42}
    enrollment_classifier = RandomForestClassifier(**forest_settings)
    enrollment_classifier.fit(X_train, y_train)
    return enrollment_classifier

def train_support_model(X_train, y_train):
    """
    Fit and return the graduation-support-needs classifier.

    Builds an XGBoost classifier with 100 boosting rounds, a learning rate
    of 0.1 and a fixed seed (42), fits it on the given training features and
    labels, and returns the fitted estimator.
    """
    booster_settings = {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'random_state': 42,
    }
    support_classifier = xgb.XGBClassifier(**booster_settings)
    support_classifier.fit(X_train, y_train)
    return support_classifier

def evaluate_model(model, X_test, y_test):
    """
    Print accuracy, confusion matrix and classification report for a model.

    Predictions are made once on X_test and compared against y_test.
    Nothing is returned; all results go to standard output.
    """
    predictions = model.predict(X_test)

    print(f"Model Accuracy: {accuracy_score(y_test, predictions):.4f}")

    # Each remaining section is a heading followed by the metric's printed form.
    sections = (
        ("Confusion Matrix:", confusion_matrix),
        ("Classification Report:", classification_report),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, predictions))

def save_model(model, filename):
    """
    Persist a trained model to ``filename`` with joblib and report the path.
    """
    joblib.dump(model, filename)
    confirmation = f"Model saved as {filename}"
    print(confirmation)

def load_model(filename):
    """
    Read a model previously written with joblib from ``filename`` and return it.
    """
    restored_model = joblib.load(filename)
    return restored_model

def main():
    """
    Run the whole pipeline end to end.

    Loads and preprocesses the data, trains and evaluates the enrollment and
    support-needs models on an 80/20 split, saves both models to disk, then
    reloads them and prints predictions for one example student.
    """
    feature_columns = [
        'GPA',
        'attendance_rate',
        'previous_enrollment_in_similar_program',
        'socioeconomic_status',
        'extracurricular_involvement',
        'academic_performance',
        'high_gpa',
    ]

    # Load the dataset and pick out the shared features and the two targets.
    dataset = load_and_preprocess_data('student_data.csv')
    features = dataset[feature_columns]
    enrollment_target = dataset['enrolled_in_program']
    support_target = dataset['support_needs']

    # Both targets are split with the same settings (80/20, seed 42).
    split_options = {'test_size': 0.2, 'random_state': 42}
    enroll_X_train, enroll_X_test, enroll_y_train, enroll_y_test = train_test_split(
        features, enrollment_target, **split_options
    )
    support_X_train, support_X_test, support_y_train, support_y_test = train_test_split(
        features, support_target, **split_options
    )

    # Fit one model per target.
    print("Training Enrollment Model...")
    fitted_enrollment = train_enrollment_model(enroll_X_train, enroll_y_train)

    print("Training Support Needs Model...")
    fitted_support = train_support_model(support_X_train, support_y_train)

    # Report held-out performance for each model.
    print("\nEvaluating Enrollment Model...")
    evaluate_model(fitted_enrollment, enroll_X_test, enroll_y_test)

    print("\nEvaluating Support Needs Model...")
    evaluate_model(fitted_support, support_X_test, support_y_test)

    # Persist both fitted models.
    save_model(fitted_enrollment, 'enrollment_model.pkl')
    save_model(fitted_support, 'support_model.pkl')

    # A single hand-written example student; 'socioeconomic_status' is given
    # as an already-encoded integer.
    example_profile = {
        'GPA': 3.5,
        'attendance_rate': 0.95,
        'previous_enrollment_in_similar_program': 1,
        'socioeconomic_status': 1,
        'extracurricular_involvement': 2,
        'academic_performance': 80,
        'high_gpa': 1,
    }
    new_student = pd.DataFrame(
        {column: [value] for column, value in example_profile.items()}
    )

    # Read the models back from disk before predicting.
    print("\nLoading saved models for prediction...")
    reloaded_enrollment = load_model('enrollment_model.pkl')
    reloaded_support = load_model('support_model.pkl')

    enrollment_prediction = reloaded_enrollment.predict(new_student)
    enrollment_label = 'Enrolled' if enrollment_prediction[0] == 1 else 'Not Enrolled'
    print(f"Predicted Enrollment: {enrollment_label}")

    support_prediction = reloaded_support.predict(new_student)
    support_label = 'Needs Support' if support_prediction[0] == 1 else 'No Support Needed'
    print(f"Predicted Support Needs: {support_label}")

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()


FileNotFoundError: [Errno 2] No such file or directory: 'data/student_data.csv'