In [18]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import joblib

# 1. Load the dataset
data = pd.read_csv('/content/drive/MyDrive/Student_performance_data _.csv')

# 2. Create the new target variables
# EnrollmentLikelihood: 1 if GPA >= 3.0, else 0
data['EnrollmentLikelihood'] = (data['GPA'] >= 3.0).astype(int)

# NeedForSupport: 1 if GPA < 2.5 or Absences > 5, else 0
data['NeedForSupport'] = ((data['GPA'] < 2.5) | (data['Absences'] > 5)).astype(int)

# 3. Preprocess the data

# Step 1: Handle missing values (imputation strategy)
imputer = SimpleImputer(strategy='mean')  # Impute missing numerical values with the mean
data[['Age', 'StudyTimeWeekly', 'Absences']] = imputer.fit_transform(data[['Age', 'StudyTimeWeekly', 'Absences']])

# Step 2: Encode categorical features using LabelEncoder
categorical_features = ['Gender', 'Ethnicity', 'ParentalEducation', 'Tutoring',
                        'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 'Volunteering']

label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])
    label_encoders[feature] = le  # Store the label encoder for each feature

# Step 3: Feature Scaling (optional)
scaler = StandardScaler()
numerical_features = ['Age', 'StudyTimeWeekly', 'Absences']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# 4. Split the data into training and testing sets (80% for training, 20% for testing)
X = data.drop(columns=['StudentID', 'GPA', 'GradeClass', 'EnrollmentLikelihood', 'NeedForSupport'])  # Features
y = data[['EnrollmentLikelihood', 'NeedForSupport']]  # Multi-output target variable

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Train the RandomForestClassifier model for multi-output classification
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 6. Evaluate the model's performance using accuracy_score and classification_report
y_pred = model.predict(X_test)

# Evaluate performance for EnrollmentLikelihood
print("Enrollment Likelihood Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test['EnrollmentLikelihood'], y_pred[:, 0])}")
print(classification_report(y_test['EnrollmentLikelihood'], y_pred[:, 0]))

# Evaluate performance for NeedForSupport
print("Need for Support Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test['NeedForSupport'], y_pred[:, 1])}")
print(classification_report(y_test['NeedForSupport'], y_pred[:, 1]))

# 7. Optionally, save the trained model to a file
joblib.dump(model, 'student_performance_model.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(scaler, 'scaler.pkl')

print("Model and preprocessing tools have been saved.")

Enrollment Likelihood Model Evaluation:
Accuracy: 0.9331941544885177
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       414
           1       0.79      0.69      0.74        65

    accuracy                           0.93       479
   macro avg       0.87      0.83      0.85       479
weighted avg       0.93      0.93      0.93       479

Need for Support Model Evaluation:
Accuracy: 0.9874739039665971
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        90
           1       1.00      0.98      0.99       389

    accuracy                           0.99       479
   macro avg       0.97      0.99      0.98       479
weighted avg       0.99      0.99      0.99       479

Model and preprocessing tools have been saved.


In [25]:
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

# 1. Load the trained model and preprocessing tools
model = joblib.load('student_performance_model.pkl')
label_encoders = joblib.load('label_encoders.pkl')
scaler = joblib.load('scaler.pkl')

# 2. Sample data for predictions
new_students_data = pd.DataFrame([
    {'Age': 22, 'Gender': 'Male', 'Ethnicity': 'Hispanic', 'ParentalEducation': 'High School',
     'StudyTimeWeekly': 10, 'Absences': 2, 'Tutoring': 'No', 'ParentalSupport': 'High',
     'Extracurricular': 'Yes', 'Sports': 'Yes', 'Music': 'No', 'Volunteering': 'No'},

    {'Age': 19, 'Gender': 'Female', 'Ethnicity': 'White', 'ParentalEducation': "Bachelor's Degree",
     'StudyTimeWeekly': 15, 'Absences': 5, 'Tutoring': 'Yes', 'ParentalSupport': 'Medium',
     'Extracurricular': 'No', 'Sports': 'No', 'Music': 'Yes', 'Volunteering': 'Yes'},

    {'Age': 20, 'Gender': 'Male', 'Ethnicity': 'African American', 'ParentalEducation': 'Some College',
     'StudyTimeWeekly': 8, 'Absences': 7, 'Tutoring': 'No', 'ParentalSupport': 'Low',
     'Extracurricular': 'Yes', 'Sports': 'No', 'Music': 'No', 'Volunteering': 'Yes'},
])

# 3. Preprocess the new student's data similarly to the training data
# 3.1 Handle missing values (if any) - using SimpleImputer
categorical_features = ['Gender', 'Ethnicity', 'ParentalEducation', 'Tutoring',
                        'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 'Volunteering']
numerical_features = ['Age', 'StudyTimeWeekly', 'Absences']

# Create imputer with mean strategy
imputer = SimpleImputer(strategy='mean')
new_students_data[numerical_features] = imputer.fit_transform(new_students_data[numerical_features])

# 3.2 Encode categorical features using LabelEncoder
for feature in categorical_features:
    # Check if the new value is in the label encoder's classes
    # If not, add it to the label encoder's classes and re-fit
    if not all(value in label_encoders[feature].classes_ for value in new_students_data[feature].unique()):
        # Convert existing classes to string type
        label_encoders[feature].classes_ = label_encoders[feature].classes_.astype(str)

        # Get new unique values not present in existing classes using numpy
        new_values = np.setdiff1d(new_students_data[feature].unique(), label_encoders[feature].classes_)

        # Extend the classes_ attribute with new values, maintaining dtype using numpy
        label_encoders[feature].classes_ = np.concatenate([label_encoders[feature].classes_, new_values])

    new_students_data[feature] = label_encoders[feature].transform(new_students_data[feature])

# 3.3 Scale numerical features using StandardScaler
new_students_data[numerical_features] = scaler.transform(new_students_data[numerical_features])

# 4. Make predictions
predictions = model.predict(new_students_data)

# 5. Print the predictions
for i, prediction in enumerate(predictions):
    print(f"Predictions for student {i + 1}:")
    print(f"  Enrollment Likelihood: {prediction[0]}")
    print(f"  Need for Support: {prediction[1]}")

Predictions for student 1:
  Enrollment Likelihood: 1
  Need for Support: 0
Predictions for student 2:
  Enrollment Likelihood: 1
  Need for Support: 0
Predictions for student 3:
  Enrollment Likelihood: 0
  Need for Support: 1
