<a href="https://colab.research.google.com/github/ToobaObeidy-1/DA_bootcamp_python/blob/main/Student_Performance_MLmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Student Performance Analysis and Risk Detection: Machine Learning Implementation


# Introduction

Dataset Overview:
 The dataset contains student's performance records, their socio-economic status and other variables which may help analyze the student progress and detect risk in underperforming students.

Objective:
To predict student performance metrics (e.g., Final GPA, Pass/Fail status) and risk indicators (e.g.,
Engagement level, Drop-out likelihood)



#Import required libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report
from sklearn.preprocessing import OneHotEncoder,LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#Load data

In [6]:
data = pd.read_csv("/content/Sample_real_time_performance_dataset (1).csv")
df = pd.DataFrame(data)
df

Unnamed: 0,Student ID,Age,Gender,Socioeconomic Status,Attendance Percentage,Previous Academic Records (GPA),Participation in Class Activities,Class Assignments Score,Login Frequency,Submissions,...,Access to Resources,Time Spent Studying Outside Class (mins),Part-Time Job Status,Classroom Environment Satisfaction,Group Learning Sessions,Proximity to Institute (mins),Final GPA,Pass/Fail Status,Engagement Level,Dropout Likelihood
0,S001,18,Male,Low,60,2.0,Inactive,50,5,Early,...,Adequate,50,Yes,Satisfied,Frequent,10,2.0,Fail,High,Likely
1,S002,19,Female,Middle,61,2.1,Active,51,6,On-time,...,Inadequate,51,No,Neutral,Rare,11,2.1,Pass,Low,Unlikely
2,S003,20,Male,High,62,2.2,Active,52,7,Late,...,Adequate,52,No,Satisfied,Never,12,2.2,Pass,Low,Unlikely
3,S004,21,Female,Low,63,2.3,Active,53,8,No Submissions,...,Inadequate,53,Yes,Unsatisfied,Never,13,2.3,Pass,High,Unlikely
4,S005,22,Male,Middle,64,2.4,Inactive,54,9,Early,...,Adequate,54,No,Satisfied,Frequent,14,2.4,Pass,Low,Unlikely
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,S1396,21,Female,High,83,2.3,Active,73,13,No Submissions,...,Inadequate,73,No,Unsatisfied,Never,33,2.3,Pass,Low,Unlikely
496,S1397,19,Female,Middle,91,3.1,Active,81,6,No Submissions,...,Inadequate,81,No,Unsatisfied,Never,11,3.1,Pass,Low,Unlikely
497,S1398,19,Male,Middle,66,2.6,Active,96,6,Late,...,Adequate,96,No,Satisfied,Never,26,2.6,Pass,Low,Unlikely
498,S1399,18,Female,Middle,65,2.5,Active,85,15,On-time,...,Inadequate,135,No,Neutral,Rare,35,2.5,Pass,Low,Likely


#Data Preprocessing

##Handle missing values

In [7]:
print(df.isnull().sum())

Student ID                                  0
Age                                         0
Gender                                      0
Socioeconomic Status                        0
Attendance Percentage                       0
Previous Academic Records (GPA)             0
Participation in Class Activities           0
Class Assignments Score                     0
Login Frequency                             0
Submissions                                 0
Motivational Survey Scores                  0
Stress Levels                               0
Access to Resources                         0
Time Spent Studying Outside Class (mins)    0
Part-Time Job Status                        0
Classroom Environment Satisfaction          0
Group Learning Sessions                     0
Proximity to Institute (mins)               0
Final GPA                                   0
Pass/Fail Status                            0
Engagement Level                            0
Dropout Likelihood                

##Regression Analysis

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error

# Define features and target
numerical_features = ['Age', 'Attendance Percentage', 'Previous Academic Records (GPA)',
                      'Class Assignments Score', 'Login Frequency',
                      'Time Spent Studying Outside Class (mins)','Proximity to Institute (mins)']
categorical_features = ['Gender', 'Socioeconomic Status', 'Participation in Class Activities',
                        'Submissions', 'Motivational Survey Scores', 'Stress Levels',
                        'Access to Resources', 'Part-Time Job Status',
                        'Classroom Environment Satisfaction',
                        'Group Learning Sessions']
target_regression = 'Final GPA'

# Preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(), categorical_features)
])

# Regression pipeline
regressor_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Split data
X = data[categorical_features + numerical_features]
y_reg = data[target_regression]

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.2, random_state=42)

# Fit and predict
regressor_pipeline.fit(X_train_reg, y_train_reg)
y_pred_reg = regressor_pipeline.predict(X_test_reg)

# Evaluate
print(f"Regression MAE: {mean_absolute_error(y_test_reg, y_pred_reg)}")


Regression MAE: 0.00012000000000360167


### Cross-Validation


In [10]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(regressor_pipeline, X, y_reg, cv=5, scoring='neg_mean_absolute_error')
print(f"Cross-Validation MAE: {-cv_scores.mean()}")


Cross-Validation MAE: 1.000000000342638e-05


##Random Forest Classifier

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define classification targets
classification_targets = ['Pass/Fail Status', 'Engagement Level', 'Dropout Likelihood']

# Classification pipeline
classifier_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Split data for classification
y_class = data[classification_targets]
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X, y_class, test_size=0.2, random_state=42)

# Train and evaluate classifiers for each target
for target in classification_targets:
    classifier_pipeline.fit(X_train_class, y_train_class[target])
    y_pred_class = classifier_pipeline.predict(X_test_class)
    print(f"Classification Report for {target}:\n{classification_report(y_test_class[target], y_pred_class)}")


Classification Report for Pass/Fail Status:
              precision    recall  f1-score   support

        Fail       1.00      1.00      1.00         5
        Pass       1.00      1.00      1.00        95

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

Classification Report for Engagement Level:
              precision    recall  f1-score   support

        High       1.00      1.00      1.00        34
         Low       1.00      1.00      1.00        66

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

Classification Report for Dropout Likelihood:
              precision    recall  f1-score   support

      Likely       1.00      1.00      1.00        21
    Unlikely       1.00      1.00      1.00        79

    accuracy                           1.00     

### Cross-Validation

In [14]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(classifier_pipeline, X, y_class['Pass/Fail Status'], cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {cv_scores.mean()}")


Cross-Validation Accuracy: 1.0


In [17]:
df = df.drop(columns=df.columns[0])
df.head()

Unnamed: 0,Gender,Socioeconomic Status,Attendance Percentage,Previous Academic Records (GPA),Participation in Class Activities,Class Assignments Score,Login Frequency,Submissions,Motivational Survey Scores,Stress Levels,Access to Resources,Time Spent Studying Outside Class (mins),Part-Time Job Status,Classroom Environment Satisfaction,Group Learning Sessions,Proximity to Institute (mins),Final GPA,Pass/Fail Status,Engagement Level,Dropout Likelihood
0,Male,Low,60,2.0,Inactive,50,5,Early,High,High,Adequate,50,Yes,Satisfied,Frequent,10,2.0,Fail,High,Likely
1,Female,Middle,61,2.1,Active,51,6,On-time,Moderate,Moderate,Inadequate,51,No,Neutral,Rare,11,2.1,Pass,Low,Unlikely
2,Male,High,62,2.2,Active,52,7,Late,Low,Low,Adequate,52,No,Satisfied,Never,12,2.2,Pass,Low,Unlikely
3,Female,Low,63,2.3,Active,53,8,No Submissions,High,Low,Inadequate,53,Yes,Unsatisfied,Never,13,2.3,Pass,High,Unlikely
4,Male,Middle,64,2.4,Inactive,54,9,Early,Moderate,Low,Adequate,54,No,Satisfied,Frequent,14,2.4,Pass,Low,Unlikely


In [21]:
print(data['Pass/Fail Status'].value_counts())
print(data['Engagement Level'].value_counts())
print(data['Dropout Likelihood'].value_counts())


Pass/Fail Status
Pass    475
Fail     25
Name: count, dtype: int64
Engagement Level
Low     361
High    139
Name: count, dtype: int64
Dropout Likelihood
Unlikely    416
Likely       84
Name: count, dtype: int64


#Deployment

##Saving Model using joblib

In [23]:
import re
import joblib

# Define a safe filename generator
def sanitize_filename(name):
    return re.sub(r'[^\w\-_\. ]', '_', name)  # Replace invalid characters with '_'

# Save classification models for each target
for target in classification_targets:
    sanitized_target = sanitize_filename(target)  # Sanitize the target name
    filename = f'classifier_pipeline_{sanitized_target}.pkl'
    joblib.dump(classifier_pipeline, filename)
    print(f"Model saved: {filename}")


Model saved: classifier_pipeline_Pass_Fail Status.pkl
Model saved: classifier_pipeline_Engagement Level.pkl
Model saved: classifier_pipeline_Dropout Likelihood.pkl


In [26]:
# Load regression model
loaded_regressor = joblib.load('regressor_pipeline.pkl')

# Load classification model
loaded_classifier = joblib.load('classifier_pipeline_Pass_Fail Status.pkl')
loaded_classifier = joblib.load('classifier_pipeline_Engagement Level.pkl')
loaded_classifier = joblib.load('classifier_pipeline_Dropout Likelihood.pkl')
