<a href="https://colab.research.google.com/github/ToobaObeidy-1/DA_bootcamp_python/blob/main/Copy_of_Student_Performance_MLmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Student Performance Analysis and Risk Detection: Machine Learning Implementation


# Introduction

Dataset Overview:
 The dataset contains student's performance records, their socio-economic status and other variables which may help analyze the student progress and detect risk in underperforming students.

Objective:
To predict student performance metrics (e.g., Final GPA, Pass/Fail status) and risk indicators (e.g.,
Engagement level, Drop-out likelihood)



#Import required libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report
from sklearn.preprocessing import OneHotEncoder,LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#Load data

In [5]:
data = pd.read_csv("/content/Sample_real_time_performance_dataset (1).csv")
df = pd.DataFrame(data)
df

Unnamed: 0,Student ID,Age,Gender,Socioeconomic Status,Attendance Percentage,Previous Academic Records (GPA),Participation in Class Activities,Class Assignments Score,Login Frequency,Submissions,...,Access to Resources,Time Spent Studying Outside Class (mins),Part-Time Job Status,Classroom Environment Satisfaction,Group Learning Sessions,Proximity to Institute (mins),Final GPA,Pass/Fail Status,Engagement Level,Dropout Likelihood
0,S001,18,Male,Low,60,2.0,Inactive,50,5,Early,...,Adequate,50,Yes,Satisfied,Frequent,10,2.0,Fail,High,Likely
1,S002,19,Female,Middle,61,2.1,Active,51,6,On-time,...,Inadequate,51,No,Neutral,Rare,11,2.1,Pass,Low,Unlikely
2,S003,20,Male,High,62,2.2,Active,52,7,Late,...,Adequate,52,No,Satisfied,Never,12,2.2,Pass,Low,Unlikely
3,S004,21,Female,Low,63,2.3,Active,53,8,No Submissions,...,Inadequate,53,Yes,Unsatisfied,Never,13,2.3,Pass,High,Unlikely
4,S005,22,Male,Middle,64,2.4,Inactive,54,9,Early,...,Adequate,54,No,Satisfied,Frequent,14,2.4,Pass,Low,Unlikely
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,S1396,21,Female,High,83,2.3,Active,73,13,No Submissions,...,Inadequate,73,No,Unsatisfied,Never,33,2.3,Pass,Low,Unlikely
496,S1397,19,Female,Middle,91,3.1,Active,81,6,No Submissions,...,Inadequate,81,No,Unsatisfied,Never,11,3.1,Pass,Low,Unlikely
497,S1398,19,Male,Middle,66,2.6,Active,96,6,Late,...,Adequate,96,No,Satisfied,Never,26,2.6,Pass,Low,Unlikely
498,S1399,18,Female,Middle,65,2.5,Active,85,15,On-time,...,Inadequate,135,No,Neutral,Rare,35,2.5,Pass,Low,Likely


#Data Preprocessing

##Handle missing values

In [6]:
print(df.isnull().sum())

Student ID                                  0
Age                                         0
Gender                                      0
Socioeconomic Status                        0
Attendance Percentage                       0
Previous Academic Records (GPA)             0
Participation in Class Activities           0
Class Assignments Score                     0
Login Frequency                             0
Submissions                                 0
Motivational Survey Scores                  0
Stress Levels                               0
Access to Resources                         0
Time Spent Studying Outside Class (mins)    0
Part-Time Job Status                        0
Classroom Environment Satisfaction          0
Group Learning Sessions                     0
Proximity to Institute (mins)               0
Final GPA                                   0
Pass/Fail Status                            0
Engagement Level                            0
Dropout Likelihood                

##Regression Analysis

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error

# Define features and target
numerical_features = ['Age', 'Attendance Percentage', 'Previous Academic Records (GPA)',
                      'Class Assignments Score', 'Login Frequency',
                      'Time Spent Studying Outside Class (mins)','Proximity to Institute (mins)']
categorical_features = ['Gender', 'Socioeconomic Status', 'Participation in Class Activities',
                        'Submissions', 'Motivational Survey Scores', 'Stress Levels',
                        'Access to Resources', 'Part-Time Job Status',
                        'Classroom Environment Satisfaction',
                        'Group Learning Sessions']
target_regression = 'Final GPA'

# Preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(), categorical_features)
])

# Regression pipeline
regressor_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Split data
X = data[categorical_features + numerical_features]
y_reg = data[target_regression]

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.2, random_state=42)

# Fit and predict
regressor_pipeline.fit(X_train_reg, y_train_reg)
y_pred_reg = regressor_pipeline.predict(X_test_reg)

# Evaluate
print(f"Regression MAE: {mean_absolute_error(y_test_reg, y_pred_reg)}")


Regression MAE: 3.000000000359382e-05


### Cross-Validation


In [8]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(regressor_pipeline, X, y_reg, cv=5, scoring='neg_mean_absolute_error')
print(f"Cross-Validation MAE: {-cv_scores.mean()}")


Cross-Validation MAE: 2.2000000003442822e-05


##Random Forest Classifier

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define classification targets
classification_targets = ['Pass/Fail Status', 'Engagement Level', 'Dropout Likelihood']

# Classification pipeline
classifier_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Split data for classification
y_class = data[classification_targets]
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X, y_class, test_size=0.2, random_state=42)

# Train and evaluate classifiers for each target
for target in classification_targets:
    classifier_pipeline.fit(X_train_class, y_train_class[target])
    y_pred_class = classifier_pipeline.predict(X_test_class)
    print(f"Classification Report for {target}:\n{classification_report(y_test_class[target], y_pred_class)}")


Classification Report for Pass/Fail Status:
              precision    recall  f1-score   support

        Fail       1.00      1.00      1.00         5
        Pass       1.00      1.00      1.00        95

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

Classification Report for Engagement Level:
              precision    recall  f1-score   support

        High       1.00      1.00      1.00        34
         Low       1.00      1.00      1.00        66

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

Classification Report for Dropout Likelihood:
              precision    recall  f1-score   support

      Likely       1.00      1.00      1.00        21
    Unlikely       1.00      1.00      1.00        79

    accuracy                           1.00     

### Cross-Validation

In [10]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(classifier_pipeline, X, y_class['Pass/Fail Status'], cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {cv_scores.mean()}")


Cross-Validation Accuracy: 1.0


In [11]:
df = df.drop(columns=df.columns[0])
df.head()

Unnamed: 0,Age,Gender,Socioeconomic Status,Attendance Percentage,Previous Academic Records (GPA),Participation in Class Activities,Class Assignments Score,Login Frequency,Submissions,Motivational Survey Scores,...,Access to Resources,Time Spent Studying Outside Class (mins),Part-Time Job Status,Classroom Environment Satisfaction,Group Learning Sessions,Proximity to Institute (mins),Final GPA,Pass/Fail Status,Engagement Level,Dropout Likelihood
0,18,Male,Low,60,2.0,Inactive,50,5,Early,High,...,Adequate,50,Yes,Satisfied,Frequent,10,2.0,Fail,High,Likely
1,19,Female,Middle,61,2.1,Active,51,6,On-time,Moderate,...,Inadequate,51,No,Neutral,Rare,11,2.1,Pass,Low,Unlikely
2,20,Male,High,62,2.2,Active,52,7,Late,Low,...,Adequate,52,No,Satisfied,Never,12,2.2,Pass,Low,Unlikely
3,21,Female,Low,63,2.3,Active,53,8,No Submissions,High,...,Inadequate,53,Yes,Unsatisfied,Never,13,2.3,Pass,High,Unlikely
4,22,Male,Middle,64,2.4,Inactive,54,9,Early,Moderate,...,Adequate,54,No,Satisfied,Frequent,14,2.4,Pass,Low,Unlikely


In [12]:
print(data['Pass/Fail Status'].value_counts())
print(data['Engagement Level'].value_counts())
print(data['Dropout Likelihood'].value_counts())


Pass/Fail Status
Pass    475
Fail     25
Name: count, dtype: int64
Engagement Level
Low     361
High    139
Name: count, dtype: int64
Dropout Likelihood
Unlikely    416
Likely       84
Name: count, dtype: int64


#Deployment

##Saving Model using joblib

In [15]:
import re
import joblib

joblib.dump(regressor_pipeline, 'regressor_pipeline.pkl')

# Define a safe filename generator
def sanitize_filename(name):
    return re.sub(r'[^\w\-_\. ]', '_', name)  # Replace invalid characters with '_'

# Save classification models for each target
for target in classification_targets:
    sanitized_target = sanitize_filename(target)  # Sanitize the target name
    filename = f'classifier_pipeline_{sanitized_target}.pkl'
    joblib.dump(classifier_pipeline, filename)
    print(f"Model saved: {filename}")


Model saved: classifier_pipeline_Pass_Fail Status.pkl
Model saved: classifier_pipeline_Engagement Level.pkl
Model saved: classifier_pipeline_Dropout Likelihood.pkl


In [16]:
# Load regression model
loaded_regressor = joblib.load('regressor_pipeline.pkl')

# Load classification model
loaded_classifier = joblib.load('classifier_pipeline_Pass_Fail Status.pkl')
loaded_classifier = joblib.load('classifier_pipeline_Engagement Level.pkl')
loaded_classifier = joblib.load('classifier_pipeline_Dropout Likelihood.pkl')


##Deploying with streamlit

In [27]:
!pip install streamlit -q

In [33]:
!wget -q -O - ipv4.icanhazip.com

34.73.26.4


In [34]:
! streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.73.26.4:8501[0m
[0m
your url is: https://proud-things-chew.loca.lt
Regressor Features: ['num__Age' 'num__Attendance Percentage'
 'num__Previous Academic Records (GPA)' 'num__Class Assignments Score'
 'num__Login Frequency' 'num__Time Spent Studying Outside Class (mins)'
 'num__Proximity to Institute (mins)' 'cat__Gender_Female'
 'cat__Gender_Male' 'cat__Socioeconomic Status_High'
 'cat__Socioeconomic Status_Low' 'cat__Socioeconomic Status_Middle'
 'cat__Participation in Class Activities_Active'
 'cat__Participation in Class Activities_Inactive'
 'cat__Submissions_Early' 'cat__Submissions_Late'
 'cat__Submissions_No Submissions' 'cat__Submissions_On-time'
 'cat__Motivational S

In [31]:
import pandas as pd

# Load pipelines
regressor_pipeline = joblib.load('regressor_pipeline.pkl')
classifier_pipeline_Pass_Fail = joblib.load('classifier_pipeline_Pass_Fail Status.pkl')
classifier_pipeline_Engagement = joblib.load('classifier_pipeline_Engagement Level.pkl')
classifier_pipeline_Dropout = joblib.load('classifier_pipeline_Dropout Likelihood.pkl')

# Get expected features for each pipeline
regressor_features = regressor_pipeline.named_steps['preprocessor'].get_feature_names_out()
pass_fail_features = classifier_pipeline_Pass_Fail.named_steps['preprocessor'].get_feature_names_out()
engagement_features = classifier_pipeline_Engagement.named_steps['preprocessor'].get_feature_names_out()
dropout_features = classifier_pipeline_Dropout.named_steps['preprocessor'].get_feature_names_out()

# Print expected features for debugging (optional)
print("Regressor Features:", regressor_features)
print("Pass/Fail Features:", pass_fail_features)
print("Engagement Features:", engagement_features)
print("Dropout Features:", dropout_features)

# User input

st.title("Student Performance Prediction")
st.write("Enter student data:")

# Gather input fields for numerical features
input_data = {
    "Age": st.number_input("Age", min_value=10, max_value=100, value=18),
    "Attendance Percentage": st.number_input("Attendance Percentage", min_value=0.0, max_value=100.0, value=75.0),
    "Previous Academic Records (GPA)": st.number_input("GPA", min_value=0.0, max_value=4.0, value=3.0),
    "Class Assignments Score": st.number_input("Assignment Score", min_value=0.0, max_value=100.0, value=85.0),
    "Login Frequency": st.number_input("Login Frequency", min_value=0, max_value=100, value=10),
    "Time Spent Studying Outside Class (mins)": st.number_input("Study Time (minutes)", min_value=0, max_value=1440, value=120),
    "Proximity to Institute (mins)": st.number_input("Proximity to Institute (minutes)", min_value=0, max_value=120, value=30),}

# Input fields for categorical features
input_data.update({
    "Gender": st.selectbox("Gender", ["Male", "Female"]),
    "Socioeconomic Status": st.selectbox("Socioeconomic Status", ["Low", "Middle", "High"]),
    "Participation in Class Activities": st.selectbox("Participation in Class Activities", ["Active", "Inactive"]),
    "Submissions": st.selectbox("Submissions", ["Early", "On-time", "Late", "No Submissions"]),
    "Motivational Survey Scores": st.selectbox("Motivational Survey Scores", ["Low", "Moderate", "High"]),
    "Stress Levels": st.selectbox("Stress Levels", ["Low", "Moderate", "High"]),
    "Access to Resources": st.selectbox("Access to Resources", ["Adequate", "Inadequate"]),
    "Part-Time Job Status": st.selectbox("Part-Time Job Status", ["Yes", "No"]),
    "Classroom Environment Satisfaction": st.selectbox("Classroom Environment Satisfaction", ["Satisfied", "Neutral", "Unsatisfied"]),
    "Group Learning Sessions": st.selectbox("Group Learning Sessions", ["Frequent", "Rare", "Never"]),
})


input_df = pd.DataFrame([input_data])

# Function to align input_df with pipeline features
def align_features(input_df, expected_features):
    for feature in expected_features:
        if feature not in input_df.columns:
            input_df[feature] = 0  # Default value for missing features
    return input_df[expected_features]  # Reorder columns to match expected order

# Align input_df for each pipeline
input_df_regressor = align_features(input_df, regressor_features)
input_df_pass_fail = align_features(input_df, pass_fail_features)
input_df_engagement = align_features(input_df, engagement_features)
input_df_dropout = align_features(input_df, dropout_features)

# Debug aligned DataFrames (optional)
print("Regressor Input DataFrame:")
print(input_df_regressor.head())
print("Pass/Fail Input DataFrame:")
print(input_df_pass_fail.head())




Regressor Features: ['num__Age' 'num__Attendance Percentage'
 'num__Previous Academic Records (GPA)' 'num__Class Assignments Score'
 'num__Login Frequency' 'num__Time Spent Studying Outside Class (mins)'
 'num__Proximity to Institute (mins)' 'cat__Gender_Female'
 'cat__Gender_Male' 'cat__Socioeconomic Status_High'
 'cat__Socioeconomic Status_Low' 'cat__Socioeconomic Status_Middle'
 'cat__Participation in Class Activities_Active'
 'cat__Participation in Class Activities_Inactive'
 'cat__Submissions_Early' 'cat__Submissions_Late'
 'cat__Submissions_No Submissions' 'cat__Submissions_On-time'
 'cat__Motivational Survey Scores_High'
 'cat__Motivational Survey Scores_Low'
 'cat__Motivational Survey Scores_Moderate' 'cat__Stress Levels_High'
 'cat__Stress Levels_Low' 'cat__Stress Levels_Moderate'
 'cat__Access to Resources_Adequate' 'cat__Access to Resources_Inadequate'
 'cat__Part-Time Job Status_No' 'cat__Part-Time Job Status_Yes'
 'cat__Classroom Environment Satisfaction_Neutral'
 'cat__Cl

In [32]:
import streamlit as st
import joblib
import pandas as pd

# Load models
regressor_pipeline = joblib.load('regressor_pipeline.pkl')
classifier_pipeline_Pass_Fail = joblib.load('classifier_pipeline_Pass_Fail Status.pkl')
classifier_pipeline_Engagement = joblib.load('classifier_pipeline_Engagement Level.pkl')
classifier_pipeline_Dropout = joblib.load('classifier_pipeline_Dropout Likelihood.pkl')
# User input
# Ensure input_df matches the expected features
expected_features = regressor_pipeline.named_steps['preprocessor'].get_feature_names_out()

# Add missing columns with default values
for col in expected_features:
    if col not in input_df.columns:
        input_df[col] = 0  # Use default values for missing columns

# Ensure correct column order
input_df = input_df[expected_features]

st.title("Student Performance Prediction")
st.write("Enter student data:")

# Gather input fields for numerical features
input_data = {
    "Age": st.number_input("Age", min_value=10, max_value=100, value=18),
    "Attendance Percentage": st.number_input("Attendance Percentage", min_value=0.0, max_value=100.0, value=75.0),
    "Previous Academic Records (GPA)": st.number_input("GPA", min_value=0.0, max_value=4.0, value=3.0),
    "Class Assignments Score": st.number_input("Assignment Score", min_value=0.0, max_value=100.0, value=85.0),
    "Login Frequency": st.number_input("Login Frequency", min_value=0, max_value=100, value=10),
    "Time Spent Studying Outside Class (mins)": st.number_input("Study Time (minutes)", min_value=0, max_value=1440, value=120),
    "Proximity to Institute (mins)": st.number_input("Proximity to Institute (minutes)", min_value=0, max_value=120, value=30),}

# Input fields for categorical features
input_data.update({
    "Gender": st.selectbox("Gender", ["Male", "Female"]),
    "Socioeconomic Status": st.selectbox("Socioeconomic Status", ["Low", "Middle", "High"]),
    "Participation in Class Activities": st.selectbox("Participation in Class Activities", ["Active", "Inactive"]),
    "Submissions": st.selectbox("Submissions", ["Early", "On-time", "Late", "No Submissions"]),
    "Motivational Survey Scores": st.selectbox("Motivational Survey Scores", ["Low", "Moderate", "High"]),
    "Stress Levels": st.selectbox("Stress Levels", ["Low", "Moderate", "High"]),
    "Access to Resources": st.selectbox("Access to Resources", ["Adequate", "Inadequate"]),
    "Part-Time Job Status": st.selectbox("Part-Time Job Status", ["Yes", "No"]),
    "Classroom Environment Satisfaction": st.selectbox("Classroom Environment Satisfaction", ["Satisfied", "Neutral", "Unsatisfied"]),
    "Group Learning Sessions": st.selectbox("Group Learning Sessions", ["Frequent", "Rare", "Never"]),
})



if st.button("Predict"):
    # Predict GPA
    gpa_prediction = regressor_pipeline.predict(input_df)[0]
    st.write(f"Predicted Final GPA: {gpa_prediction}")

    # Predict Pass/Fail
    pass_fail_prediction = classifier_pipeline_Pass_Fail.predict(input_df)[0]
    st.write(f"Predicted Pass/Fail Status: {pass_fail_prediction}")

    # Predict Engagement Level
    engagement_prediction = classifier_pipeline_Engagement.predict(input_df)[0]
    st.write(f"Predicted Engagement Level: {engagement_prediction}")

    # Predict Dropout Likelihood
    dropout_prediction = classifier_pipeline_Dropout.predict(input_df)[0]
    st.write(f"Predicted Dropout Likelihood: {dropout_prediction}")


