<a href="https://colab.research.google.com/github/ToobaObeidy-1/DA_bootcamp_python/blob/main/Copy_of_Student_Performance_MLmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Student Performance Analysis and Risk Detection: Machine Learning Implementation


# Introduction

Dataset Overview:
The dataset contains students' performance records, their socio-economic status, and other variables that may help analyze student progress and detect risk among underperforming students.

Objective:
To predict student performance metrics (e.g., Final GPA, Pass/Fail status) and risk indicators (e.g.,
Engagement level, Dropout likelihood).



# Import required libraries

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report
from sklearn.preprocessing import OneHotEncoder,LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load data

In [31]:
# Load the augmented student-performance dataset from the Colab runtime.
# NOTE(review): this is an absolute Colab path; the CSV must already be
# present under /content before this cell runs.
data = pd.read_csv("/content/Sample_real_time_performance_dataset (2).csv")

# `data` is already a DataFrame, so wrapping it in pd.DataFrame() only creates
# a second object sharing the same underlying values. Take an explicit copy so
# later edits to `df` cannot leak into `data`, which the models are trained on.
df = data.copy()
df

Unnamed: 0,Student ID,Age,Gender,Socioeconomic Status,Attendance Percentage,Previous Academic Records (GPA),Participation in Class Activities,Class Assignments Score,Login Frequency,Submissions,...,Access to Resources,Time Spent Studying Outside Class (mins),Part-Time Job Status,Classroom Environment Satisfaction,Group Learning Sessions,Proximity to Institute (mins),Final GPA,Pass/Fail Status,Engagement Level,Dropout Likelihood
0,S001,18,Male,Low,60,2.0,Inactive,50,5,Early,...,Adequate,50,Yes,Satisfied,Frequent,10,2.0,Fail,High,Likely
1,S002,19,Female,Middle,61,2.1,Active,51,6,On-time,...,Inadequate,51,No,Neutral,Rare,11,2.1,Pass,Low,Unlikely
2,S003,20,Male,High,62,2.2,Active,52,7,Late,...,Adequate,52,No,Satisfied,Never,12,2.2,Pass,Low,Unlikely
3,S004,21,Female,Low,63,2.3,Active,53,8,No Submissions,...,Inadequate,53,Yes,Unsatisfied,Never,13,2.3,Pass,High,Unlikely
4,S005,22,Male,Middle,64,2.4,Inactive,54,9,Early,...,Adequate,54,No,Satisfied,Frequent,14,2.4,Pass,Low,Unlikely
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,S2996,19,Female,Low,81,2.1,Active,71,11,On-time,...,Inadequate,71,Yes,Neutral,Rare,31,2.1,Pass,High,Unlikely
2496,S2997,22,Male,Low,84,2.4,Inactive,74,14,Early,...,Adequate,74,Yes,Satisfied,Frequent,34,2.4,Pass,High,Unlikely
2497,S2998,22,Male,High,74,3.4,Active,64,19,Late,...,Adequate,64,No,Satisfied,Never,24,3.4,Pass,Low,Unlikely
2498,S2999,21,Male,Middle,78,3.8,Active,58,18,Late,...,Adequate,108,No,Satisfied,Never,38,3.8,Pass,Low,Unlikely


In [4]:
# provided_dataset_path = "/content/Sample_real_time_performance_dataset (1).csv"
# existing_df = pd.read_csv(provided_dataset_path)

# # Generate 2000 additional samples by resampling rows of the existing dataset
# import random

# # Duplicate rows to generate more data and shuffle the dataset
# additional_samples = existing_df.sample(n=2000, replace=True, random_state=42)

# # Assign new Student IDs to ensure uniqueness
# additional_samples["Student ID"] = [
#     f"S{1000 + i}" for i in range(1, len(additional_samples) + 1)
# ]

# # Combine the datasets
# augmented_df = pd.concat([existing_df, additional_samples], ignore_index=True)

# # Save the updated dataset
# augmented_dataset_path = "/content/Sample_real_time_performance_dataset (2).csv"
# augmented_df.to_csv(augmented_dataset_path, index=False)

# augmented_dataset_path

'/content/Sample_real_time_performance_dataset (2).csv'

# Data Preprocessing

## Handle missing values

In [32]:
# Count the missing values in each column and print the per-column totals.
missing_per_column = df.isna().sum()
print(missing_per_column)

Student ID                                  0
Age                                         0
Gender                                      0
Socioeconomic Status                        0
Attendance Percentage                       0
Previous Academic Records (GPA)             0
Participation in Class Activities           0
Class Assignments Score                     0
Login Frequency                             0
Submissions                                 0
Motivational Survey Scores                  0
Stress Levels                               0
Access to Resources                         0
Time Spent Studying Outside Class (mins)    0
Part-Time Job Status                        0
Classroom Environment Satisfaction          0
Group Learning Sessions                     0
Proximity to Institute (mins)               0
Final GPA                                   0
Pass/Fail Status                            0
Engagement Level                            0
Dropout Likelihood                

## Regression Analysis

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error

# Define features and target, grouped by how each column is preprocessed.
numerical_features = ['Age', 'Attendance Percentage', 'Previous Academic Records (GPA)',
                      'Class Assignments Score', 'Login Frequency',
                      'Time Spent Studying Outside Class (mins)','Proximity to Institute (mins)']
categorical_features = ['Gender', 'Socioeconomic Status', 'Participation in Class Activities',
                        'Submissions', 'Motivational Survey Scores', 'Stress Levels',
                        'Access to Resources', 'Part-Time Job Status',
                        'Classroom Environment Satisfaction',
                        'Group Learning Sessions']
target_regression = 'Final GPA'

# NOTE(review): in the sample rows displayed when the data was loaded,
# 'Final GPA' equals 'Previous Academic Records (GPA)' in every row. The target
# appears to be leaked through that feature, which would explain a near-zero
# MAE - confirm before trusting the score.

# Preprocessing pipeline: standardise numeric columns, one-hot encode
# categorical ones. handle_unknown='ignore' encodes a category that was not
# seen during fit as all zeros instead of raising at prediction time.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# Regression pipeline. A fixed random_state makes the forest (and therefore
# the reported metrics) reproducible across kernel restarts.
regressor_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Split data: 70% train / 30% test with a fixed seed.
X = data[categorical_features + numerical_features]
y_reg = data[target_regression]

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.3, random_state=42)

# Fit on the training split and predict the held-out split.
regressor_pipeline.fit(X_train_reg, y_train_reg)
y_pred_reg = regressor_pipeline.predict(X_test_reg)

# Evaluate with mean absolute error on the held-out split.
print(f"Regression MAE: {mean_absolute_error(y_test_reg, y_pred_reg)}")


Regression MAE: 3.864168244642011e-15


### Cross-Validation


In [34]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the regression pipeline on the full feature set.
# The scorer returns negated MAE, so the sign is flipped back when printing.
cv_scores = cross_val_score(
    estimator=regressor_pipeline,
    X=X,
    y=y_reg,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print(f"Cross-Validation MAE: {-cv_scores.mean()}")


Cross-Validation MAE: 4.0095926578942455e-15


## Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Local import, matching this notebook's per-cell import style.
from sklearn.base import clone

# Define classification targets
classification_targets = ['Pass/Fail Status', 'Engagement Level', 'Dropout Likelihood']

# Classification pipeline.
# The preprocessor is cloned so this pipeline owns its own transformer: a
# Pipeline fits the step objects it is given, so sharing the single
# `preprocessor` instance with `regressor_pipeline` would refit the regressor's
# transformer every time a classifier is fitted.
# A fixed random_state makes the forest reproducible.
classifier_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split data for classification (all three label columns are split together
# so every target is evaluated on the same rows).
y_class = data[classification_targets]
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X, y_class, test_size=0.3, random_state=42)

# Train and evaluate classifiers for each target.
# NOTE: `classifier_pipeline` is refit on every iteration, so after the loop it
# only holds the model for the last target ('Dropout Likelihood').
for target in classification_targets:
    classifier_pipeline.fit(X_train_class, y_train_class[target])
    y_pred_class = classifier_pipeline.predict(X_test_class)
    print(f"Classification Report for {target}:\n{classification_report(y_test_class[target], y_pred_class)}")


Classification Report for Pass/Fail Status:
              precision    recall  f1-score   support

        Fail       1.00      1.00      1.00        31
        Pass       1.00      1.00      1.00       719

    accuracy                           1.00       750
   macro avg       1.00      1.00      1.00       750
weighted avg       1.00      1.00      1.00       750

Classification Report for Engagement Level:
              precision    recall  f1-score   support

        High       1.00      1.00      1.00       214
         Low       1.00      1.00      1.00       536

    accuracy                           1.00       750
   macro avg       1.00      1.00      1.00       750
weighted avg       1.00      1.00      1.00       750

Classification Report for Dropout Likelihood:
              precision    recall  f1-score   support

      Likely       1.00      1.00      1.00       120
    Unlikely       1.00      1.00      1.00       630

    accuracy                           1.00     

### Cross-Validation

In [36]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the classifier pipeline, evaluated on the
# 'Pass/Fail Status' target only.
cv_scores = cross_val_score(
    estimator=classifier_pipeline,
    X=X,
    y=y_class['Pass/Fail Status'],
    cv=5,
    scoring='accuracy',
)
print(f"Cross-Validation Accuracy: {cv_scores.mean()}")


Cross-Validation Accuracy: 1.0


In [37]:
# Remove the 'Student ID' identifier column (the first column of the loaded frame).
# Dropping by name instead of by position keeps this cell idempotent: running it
# again is a no-op rather than silently dropping the next column ('Age', ...).
df = df.drop(columns=['Student ID'], errors='ignore')
df.head()

Unnamed: 0,Age,Gender,Socioeconomic Status,Attendance Percentage,Previous Academic Records (GPA),Participation in Class Activities,Class Assignments Score,Login Frequency,Submissions,Motivational Survey Scores,...,Access to Resources,Time Spent Studying Outside Class (mins),Part-Time Job Status,Classroom Environment Satisfaction,Group Learning Sessions,Proximity to Institute (mins),Final GPA,Pass/Fail Status,Engagement Level,Dropout Likelihood
0,18,Male,Low,60,2.0,Inactive,50,5,Early,High,...,Adequate,50,Yes,Satisfied,Frequent,10,2.0,Fail,High,Likely
1,19,Female,Middle,61,2.1,Active,51,6,On-time,Moderate,...,Inadequate,51,No,Neutral,Rare,11,2.1,Pass,Low,Unlikely
2,20,Male,High,62,2.2,Active,52,7,Late,Low,...,Adequate,52,No,Satisfied,Never,12,2.2,Pass,Low,Unlikely
3,21,Female,Low,63,2.3,Active,53,8,No Submissions,High,...,Inadequate,53,Yes,Unsatisfied,Never,13,2.3,Pass,High,Unlikely
4,22,Male,Middle,64,2.4,Inactive,54,9,Early,Moderate,...,Adequate,54,No,Satisfied,Frequent,14,2.4,Pass,Low,Unlikely


In [38]:
# Show the class balance of each classification label column.
for label_column in ('Pass/Fail Status', 'Engagement Level', 'Dropout Likelihood'):
    print(data[label_column].value_counts())


Pass/Fail Status
Pass    2377
Fail     123
Name: count, dtype: int64
Engagement Level
Low     1784
High     716
Name: count, dtype: int64
Dropout Likelihood
Unlikely    2103
Likely       397
Name: count, dtype: int64


# Deployment

## Saving Model using joblib

In [39]:
import re
import joblib

# Persist the fitted GPA regression pipeline (preprocessing + forest) to disk.
joblib.dump(value=regressor_pipeline, filename='regressor_pipeline.pkl')

# Define a safe filename generator
def sanitize_filename(name):
    """Return ``name`` with every disallowed character replaced by ``_``.

    Characters that are kept unchanged: regex word characters (letters, digits,
    underscore), hyphen, dot and space. Everything else becomes an underscore.
    """
    disallowed_chars = re.compile(r'[^\w\-_\. ]')
    return disallowed_chars.sub('_', name)

# Local import, matching this notebook's per-cell import style.
from sklearn.base import clone

# Save one classification model per target.
# `classifier_pipeline` is a single object that was refit for every target, so
# dumping it directly would store the model of the last fitted target under all
# three filenames. Fit a fresh clone on each target's labels and save that.
for target in classification_targets:
    sanitized_target = sanitize_filename(target)  # Sanitize the target name
    filename = f'classifier_pipeline_{sanitized_target}.pkl'
    target_pipeline = clone(classifier_pipeline).fit(X_train_class, y_train_class[target])
    joblib.dump(target_pipeline, filename)
    print(f"Model saved: {filename}")


Model saved: classifier_pipeline_Pass_Fail Status.pkl
Model saved: classifier_pipeline_Engagement Level.pkl
Model saved: classifier_pipeline_Dropout Likelihood.pkl


In [40]:
# Load regression model
loaded_regressor = joblib.load('regressor_pipeline.pkl')

# Load classification model
loaded_classifier = joblib.load('classifier_pipeline_Pass_Fail Status.pkl')
loaded_classifier = joblib.load('classifier_pipeline_Engagement Level.pkl')
loaded_classifier = joblib.load('classifier_pipeline_Dropout Likelihood.pkl')


## Deploying with streamlit

In [14]:
# Install Streamlit with the %pip magic so the package lands in the environment
# of the running kernel (a plain `!pip` may resolve to a different interpreter).
# NOTE(review): consider pinning a version (streamlit==X.Y.Z) for reproducibility.
%pip install streamlit -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
# Print this runtime's public IPv4 address: -q silences wget's progress output
# and `-O -` writes the response body to stdout.
# NOTE(review): presumably needed as the localtunnel access password - confirm.
!wget -q -O - ipv4.icanhazip.com

34.125.67.249


In [16]:
# Start the Streamlit app in the background and expose port 8501 via localtunnel.
# `--yes` lets npx install localtunnel without the interactive
# "Ok to proceed? (y)" prompt, which cannot be answered from a notebook cell.
# NOTE(review): app.py must exist in the working directory before this runs;
# the recorded run failed with "File does not exist: app.py". The app code lives
# in the following notebook cell and still has to be saved to app.py.
! streamlit run app.py & npx --yes localtunnel --port 8501

Usage: streamlit run [OPTIONS] TARGET [ARGS]...
Try 'streamlit run --help' for help.

Error: Invalid value: File does not exist: app.py
[1G[0JNeed to install the following packages:
  localtunnel@2.0.2
Ok to proceed? (y) [20G^C


In [17]:
import joblib
import streamlit as st
import pandas as pd

def _preprocessor_feature_names(pipeline):
    """Return the output feature names of a pipeline's 'preprocessor' step."""
    return pipeline.named_steps['preprocessor'].get_feature_names_out()


# Load the persisted pipelines: one regressor plus one classifier file per target.
regressor_pipeline = joblib.load('regressor_pipeline.pkl')
classifier_pipeline_Pass_Fail = joblib.load('classifier_pipeline_Pass_Fail Status.pkl')
classifier_pipeline_Engagement = joblib.load('classifier_pipeline_Engagement Level.pkl')
classifier_pipeline_Dropout = joblib.load('classifier_pipeline_Dropout Likelihood.pkl')

# Feature names produced by each pipeline's preprocessing step.
regressor_features = _preprocessor_feature_names(regressor_pipeline)
pass_fail_features = _preprocessor_feature_names(classifier_pipeline_Pass_Fail)
engagement_features = _preprocessor_feature_names(classifier_pipeline_Engagement)
dropout_features = _preprocessor_feature_names(classifier_pipeline_Dropout)

# Page header.
st.title("Student Performance Prediction")
st.write("Enter student data:")
st.write("Enter student ID")

# One widget per model feature: numeric inputs first, then the categorical
# select boxes. The dict keys are the raw column names used for training.
input_data = {
    # Numerical features
    "Age": st.number_input("Age", min_value=10, max_value=100, value=18),
    "Attendance Percentage": st.number_input(
        "Attendance Percentage", min_value=0.0, max_value=100.0, value=75.0
    ),
    "Previous Academic Records (GPA)": st.number_input(
        "GPA", min_value=0.0, max_value=4.0, value=3.0
    ),
    "Class Assignments Score": st.number_input(
        "Assignment Score", min_value=0.0, max_value=100.0, value=85.0
    ),
    "Login Frequency": st.number_input(
        "Login Frequency", min_value=0, max_value=100, value=10
    ),
    "Time Spent Studying Outside Class (mins)": st.number_input(
        "Study Time (minutes)", min_value=0, max_value=1440, value=120
    ),
    "Proximity to Institute (mins)": st.number_input(
        "Proximity to Institute (minutes)", min_value=0, max_value=120, value=30
    ),
    # Categorical features
    "Gender": st.selectbox("Gender", ["Male", "Female"]),
    "Socioeconomic Status": st.selectbox(
        "Socioeconomic Status", ["Low", "Middle", "High"]
    ),
    "Participation in Class Activities": st.selectbox(
        "Participation in Class Activities", ["Active", "Inactive"]
    ),
    "Submissions": st.selectbox(
        "Submissions", ["Early", "On-time", "Late", "No Submissions"]
    ),
    "Motivational Survey Scores": st.selectbox(
        "Motivational Survey Scores", ["Low", "Moderate", "High"]
    ),
    "Stress Levels": st.selectbox("Stress Levels", ["Low", "Moderate", "High"]),
    "Access to Resources": st.selectbox(
        "Access to Resources", ["Adequate", "Inadequate"]
    ),
    "Part-Time Job Status": st.selectbox("Part-Time Job Status", ["Yes", "No"]),
    "Classroom Environment Satisfaction": st.selectbox(
        "Classroom Environment Satisfaction", ["Satisfied", "Neutral", "Unsatisfied"]
    ),
    "Group Learning Sessions": st.selectbox(
        "Group Learning Sessions", ["Frequent", "Rare", "Never"]
    ),
}

# Single-row frame holding the values entered in the form.
input_df = pd.DataFrame([input_data])

# Function to align input_df with pipeline features
def align_features(input_df, expected_features):
    """Return a new frame whose columns are exactly ``expected_features``, in order.

    Args:
        input_df: DataFrame holding the available columns. It is not modified.
        expected_features: Iterable of column names the result must contain.

    Returns:
        A DataFrame with one column per expected feature, in the given order.
        Features that are absent from ``input_df`` are filled with 0.
    """
    # reindex builds a new frame, so the caller's frame keeps only its own
    # columns. Assigning the missing columns in place would add them to the
    # very frame that is later handed to the pipelines for prediction.
    return input_df.reindex(columns=list(expected_features), fill_value=0)

# Align input_df for each pipeline.
# Each call receives its own copy, so the raw `input_df` handed to the
# pipelines below is never modified by the alignment step. The aligned frames
# are kept for inspection; a later cell prints their columns.
input_df_regressor = align_features(input_df.copy(), regressor_features)
input_df_pass_fail = align_features(input_df.copy(), pass_fail_features)
input_df_engagement = align_features(input_df.copy(), engagement_features)
input_df_dropout = align_features(input_df.copy(), dropout_features)

# The pipelines include their own 'preprocessor' step, so they are given the
# raw form values (`input_df`), not the aligned frames.
if st.button("Predict"):
    # Predict GPA
    gpa_prediction = regressor_pipeline.predict(input_df)[0]
    st.write(f"Predicted Final GPA: {gpa_prediction}")

    # Predict Pass/Fail
    pass_fail_prediction = classifier_pipeline_Pass_Fail.predict(input_df)[0]
    st.write(f"Predicted Pass/Fail Status: {pass_fail_prediction}")

    # Predict Engagement Level
    engagement_prediction = classifier_pipeline_Engagement.predict(input_df)[0]
    st.write(f"Predicted Engagement Level: {engagement_prediction}")

    # Predict Dropout Likelihood
    dropout_prediction = classifier_pipeline_Dropout.predict(input_df)[0]
    st.write(f"Predicted Dropout Likelihood: {dropout_prediction}")



2024-11-23 16:06:55.924 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2024-11-23 16:06:55.957 Session state does not function when running a script without `streamlit run`


In [18]:
# Show the column index of each aligned input frame, one line per pipeline.
for caption, aligned_frame in (
    ("Regressor Input Features:", input_df_regressor),
    ("Pass/Fail Input Features:", input_df_pass_fail),
    ("Engagement Input Features:", input_df_engagement),
    ("Dropout Input Features:", input_df_dropout),
):
    print(caption, aligned_frame.columns)

Regressor Input Features: Index(['num__Age', 'num__Attendance Percentage',
       'num__Previous Academic Records (GPA)', 'num__Class Assignments Score',
       'num__Login Frequency', 'num__Time Spent Studying Outside Class (mins)',
       'num__Proximity to Institute (mins)', 'cat__Gender_Female',
       'cat__Gender_Male', 'cat__Socioeconomic Status_High',
       'cat__Socioeconomic Status_Low', 'cat__Socioeconomic Status_Middle',
       'cat__Participation in Class Activities_Active',
       'cat__Participation in Class Activities_Inactive',
       'cat__Submissions_Early', 'cat__Submissions_Late',
       'cat__Submissions_No Submissions', 'cat__Submissions_On-time',
       'cat__Motivational Survey Scores_High',
       'cat__Motivational Survey Scores_Low',
       'cat__Motivational Survey Scores_Moderate', 'cat__Stress Levels_High',
       'cat__Stress Levels_Low', 'cat__Stress Levels_Moderate',
       'cat__Access to Resources_Adequate',
       'cat__Access to Resources_Inadequa

In [19]:
# Compare the feature names emitted by the Pass/Fail pipeline's preprocessor
# with the columns currently present in the form's input frame.
pass_fail_preprocessor = classifier_pipeline_Pass_Fail.named_steps['preprocessor']
expected_features = pass_fail_preprocessor.get_feature_names_out()
print("Expected Features:", expected_features)
print("Input Features:", input_df.columns)

Expected Features: ['num__Age' 'num__Attendance Percentage'
 'num__Previous Academic Records (GPA)' 'num__Class Assignments Score'
 'num__Login Frequency' 'num__Time Spent Studying Outside Class (mins)'
 'num__Proximity to Institute (mins)' 'cat__Gender_Female'
 'cat__Gender_Male' 'cat__Socioeconomic Status_High'
 'cat__Socioeconomic Status_Low' 'cat__Socioeconomic Status_Middle'
 'cat__Participation in Class Activities_Active'
 'cat__Participation in Class Activities_Inactive'
 'cat__Submissions_Early' 'cat__Submissions_Late'
 'cat__Submissions_No Submissions' 'cat__Submissions_On-time'
 'cat__Motivational Survey Scores_High'
 'cat__Motivational Survey Scores_Low'
 'cat__Motivational Survey Scores_Moderate' 'cat__Stress Levels_High'
 'cat__Stress Levels_Low' 'cat__Stress Levels_Moderate'
 'cat__Access to Resources_Adequate' 'cat__Access to Resources_Inadequate'
 'cat__Part-Time Job Status_No' 'cat__Part-Time Job Status_Yes'
 'cat__Classroom Environment Satisfaction_Neutral'
 'cat__Cla