In [1]:
import warnings

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [2]:
# Load the raw student-performance table from the working directory.
dataset = pd.read_csv("student_performance_large_dataset.csv")

In [3]:
# Bare name as the cell's last expression: Jupyter renders a truncated
# preview (first and last rows) of the frame.
dataset

Unnamed: 0,Student_ID,Age,Gender,Study_Hours_per_Week,Preferred_Learning_Style,Online_Courses_Completed,Participation_in_Discussions,Assignment_Completion_Rate (%),Exam_Score (%),Attendance_Rate (%),Use_of_Educational_Tech,Self_Reported_Stress_Level,Time_Spent_on_Social_Media (hours/week),Sleep_Hours_per_Night,Final_Grade
0,S00001,18,Female,48,Kinesthetic,14,Yes,100,69,66,Yes,High,9,8,C
1,S00002,29,Female,30,Reading/Writing,20,No,71,40,57,Yes,Medium,28,8,D
2,S00003,20,Female,47,Kinesthetic,11,No,60,43,79,Yes,Low,13,7,D
3,S00004,23,Female,13,Auditory,0,Yes,63,70,60,Yes,Low,24,10,B
4,S00005,19,Female,24,Auditory,19,Yes,59,63,93,Yes,Medium,26,8,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,S09996,20,Male,30,Auditory,6,Yes,62,58,76,Yes,Medium,17,6,C
9996,S09997,23,Female,16,Visual,8,Yes,54,84,86,Yes,Medium,6,5,B
9997,S09998,26,Male,23,Visual,3,Yes,54,40,70,No,Medium,20,8,D
9998,S09999,18,Male,41,Reading/Writing,7,Yes,66,45,90,Yes,Low,6,8,D


# Basic information

In [4]:
# (rows, columns) of the raw frame.
dataset.shape

(10000, 15)

In [5]:
# Column labels of the raw frame.
dataset.columns

Index(['Student_ID', 'Age', 'Gender', 'Study_Hours_per_Week',
       'Preferred_Learning_Style', 'Online_Courses_Completed',
       'Participation_in_Discussions', 'Assignment_Completion_Rate (%)',
       'Exam_Score (%)', 'Attendance_Rate (%)', 'Use_of_Educational_Tech',
       'Self_Reported_Stress_Level', 'Time_Spent_on_Social_Media (hours/week)',
       'Sleep_Hours_per_Night', 'Final_Grade'],
      dtype='object')

In [6]:
# Per-column non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   Student_ID                               10000 non-null  object
 1   Age                                      10000 non-null  int64 
 2   Gender                                   10000 non-null  object
 3   Study_Hours_per_Week                     10000 non-null  int64 
 4   Preferred_Learning_Style                 10000 non-null  object
 5   Online_Courses_Completed                 10000 non-null  int64 
 6   Participation_in_Discussions             10000 non-null  object
 7   Assignment_Completion_Rate (%)           10000 non-null  int64 
 8   Exam_Score (%)                           10000 non-null  int64 
 9   Attendance_Rate (%)                      10000 non-null  int64 
 10  Use_of_Educational_Tech                  10000 non-null  ob

In [7]:
# First five rows of the raw frame.
dataset.head()

Unnamed: 0,Student_ID,Age,Gender,Study_Hours_per_Week,Preferred_Learning_Style,Online_Courses_Completed,Participation_in_Discussions,Assignment_Completion_Rate (%),Exam_Score (%),Attendance_Rate (%),Use_of_Educational_Tech,Self_Reported_Stress_Level,Time_Spent_on_Social_Media (hours/week),Sleep_Hours_per_Night,Final_Grade
0,S00001,18,Female,48,Kinesthetic,14,Yes,100,69,66,Yes,High,9,8,C
1,S00002,29,Female,30,Reading/Writing,20,No,71,40,57,Yes,Medium,28,8,D
2,S00003,20,Female,47,Kinesthetic,11,No,60,43,79,Yes,Low,13,7,D
3,S00004,23,Female,13,Auditory,0,Yes,63,70,60,Yes,Low,24,10,B
4,S00005,19,Female,24,Auditory,19,Yes,59,63,93,Yes,Medium,26,8,C


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,Age,Study_Hours_per_Week,Online_Courses_Completed,Assignment_Completion_Rate (%),Exam_Score (%),Attendance_Rate (%),Time_Spent_on_Social_Media (hours/week),Sleep_Hours_per_Night
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,23.4788,27.1303,10.0079,74.922,70.1889,75.0851,14.9365,6.9793
std,3.461986,13.002547,6.136726,14.675437,17.649447,14.749251,9.022639,1.996965
min,18.0,5.0,0.0,50.0,40.0,50.0,0.0,4.0
25%,20.0,16.0,5.0,62.0,55.0,62.0,7.0,5.0
50%,23.0,27.0,10.0,75.0,70.0,75.0,15.0,7.0
75%,27.0,38.0,15.0,88.0,85.0,88.0,23.0,9.0
max,29.0,49.0,20.0,100.0,100.0,100.0,30.0,10.0


# Data preprocessing

# 1. Check for missing values

In [9]:
# Number of missing (null) entries in each column.
print(dataset.isnull().sum())

Student_ID                                 0
Age                                        0
Gender                                     0
Study_Hours_per_Week                       0
Preferred_Learning_Style                   0
Online_Courses_Completed                   0
Participation_in_Discussions               0
Assignment_Completion_Rate (%)             0
Exam_Score (%)                             0
Attendance_Rate (%)                        0
Use_of_Educational_Tech                    0
Self_Reported_Stress_Level                 0
Time_Spent_on_Social_Media (hours/week)    0
Sleep_Hours_per_Night                      0
Final_Grade                                0
dtype: int64


In [10]:
# Drop Unnecessary Columns
# Student_ID is a per-row identifier, so it is not useful for prediction.
dataset = dataset.drop(columns=["Student_ID"])

In [11]:
# Handle Categorical Variables
# List of categorical columns
categorical_cols = ['Gender', 'Preferred_Learning_Style', 'Participation_in_Discussions',
                    'Use_of_Educational_Tech', 'Self_Reported_Stress_Level']

# Explicit label -> integer code tables for the binary and ordinal columns.
#
# 'Preferred_Learning_Style' is deliberately NOT mapped here: it is a nominal
# feature (labels such as Visual / Auditory / Kinesthetic / Reading/Writing)
# and is one-hot encoded by the pd.get_dummies call in a later cell. Mapping
# it through {'Yes': 1, 'No': 0} matches none of its labels, turns the whole
# column into NaN, and makes get_dummies emit no indicator columns at all.
yes_no_codes = {'Yes': 1, 'No': 0}
label_codes = {
    'Gender': {'Male': 1, 'Female': 0},
    'Participation_in_Discussions': yes_no_codes,
    'Use_of_Educational_Tech': yes_no_codes,
    # Ordinal: the codes preserve the Low < Medium < High ordering.
    'Self_Reported_Stress_Level': {'Low': 0, 'Medium': 1, 'High': 2},
}

# Encode categorical features
for column, codes in label_codes.items():
    raw_values = dataset[column]
    encoded = raw_values.map(codes)
    # Series.map yields NaN for any label missing from the table. Surface that
    # instead of letting it pass silently (it also fires if this cell is
    # re-run on already-encoded data).
    unmapped = raw_values[encoded.isna() & raw_values.notna()].unique()
    if len(unmapped) > 0:
        warnings.warn(
            f"{column!r}: values {sorted(map(str, unmapped))} are not in the "
            "encoding table and were set to NaN"
        )
    dataset[column] = encoded



In [13]:
# One-hot encode learning style
# The column must still hold its original string labels at this point:
# get_dummies creates no indicator for NaN entries (dummy_na defaults to False).
# drop_first=True omits the first category's indicator column.
dataset = pd.get_dummies(
    dataset,
    columns=['Preferred_Learning_Style'],
    drop_first=True,
    dtype=int,
)


In [14]:
# Encode the target column (Final Grade)
# LabelEncoder is already imported in the notebook's first cell, so it is not
# re-imported here.
# Integer codes follow the sorted order of the distinct grade labels;
# `le.classes_` holds the labels in code order and `le.inverse_transform`
# converts codes back to grades.
le = LabelEncoder()
dataset['Final_Grade'] = le.fit_transform(dataset['Final_Grade'])


In [16]:
# Standardize numerical columns
# Only these numeric features are rescaled; the encoded categorical columns
# and the target are not in the list and stay as they are.
numerical_cols = [
    'Age',
    'Study_Hours_per_Week',
    'Online_Courses_Completed',
    'Assignment_Completion_Rate (%)',
    'Exam_Score (%)',
    'Attendance_Rate (%)',
    'Time_Spent_on_Social_Media (hours/week)',
    'Sleep_Hours_per_Night',
]

# NOTE(review): the scaler is fitted on every row. If a train/test split is
# performed downstream, the test rows have contributed to the mean/std used
# here -- confirm this is acceptable or fit on the training rows only.
scaler = StandardScaler()
dataset[numerical_cols] = scaler.fit_transform(dataset[numerical_cols])


In [17]:
# Save to CSV
# index=False keeps the row index out of the written file.
dataset.to_csv(path_or_buf="preprocessed_student_dataset.csv", index=False)
print("✅ Preprocessed data saved as 'preprocessed_student_dataset.csv'")

✅ Preprocessed data saved as 'preprocessed_student_dataset.csv'
