In [103]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import numpy as np

# 1. Business understanding
We are analyzing the student performance dataset to understand what factors affect the students performance the most, can we predict the final grade of a student based on the other features in the dataset and how different learning styles affect the performance of the student. We will use the following columns to predict the final grade of a student:

In [104]:
df = pd.read_csv('./datasets/student_performance_large_dataset.csv')
df

Unnamed: 0,Student_ID,Age,Gender,Study_Hours_per_Week,Preferred_Learning_Style,Online_Courses_Completed,Participation_in_Discussions,Assignment_Completion_Rate (%),Exam_Score (%),Attendance_Rate (%),Use_of_Educational_Tech,Self_Reported_Stress_Level,Time_Spent_on_Social_Media (hours/week),Sleep_Hours_per_Night,Final_Grade
0,S00001,18,Female,48,Kinesthetic,14,Yes,100,69,66,Yes,High,9,8,C
1,S00002,29,Female,30,Reading/Writing,20,No,71,40,57,Yes,Medium,28,8,D
2,S00003,20,Female,47,Kinesthetic,11,No,60,43,79,Yes,Low,13,7,D
3,S00004,23,Female,13,Auditory,0,Yes,63,70,60,Yes,Low,24,10,B
4,S00005,19,Female,24,Auditory,19,Yes,59,63,93,Yes,Medium,26,8,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,S09996,20,Male,30,Auditory,6,Yes,62,58,76,Yes,Medium,17,6,C
9996,S09997,23,Female,16,Visual,8,Yes,54,84,86,Yes,Medium,6,5,B
9997,S09998,26,Male,23,Visual,3,Yes,54,40,70,No,Medium,20,8,D
9998,S09999,18,Male,41,Reading/Writing,7,Yes,66,45,90,Yes,Low,6,8,D


# 2. Data understanding
The data is pulled from Kaggle and created by Adil Shahim (https://www.kaggle.com/datasets/adilshamim8/student-performance-and-learning-style). The dataset contains the following columns:

- **Student_ID** – Unique identifier for each student
- **Age** – Student's age (18-30 years)
- **Gender** – Male, Female, or Other
- **Study_Hours_per_Week** – Hours spent studying per week (5-50 hours)
- **Preferred_Learning_Style** – Visual, Auditory, Reading/Writing, Kinesthetic
- **Online_Courses_Completed** – Number of online courses completed (0-20)
- **Participation_in_Discussions** – Whether the student actively participates in discussions (Yes/No)
- **Assignment_Completion_Rate (%)** – Percentage of assignments completed (50%-100%)
- **Exam_Score (%)** – Student’s final exam score (40%-100%)
- **Attendance_Rate (%)** – Percentage of classes attended (50%-100%)
- **Use_of_Educational_Tech** – Whether the student uses educational technology (Yes/No)
- **Self_Reported_Stress_Level** – Student’s stress level (Low, Medium, High)
- **Time_Spent_on_Social_Media** (hours/week) – Weekly hours spent on social media (0-30 hours)
- **Sleep_Hours_per_Night** – Average sleep duration (4-10 hours)
- **Final_Grade** – Assigned grade based on exam score (A, B, C, D, F)

The dataset does not contain missing values by default. However, we will check for missing values and duplicates in the dataset just to be sure.

# 3. Data preparation
We will check for missing values and duplicates in the dataset. We will also check for unique values in each column to ensure that the data is clean and ready for analysis.

In [105]:
print(df.describe()) # Summary statistics

                Age  Study_Hours_per_Week  Online_Courses_Completed  \
count  10000.000000          10000.000000              10000.000000   
mean      23.478800             27.130300                 10.007900   
std        3.461986             13.002547                  6.136726   
min       18.000000              5.000000                  0.000000   
25%       20.000000             16.000000                  5.000000   
50%       23.000000             27.000000                 10.000000   
75%       27.000000             38.000000                 15.000000   
max       29.000000             49.000000                 20.000000   

       Assignment_Completion_Rate (%)  Exam_Score (%)  Attendance_Rate (%)  \
count                    10000.000000    10000.000000         10000.000000   
mean                        74.922000       70.188900            75.085100   
std                         14.675437       17.649447            14.749251   
min                         50.000000       40.0

In [106]:
missing_values = df.isnull().sum() # Check for missing values
print(missing_values)

Student_ID                                 0
Age                                        0
Gender                                     0
Study_Hours_per_Week                       0
Preferred_Learning_Style                   0
Online_Courses_Completed                   0
Participation_in_Discussions               0
Assignment_Completion_Rate (%)             0
Exam_Score (%)                             0
Attendance_Rate (%)                        0
Use_of_Educational_Tech                    0
Self_Reported_Stress_Level                 0
Time_Spent_on_Social_Media (hours/week)    0
Sleep_Hours_per_Night                      0
Final_Grade                                0
dtype: int64


Mapping categorical values to numerical values

In [107]:
df["Gender"] = df["Gender"].map({"Male" : 1, "Female" : 2, "Other" : 3}) # Mapping genders by numbers
df["Gender"]

0       2
1       2
2       2
3       2
4       2
       ..
9995    1
9996    2
9997    1
9998    1
9999    1
Name: Gender, Length: 10000, dtype: int64

In [108]:
df["Preferred_Learning_Style"] = df["Preferred_Learning_Style"].map({"Kinesthetic" : 1, "Reading/Writing" : 2, "Auditory" : 3, "Visual" : 3}) # Mapping learning styles by numbers
df["Preferred_Learning_Style"]

0       1
1       2
2       1
3       3
4       3
       ..
9995    3
9996    3
9997    3
9998    2
9999    3
Name: Preferred_Learning_Style, Length: 10000, dtype: int64

## 4. Modelling
## 5. Evaluation
## 6. Deployment