In [37]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [38]:
# Read the student performance dataset (the path is relative to the notebook's
# working directory) and preview its first 20 rows.
df = pd.read_csv(filepath_or_buffer="../Data/StudentPerformanceFactors.csv")
df.head(n=20)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70
5,19,88,Medium,Medium,Yes,8,89,Medium,Yes,3,Medium,Medium,Public,Positive,3,No,Postgraduate,Near,Male,71
6,29,84,Medium,Low,Yes,7,68,Low,Yes,1,Low,Medium,Private,Neutral,2,No,High School,Moderate,Male,67
7,25,78,Low,High,Yes,6,50,Medium,Yes,1,High,High,Public,Negative,2,No,High School,Far,Male,66
8,17,94,Medium,High,No,6,80,High,Yes,0,Medium,Low,Private,Neutral,1,No,College,Near,Male,69
9,23,98,Medium,Medium,Yes,8,71,Medium,Yes,0,High,High,Public,Positive,5,No,High School,Moderate,Male,72


Missing Values

In [39]:
# Per-column count of missing values, keeping only the columns that have any.
df.isnull().sum().loc[lambda null_counts: null_counts > 0]

Teacher_Quality             78
Parental_Education_Level    90
Distance_from_Home          67
dtype: int64

In [40]:
# Impute the three columns that contain missing values.
#
# Each result is assigned back to its column instead of calling
# fillna(..., inplace=True) on the selection df[...]. That chained in-place
# form operates on an intermediate object: recent pandas versions emit a
# FutureWarning for it (this notebook filters FutureWarning, so it would go
# unnoticed), and with Copy-on-Write enabled it leaves df unchanged.

# Fill missing Teacher_Quality with the most common value (mode)
df['Teacher_Quality'] = df['Teacher_Quality'].fillna(df['Teacher_Quality'].mode()[0])

# Fill missing Parental_Education_Level with the explicit category 'Missing'
df['Parental_Education_Level'] = df['Parental_Education_Level'].fillna('Missing')

# Fill missing Distance_from_Home with the explicit category 'Missing'
df['Distance_from_Home'] = df['Distance_from_Home'].fillna('Missing')

Standardizing Categorical Values (capitalization, whitespace, etc.)

In [41]:
# Text-valued columns whose labels are normalised before encoding.
categorical_cols = [
    'Parental_Involvement',
    'Access_to_Resources',
    'Extracurricular_Activities',
    'Motivation_Level',
    'Internet_Access',
    'Family_Income',
    'Teacher_Quality',
    'School_Type',
    'Peer_Influence',
    'Learning_Disabilities',
    'Parental_Education_Level',
    'Distance_from_Home',
    'Gender',
]

# Cast every listed column to str, trim surrounding whitespace and convert to
# Title Case so that spelling variants of one label collapse into one value.
df[categorical_cols] = df[categorical_cols].apply(
    lambda labels: labels.astype(str).str.strip().str.title()
)

Encoding Categorical Variables for Regression (Ordinal and Dummy Variables)

In [42]:
# Ordinal encoding: convert ordered categories into numeric values.
# The keys must match the Title-Cased labels in the data exactly, because
# Series.map turns any value without a key into NaN without raising.
# 'Missing' (the placeholder used for imputed rows) is coded as 0.
ordinal_maps = {
    'Teacher_Quality': {'Low': 1, 'Medium': 2, 'High': 3},
    'Motivation_Level': {'Low': 1, 'Medium': 2, 'High': 3},
    'Parental_Involvement': {'Low': 1, 'Medium': 2, 'High': 3},
    'Access_to_Resources': {'Low': 1, 'Medium': 2, 'High': 3},
    # The dataset spells this level 'Postgraduate'; a 'Postgrad' key would
    # match nothing and silently encode every postgraduate row as NaN.
    'Parental_Education_Level': {'High School': 1, 'College': 2, 'Postgraduate': 3, 'Missing': 0},
    'Distance_from_Home': {'Near': 1, 'Moderate': 2, 'Far': 3, 'Missing': 0}
}

for col, mapping in ordinal_maps.items():
    if col in df.columns:
        # Fail loudly on any category that has no ordinal code instead of
        # letting .map() introduce NaN into the feature column.
        unmapped = sorted(set(df[col].dropna().unique()) - set(mapping), key=str)
        if unmapped:
            raise ValueError(
                f"No ordinal code defined for {col} value(s): {unmapped}"
            )
        df[col] = df[col].map(mapping)

# Dummy variable encoding: create binary columns for unordered categories.
# drop_first=True drops one level per variable to avoid perfectly collinear
# dummy columns in the regression.
nominal_cols = ['Internet_Access', 'Extracurricular_Activities', 'Family_Income',
                'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Gender']

df = pd.get_dummies(df, columns=nominal_cols, drop_first=True)

# get_dummies may return boolean columns; cast them to 0/1 integers.
bool_cols = df.select_dtypes(include='bool').columns
df[bool_cols] = df[bool_cols].astype(int)

In [43]:
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Sleep_Hours,Previous_Scores,Motivation_Level,Tutoring_Sessions,Teacher_Quality,Physical_Activity,...,Exam_Score,Internet_Access_Yes,Extracurricular_Activities_Yes,Family_Income_Low,Family_Income_Medium,School_Type_Public,Peer_Influence_Neutral,Peer_Influence_Positive,Learning_Disabilities_Yes,Gender_Male
0,23,84,1,3,7,73,1,0,2,3,...,67,1,0,1,0,1,0,1,0,1
1,19,64,1,2,8,59,1,2,2,4,...,61,1,0,0,1,1,0,0,0,0
2,24,98,2,2,7,91,2,2,2,4,...,74,1,1,0,1,1,1,0,0,1
3,29,89,1,2,8,98,2,1,2,4,...,71,1,1,0,1,1,0,0,0,1
4,19,92,2,2,6,65,2,3,3,4,...,70,1,1,0,1,1,1,0,0,0
