In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler


In [2]:
# Load the student performance dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="StudentPerformanceFactors.csv")

# Preview the first rows of the raw data.
df.head()


Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(6607, 20)

In [4]:
# Count the missing values in each column.
df.isnull().sum()


Hours_Studied                  0
Attendance                     0
Parental_Involvement           0
Access_to_Resources            0
Extracurricular_Activities     0
Sleep_Hours                    0
Previous_Scores                0
Motivation_Level               0
Internet_Access                0
Tutoring_Sessions              0
Family_Income                  0
Teacher_Quality               78
School_Type                    0
Peer_Influence                 0
Physical_Activity              0
Learning_Disabilities          0
Parental_Education_Level      90
Distance_from_Home            67
Gender                         0
Exam_Score                     0
dtype: int64

In [5]:
# Impute missing categorical values with each column's most frequent value (mode).
# Only a small share of the rows has gaps, so mode imputation keeps every row
# without introducing significant bias.
categorical_cols = [
    'Access_to_Resources',
    'Parental_Involvement',
    'Extracurricular_Activities',
    'Motivation_Level',
    'Internet_Access',
    'Family_Income',
    'Teacher_Quality',
    'School_Type',
    'Peer_Influence',
    'Learning_Disabilities',
    'Parental_Education_Level',
    'Distance_from_Home',
    'Gender',
]

# Fill all listed columns in one call, using a per-column fill value.
df[categorical_cols] = df[categorical_cols].fillna(
    {col: df[col].mode()[0] for col in categorical_cols}
)


In [6]:
# Ordinal encoding for ordered categorical features (Low < Medium < High).
ordinal_map = {
    'Low': 0,
    'Medium': 1,
    'High': 2
}

ordinal_cols = [
    'Access_to_Resources',
    'Parental_Involvement',
    'Motivation_Level',
    'Family_Income',
    'Teacher_Quality'
]

# Series.map turns every value that is missing from the mapping into NaN without
# any warning -- including the already-encoded 0/1/2 values if this cell is run a
# second time. Validate all columns first and only then write them back, so a
# failure never leaves `df` half-encoded.
encoded_ordinal = {}
for col in ordinal_cols:
    encoded = df[col].map(ordinal_map)
    # Values that were present before mapping but are NaN afterwards were not covered.
    unmapped = df.loc[encoded.isna() & df[col].notna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column '{col}' contains values not covered by ordinal_map: {list(unmapped)}. "
            "If the column is already encoded, reload the data before re-running this cell."
        )
    encoded_ordinal[col] = encoded

for col, encoded in encoded_ordinal.items():
    df[col] = encoded


In [7]:
# Binary encoding for two-valued categorical features (Yes/No and Male/Female).
binary_map = {
    'Yes': 1,
    'No': 0,
    'Male': 1,
    'Female': 0
}

binary_cols = [
    'Extracurricular_Activities',
    'Internet_Access',
    'Learning_Disabilities',
    'Gender'
]

# Series.map turns every value that is missing from the mapping into NaN without
# any warning -- including the already-encoded 0/1 values if this cell is run a
# second time. Validate all columns first and only then write them back, so a
# failure never leaves `df` half-encoded.
encoded_binary = {}
for col in binary_cols:
    encoded = df[col].map(binary_map)
    # Values that were present before mapping but are NaN afterwards were not covered.
    unmapped = df.loc[encoded.isna() & df[col].notna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column '{col}' contains values not covered by binary_map: {list(unmapped)}. "
            "If the column is already encoded, reload the data before re-running this cell."
        )
    encoded_binary[col] = encoded

for col, encoded in encoded_binary.items():
    df[col] = encoded


In [8]:
# One-hot encoding for the remaining categorical features.
nominal_cols = ['School_Type', 'Peer_Influence', 'Parental_Education_Level', 'Distance_from_Home']

# drop_first=True omits the first category of each feature, so k categories
# become k-1 indicator columns.
df = pd.get_dummies(df, columns=nominal_cols, drop_first=True)


In [9]:
# Preview the frame after the encoding steps above.
df.head()


Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,...,Learning_Disabilities,Gender,Exam_Score,School_Type_Public,Peer_Influence_Neutral,Peer_Influence_Positive,Parental_Education_Level_High School,Parental_Education_Level_Postgraduate,Distance_from_Home_Moderate,Distance_from_Home_Near
0,23,84,0,2,0,7,73,0,1,0,...,0,1,67,True,False,True,True,False,False,True
1,19,64,0,1,0,8,59,0,1,2,...,0,0,61,True,False,False,False,False,True,False
2,24,98,1,1,1,7,91,1,1,2,...,0,1,74,True,True,False,False,True,False,True
3,29,89,0,1,1,8,98,1,1,1,...,0,1,71,True,False,False,True,False,True,False
4,19,92,1,1,1,6,65,1,1,3,...,0,0,70,True,True,False,False,False,False,True


In [10]:
# Separate the feature matrix (X) from the prediction target (y).
X = df.drop('Exam_Score', axis=1)
y = df['Exam_Score']


In [11]:
# Collect the names of the int64/float64 feature columns; these are the ones
# that get standardized. The boolean dummy columns are not selected.
numeric_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
print("Numeric columns to be scaled:", numeric_cols, sep="\n")
print(f"\nTotal numeric features: {len(numeric_cols)}")

Numeric columns to be scaled:
['Hours_Studied', 'Attendance', 'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours', 'Previous_Scores', 'Motivation_Level', 'Internet_Access', 'Tutoring_Sessions', 'Family_Income', 'Teacher_Quality', 'Physical_Activity', 'Learning_Disabilities', 'Gender']

Total numeric features: 15


In [12]:
# Standardize the numeric features with StandardScaler.
# NOTE(review): the scaler is fitted on the full feature matrix here. To avoid data
# leakage it should be fitted on the training split only (i.e. after train_test_split)
# and then applied to the test split with transform().
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X[numeric_cols])
X[numeric_cols] = scaled_values

# Sanity-check the result: preview plus per-column mean and standard deviation.
scaled_view = X[numeric_cols]
print("Scaling applied to numeric features.")
print("\nScaled data preview:")
print(scaled_view.head())
print(f"\nMean of scaled features (should be ~0): \n{scaled_view.mean()}")
print(f"\nStd of scaled features (should be ~1): \n{scaled_view.std()}")

Scaling applied to numeric features.

Scaled data preview:
   Hours_Studied  Attendance  Parental_Involvement  Access_to_Resources  \
0       0.504942    0.348375             -1.562146             1.288574   
1      -0.162822   -1.383736             -1.562146            -0.143488   
2       0.671882    1.560853             -0.124267            -0.143488   
3       1.506587    0.781403             -1.562146            -0.143488   
4      -0.162822    1.041220             -0.124267            -0.143488   

   Extracurricular_Activities  Sleep_Hours  Previous_Scores  Motivation_Level  \
0                   -1.214685    -0.019796        -0.143800         -1.302866   
1                   -1.214685     0.661399        -1.116110         -1.302866   
2                    0.823259    -0.019796         1.106313          0.134442   
3                    0.823259     0.661399         1.592469          0.134442   
4                    0.823259    -0.700990        -0.699406          0.134442   

   