In [1]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler

# load the original student dataset
# - plain assignment: the earlier `original_df = pd.df = ...` chain also attached
#   the frame to the pandas module as an attribute named `df`
# - forward slashes keep this relative path valid on Windows, Linux and macOS
# - the space before ".csv" is part of the file name as referenced by this notebook
original_df = pd.read_csv('../data/Student_performance_data .csv')

# IQR-based outlier filter applied to one column at a time
def remove_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of ``df[column]``. Values exactly on a fence
    are kept. Rows where the value is NaN fail both comparisons and are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column used to compute the fences.

    Returns
    -------
    pandas.DataFrame
        The retained rows, with their original index labels.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread

    # between() is inclusive on both ends by default
    inside_fences = values.between(fence_low, fence_high)
    return df[inside_fences]

# columns to clean for outliers
columns_to_clean = ['StudyTimeWeekly', 'Absences', 'GPA']

# filter sequentially: each column's IQR fences are computed on the rows that
# survived the previous column's filter
df_filtered = original_df.copy()
for col in columns_to_clean:
    df_filtered = remove_outliers_iqr(df_filtered, col)

# drop StudentID and separate target
# NOTE(review): GPA stays in the feature matrix; if GradeClass is derived from
# GPA this leaks the target into the features - confirm with the dataset docs.
X = df_filtered.drop(columns=['StudentID', 'GradeClass'])
y = df_filtered['GradeClass']

# scale the features using StandardScaler
# NOTE(review): the scaler is fit on every remaining row; if a train/test split
# happens downstream, the test rows have already influenced the scaling - confirm.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# add target variable back
# (.values assigns by position: df_scaled has a fresh 0..n-1 index, while y
# still carries the index labels of df_filtered)
df_scaled['GradeClass'] = y.values

# save the cleaned and scaled dataset to CSV
# (forward slashes keep the relative path valid on Windows, Linux and macOS)
filtered_csv_path = "../data/BrightPath_Filtered_Scaled_Dataset.csv"
df_scaled.to_csv(filtered_csv_path, index=False)

# show summary of changes
print("Data processing complete.")
print(f"Original dataset size: {original_df.shape}")
print(f"Filtered dataset size (outliers removed): {df_filtered.shape}")
print(f"Saved to: {filtered_csv_path}")

# save the fitted scaler to a file (presumably so it can be reloaded later)
joblib.dump(scaler, "../Source/scaler.pkl")

# show first few rows of final dataset
df_scaled.head()


Data processing complete.
Original dataset size: (2392, 15)
Filtered dataset size (outliers removed): (2392, 15)
Saved to: ..\data\BrightPath_Filtered_Scaled_Dataset.csv


Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,0.472919,0.978492,-0.853391,0.253711,1.780336,-0.890822,1.522371,-0.108744,-0.788476,-0.660132,2.019544,-0.431866,1.118086,2.0
1,1.362944,-1.021981,-0.853391,-0.746087,0.997376,-1.717694,-0.65687,-0.999551,-0.788476,-0.660132,-0.495161,-0.431866,1.242374,1.0
2,-1.307132,-1.021981,1.091641,1.253509,-0.984045,1.353542,-0.65687,-0.108744,-0.788476,-0.660132,-0.495161,-0.431866,-1.960277,4.0
3,0.472919,0.978492,-0.853391,1.253509,0.045445,-0.063951,-0.65687,0.782063,1.268269,-0.660132,-0.495161,-0.431866,0.16179,3.0
4,0.472919,0.978492,-0.853391,0.253711,-0.902311,0.290422,1.522371,0.782063,-0.788476,-0.660132,-0.495161,-0.431866,-0.675573,4.0
