In [1]:
import pandas as pd

In [4]:
file_path = '../data/processed/all_schools_cleaned.csv'
df = pd.read_csv(file_path)

In [5]:
df

Unnamed: 0,student_id,full_name,gender,math_score,english_score,science_score,attendance_rate,term
0,5506-350,Mary Smith,Male,92.0,14.0,71.0,74.98,2023_T1
1,2424-704,Mary Moore,Female,82.0,86.0,74.0,83.95,2023_T1
2,4582-338,James Johnson,Male,99.0,23.0,2.0,78.37,2024_T1
3,7873-325,Patricia Moore,Female,1.0,87.0,29.0,60.82,2024_T1
4,3615-814,Robert Smith,Female,63.0,59.0,20.0,68.49,2023_T2
...,...,...,...,...,...,...,...,...
595,5093-196,William Moore,Female,18.0,27.0,39.0,74.76,2023_T1
596,1195-163,John Johnson,Male,96.0,75.0,14.0,90.05,2023_T1
597,7098-802,Patricia Miller,Male,1.0,62.0,28.0,89.99,2024_T1
598,2228-122,Elizabeth Brown,Male,65.0,50.0,74.0,87.15,2023_T1


In [6]:
# Feature Engineering

#GPA: Mean of all grades
df['gpa'] = df[['math_score', 'english_score', 'science_score']].mean(axis=1)

In [7]:
df

Unnamed: 0,student_id,full_name,gender,math_score,english_score,science_score,attendance_rate,term,gpa
0,5506-350,Mary Smith,Male,92.0,14.0,71.0,74.98,2023_T1,59.000000
1,2424-704,Mary Moore,Female,82.0,86.0,74.0,83.95,2023_T1,80.666667
2,4582-338,James Johnson,Male,99.0,23.0,2.0,78.37,2024_T1,41.333333
3,7873-325,Patricia Moore,Female,1.0,87.0,29.0,60.82,2024_T1,39.000000
4,3615-814,Robert Smith,Female,63.0,59.0,20.0,68.49,2023_T2,47.333333
...,...,...,...,...,...,...,...,...,...
595,5093-196,William Moore,Female,18.0,27.0,39.0,74.76,2023_T1,28.000000
596,1195-163,John Johnson,Male,96.0,75.0,14.0,90.05,2023_T1,61.666667
597,7098-802,Patricia Miller,Male,1.0,62.0,28.0,89.99,2024_T1,30.333333
598,2228-122,Elizabeth Brown,Male,65.0,50.0,74.0,87.15,2023_T1,63.000000


In [8]:
## Risk score 
df['risk_score'] = (df['gpa'] < 40).astype(int) + (df['attendance_rate'] < 75).astype(int) * 50

In [9]:
df

Unnamed: 0,student_id,full_name,gender,math_score,english_score,science_score,attendance_rate,term,gpa,risk_score
0,5506-350,Mary Smith,Male,92.0,14.0,71.0,74.98,2023_T1,59.000000,50
1,2424-704,Mary Moore,Female,82.0,86.0,74.0,83.95,2023_T1,80.666667,0
2,4582-338,James Johnson,Male,99.0,23.0,2.0,78.37,2024_T1,41.333333,0
3,7873-325,Patricia Moore,Female,1.0,87.0,29.0,60.82,2024_T1,39.000000,51
4,3615-814,Robert Smith,Female,63.0,59.0,20.0,68.49,2023_T2,47.333333,50
...,...,...,...,...,...,...,...,...,...,...
595,5093-196,William Moore,Female,18.0,27.0,39.0,74.76,2023_T1,28.000000,51
596,1195-163,John Johnson,Male,96.0,75.0,14.0,90.05,2023_T1,61.666667,0
597,7098-802,Patricia Miller,Male,1.0,62.0,28.0,89.99,2024_T1,30.333333,1
598,2228-122,Elizabeth Brown,Male,65.0,50.0,74.0,87.15,2023_T1,63.000000,0


In [10]:
# Attendance level greater than or equals 90: engagement level is high
# Attendance level between 75 and 90: engagement level is medium
# Attendance level less than 75: engagement level is low
def engagement_level(attendance_rate):
    if attendance_rate >= 90:
        return 'High'
    elif attendance_rate >= 75:
        return 'Medium'
    else:
        return 'Low'

In [11]:
df['engagement_level'] = df['attendance_rate'].apply(engagement_level)

In [12]:
df

Unnamed: 0,student_id,full_name,gender,math_score,english_score,science_score,attendance_rate,term,gpa,risk_score,engagement_level
0,5506-350,Mary Smith,Male,92.0,14.0,71.0,74.98,2023_T1,59.000000,50,Low
1,2424-704,Mary Moore,Female,82.0,86.0,74.0,83.95,2023_T1,80.666667,0,Medium
2,4582-338,James Johnson,Male,99.0,23.0,2.0,78.37,2024_T1,41.333333,0,Medium
3,7873-325,Patricia Moore,Female,1.0,87.0,29.0,60.82,2024_T1,39.000000,51,Low
4,3615-814,Robert Smith,Female,63.0,59.0,20.0,68.49,2023_T2,47.333333,50,Low
...,...,...,...,...,...,...,...,...,...,...,...
595,5093-196,William Moore,Female,18.0,27.0,39.0,74.76,2023_T1,28.000000,51,Low
596,1195-163,John Johnson,Male,96.0,75.0,14.0,90.05,2023_T1,61.666667,0,High
597,7098-802,Patricia Miller,Male,1.0,62.0,28.0,89.99,2024_T1,30.333333,1,Medium
598,2228-122,Elizabeth Brown,Male,65.0,50.0,74.0,87.15,2023_T1,63.000000,0,Medium


In [14]:
df

Unnamed: 0,student_id,full_name,gender,math_score,english_score,science_score,attendance_rate,term,gpa,risk_score,engagement_level
0,5506-350,Mary Smith,Male,92.0,14.0,71.0,74.98,2023_T1,59.000000,50,Low
1,2424-704,Mary Moore,Female,82.0,86.0,74.0,83.95,2023_T1,80.666667,0,Medium
2,4582-338,James Johnson,Male,99.0,23.0,2.0,78.37,2024_T1,41.333333,0,Medium
3,7873-325,Patricia Moore,Female,1.0,87.0,29.0,60.82,2024_T1,39.000000,51,Low
4,3615-814,Robert Smith,Female,63.0,59.0,20.0,68.49,2023_T2,47.333333,50,Low
...,...,...,...,...,...,...,...,...,...,...,...
595,5093-196,William Moore,Female,18.0,27.0,39.0,74.76,2023_T1,28.000000,51,Low
596,1195-163,John Johnson,Male,96.0,75.0,14.0,90.05,2023_T1,61.666667,0,High
597,7098-802,Patricia Miller,Male,1.0,62.0,28.0,89.99,2024_T1,30.333333,1,Medium
598,2228-122,Elizabeth Brown,Male,65.0,50.0,74.0,87.15,2023_T1,63.000000,0,Medium


In [15]:
df.to_csv('../data/processed/engineered_data.csv', index=False)
print("Feature engineering completed and saved to '../data/processed/engineered_data.csv'.")

Feature engineering completed and saved to '../data/processed/engineered_data.csv'.


In [16]:
df.columns

Index(['student_id', 'full_name', 'gender', 'math_score', 'english_score',
       'science_score', 'attendance_rate', 'term', 'gpa', 'risk_score',
       'engagement_level'],
      dtype='object')