In [None]:
import pandas as pd


# Load the raw dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='dataset.csv')

# Report (rows, columns) as a quick sanity check on what was loaded.
print(f"Original Data: {df.shape}")

Original Data: (4424, 35)


In [None]:


# 1. Clean the text
# Strip leading/trailing whitespace from the labels so the exact-match
# comparison and the mapping below are not defeated by stray spaces.
df['Target'] = df['Target'].str.strip()

# 2. Filter the rows
# Keep every row whose label is not 'Enrolled'.
# .copy() makes the filtered result an independent DataFrame, so the column
# assignment in step 3 does not write into a slice of the original frame
# (which triggers pandas' SettingWithCopyWarning).
df = df[df['Target'] != 'Enrolled'].copy()

# 3. Encode the target
# Dropout -> 0, Graduate -> 1.
# Series.map returns NaN for any label that is not a key of the dict.
df['Target'] = df['Target'].map({'Dropout': 0, 'Graduate': 1})

print(f"Filtered Data Shape: {df.shape}")

Filtered Data Shape: (3630, 35)


In [None]:






# 1. Approval Rates (Efficiency)
# Logic: Passed Courses / Enrolled Courses, computed per semester.
# A student who passes 1/1 (100%) is safer than one who passes 3/6 (50%).
#
# The division is exact: no epsilon is added to the denominator, so 1/1 is
# exactly 1.0 rather than ~0.999. pandas does not raise on division by zero,
# so rows with zero enrolled units are handled explicitly instead:
#   - the denominator is blanked (NaN) wherever enrolled == 0, and
#   - those rows are then assigned a rate of 0.0.
# Rows whose inputs are missing stay NaN.
for sem_prefix, sem_ordinal in (('Sem1', '1st'), ('Sem2', '2nd')):
    approved = df[f'Curricular units {sem_ordinal} sem (approved)']
    enrolled = df[f'Curricular units {sem_ordinal} sem (enrolled)']

    rate = approved / enrolled.where(enrolled != 0)
    df[f'{sem_prefix}_Approval_Rate'] = rate.mask(enrolled == 0, 0.0)




In [None]:
# 2. Grade Trend (Momentum)
# Second-semester grade minus first-semester grade, row by row.
# A positive value means the grade went up; a negative value means it went down.
first_sem_grade = df['Curricular units 1st sem (grade)']
second_sem_grade = df['Curricular units 2nd sem (grade)']
df['Grade_Trend'] = second_sem_grade.sub(first_sem_grade)


In [None]:
# 3. Overall Academic Performance
# Mean of the two semester grades, used as a single baseline per student.
grade_total = df['Curricular units 1st sem (grade)'].add(df['Curricular units 2nd sem (grade)'])
df['Overall_Grade'] = grade_total.div(2)

print("Academic features created.")

Academic features created.


In [None]:







# 4. Financial Risk Score
# Two flags are combined into one score; a higher score means more financial pressure.
#   - 'Debtor' contributes its own value.
#   - 'Tuition fees up to date' is inverted (1 - value) so that "not up to date" adds to the score.
# NOTE(review): assumes both columns are 0/1 flags, giving a score in {0, 1, 2} — confirm against the data dictionary.
tuition_unpaid = df['Tuition fees up to date'].rsub(1)
df['Financial_Risk_Score'] = df['Debtor'].add(tuition_unpaid)



In [None]:
# 5. Struggle Ratio (Effort vs. Result)
# Second-semester evaluations divided by second-semester approved units.
# A large value means many evaluations for few approvals.
# 0.001 is added to the denominator so it is never zero; as a consequence, rows
# with approved == 0 get evaluations * 1000, far above every other row.
# NOTE(review): consider whether that scale is intended before feeding this to a model.
approved_with_eps = df['Curricular units 2nd sem (approved)'].add(0.001)
df['Struggle_Ratio'] = df['Curricular units 2nd sem (evaluations)'].div(approved_with_eps)

In [None]:
# 6. Parental Support Score
# Adds the mother's and father's qualification values into a single column,
# intended as a proxy for educational support at home.
# NOTE(review): the sum is only meaningful if the qualification codes are ordinal — confirm against the data dictionary.
mother_qualification = df["Mother's qualification"]
father_qualification = df["Father's qualification"]
df['Parent_Qualification_Sum'] = mother_qualification.add(father_qualification)

print("Dictionary-based features created.")

Dictionary-based features created.


In [None]:





# Persist the engineered frame as a new CSV in the working directory.
# index=False keeps the row index out of the file, so no extra column is written.
df.to_csv(path_or_buf='processed_data.csv', index=False)

print("processed_data.csv has been created.")


processed_data.csv has been created.


In [None]:
# Print a concise summary of the frame (column names, non-null counts, dtypes).
# DataFrame.info is a method and must be called: printing the bare attribute
# only shows the bound-method repr, not the summary. info() writes its report
# to stdout itself and returns None, so it is not wrapped in print().
df.info()

<bound method DataFrame.info of       Marital status  Application mode  Application order  Course  \
0                  1                 8                  5       2   
1                  1                 6                  1      11   
2                  1                 1                  5       5   
3                  1                 8                  2      15   
4                  2                12                  1       3   
...              ...               ...                ...     ...   
4419               1                 1                  6      15   
4420               1                 1                  2      15   
4421               1                 1                  1      12   
4422               1                 1                  1       9   
4423               1                 5                  1      15   

      Daytime/evening attendance  Previous qualification  Nacionality  \
0                              1                       1          