In [30]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from google.colab import files

# Preprocessing cell: load the cleaned student dataset, one-hot encode the
# categorical features, standardize the continuous features, and export the
# result as a CSV that is downloaded through the Colab browser session.

# --- Load ------------------------------------------------------------------
data = pd.read_csv('https://raw.githubusercontent.com/Salina-Huang/AI-in-ED-Multi-dimensional-Student-analytics-for-dropout-risk-prediction/refs/heads/main/cleaned_data.csv')

# Work on a copy so the raw frame `data` stays untouched for later cells.
data_encoded = data.copy()
# The identifier column is removed before encoding/scaling.
data_encoded = data_encoded.drop('student_id', axis=1)

# --- One-hot encode categorical features -------------------------------------
categorical_columns = [
    'socioeconomic_status', 'parental_education_level', 'family_background',
    'first_generation_student', 'previous_academic_performance',
    'scholarship_status', 'learning_style', 'stress_level', 'gender'
]

# Report expected columns that are absent instead of dropping them silently;
# a renamed or misspelled column would otherwise go un-encoded without notice.
missing_categorical = [col for col in categorical_columns if col not in data_encoded.columns]
if missing_categorical:
    print(f"Expected categorical columns not found (skipped): {missing_categorical}")

# Keep only the columns that actually exist in the loaded frame.
categorical_columns = [col for col in categorical_columns if col in data_encoded.columns]

print("Categorical columns for one-hot encoding:")
print(categorical_columns)

# drop_first=True keeps k-1 indicator columns per feature (the first level is
# dropped and becomes the implicit reference level).
data_final = pd.get_dummies(data_encoded, columns=categorical_columns, drop_first=True)

print(f"Data shape before encoding: {data_encoded.shape}")
print(f"Data shape after encoding: {data_final.shape}")

# Fallback: encode any object-dtype column that was not in the explicit list.
# NOTE(review): if the prediction target is stored as text it would also be
# one-hot encoded here - confirm against the dataset schema.
remaining_non_numeric = data_final.select_dtypes(include=['object']).columns
if len(remaining_non_numeric) > 0:
    print(f"Additional non-numeric columns found: {list(remaining_non_numeric)}")
    data_final = pd.get_dummies(data_final, columns=remaining_non_numeric, drop_first=True)

# --- Standardize continuous features -----------------------------------------
numeric_columns_to_scale = [
    'age', 'gpa', 'subject_failure_count', 'credits_completed',
    'course_load', 'attendance_rate', 'class_participation_score',
    'assignment_submission_rate', 'quiz_completion_rate',
    'time_spent_on_learning_platform', 'login_frequency',
    'engagement_score', 'motivation_level', 'peer_interaction_score',
    'counseling_sessions_attended'
]

# Same explicit reporting as for the categorical list above.
missing_numeric = [col for col in numeric_columns_to_scale if col not in data_final.columns]
if missing_numeric:
    print(f"Expected numeric columns not found (skipped): {missing_numeric}")

numeric_columns_to_scale = [col for col in numeric_columns_to_scale if col in data_final.columns]

print(f"Numeric columns to scale: {len(numeric_columns_to_scale)}")

# NOTE(review): the scaler is fit on every row of the dataset. If the data is
# split into train/test later, confirm that fitting on all rows is intended.
scaler = StandardScaler()
if numeric_columns_to_scale:
    data_final[numeric_columns_to_scale] = scaler.fit_transform(data_final[numeric_columns_to_scale])
else:
    # Fitting StandardScaler on a frame with zero columns raises, so skip the
    # step when none of the expected numeric columns are present.
    print("No numeric columns available to scale; skipping standardization")

print("Processed data sample:")
print(data_final.head())

print(f"Final data shape: {data_final.shape}")

# --- Export -------------------------------------------------------------------
output_filename = 'encoded_data_onehot.csv'
# NOTE(review): index=True writes the row index as an unnamed first column in
# the CSV; confirm that downstream readers expect it.
data_final.to_csv(output_filename, index=True)
# Triggers a browser download of the file from the Colab runtime.
files.download(output_filename)

print(f"File '{output_filename}' downloaded successfully")

Categorical columns for one-hot encoding:
['socioeconomic_status', 'parental_education_level', 'family_background', 'first_generation_student', 'previous_academic_performance', 'scholarship_status', 'learning_style', 'stress_level', 'gender']
Data shape before encoding: (4481, 25)
Data shape after encoding: (4481, 35)
Numeric columns to scale: 15
Processed data sample:
        age       gpa  subject_failure_count  credits_completed  course_load  \
0  0.017472 -0.493167              -0.870832          -0.031773    -0.715848   
1  1.583324 -1.330171               1.459187          -1.133052    -0.009614   
2 -0.765454  1.551317              -0.870832           0.105887    -1.422082   
3 -0.373991  0.453608              -0.870832           1.069506     1.402854   
4  1.191861 -0.753873               0.294177          -0.444753    -1.422082   

   attendance_rate  class_participation_score  assignment_submission_rate  \
0         1.401647                  -1.257148                    1.588

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

File 'encoded_data_onehot.csv' downloaded successfully
