In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
# Load the preprocessed student dataset (path is relative to the working directory).
df = pd.read_csv('preprocessed_student_dataset.csv')

In [3]:
# Show the loaded frame as this cell's output.
df

Unnamed: 0,Age,Gender,Study_Hours_per_Week,Online_Courses_Completed,Participation_in_Discussions,Assignment_Completion_Rate (%),Exam_Score (%),Attendance_Rate (%),Use_of_Educational_Tech,Self_Reported_Stress_Level,Time_Spent_on_Social_Media (hours/week),Sleep_Hours_per_Night,Final_Grade
0,-1.582639,0.0,1.605127,0.650559,1,1.708927,-0.067365,-0.616001,1,2,-0.657989,0.511151,2
1,1.594887,0.0,0.220714,1.628328,0,-0.267263,-1.710558,-1.226232,1,1,1.447930,0.511151,3
2,-1.004907,0.0,1.528215,0.161674,0,-1.016852,-1.540573,0.265444,1,0,-0.214638,0.010366,3
3,-0.138309,0.0,-1.086788,-1.630902,1,-0.812418,-0.010703,-1.022822,1,0,1.004579,1.512721,1
4,-1.293773,0.0,-0.240757,1.465366,1,-1.084996,-0.407336,1.214692,1,1,1.226255,0.511151,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-1.004907,1.0,0.220714,-0.653133,1,-0.880563,-0.690645,0.062033,1,1,0.228714,-0.490419,2
9996,-0.138309,0.0,-0.856052,-0.327210,1,-1.425719,0.782562,0.740068,1,1,-0.990503,-0.991204,1
9997,0.728289,1.0,-0.317669,-1.142018,1,-1.425719,-1.710558,-0.344787,0,1,0.561228,0.511151,3
9998,-1.582639,1.0,1.066744,-0.490172,1,-0.607985,-1.427249,1.011282,1,0,-0.990503,0.511151,3


In [4]:
# Pull out the target column, then keep every other column as a feature.
y = df['Final_Grade']
X = df.drop(columns='Final_Grade')

In [5]:
# Impute missing feature values with the median of their own column.
column_medians = X.median()
X = X.fillna(column_medians)


In [6]:
# chi2 rejects negative inputs, so rescale every feature with MinMaxScaler first.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split that happens further down.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

In [7]:
# Feature selection: keep the k features with the highest chi2 score against y.
k = 5  # Number of top features to select
# Pass `k` through instead of repeating the literal 5, so that editing the
# value above actually changes how many features are kept.
selector = SelectKBest(score_func=chi2, k=k)
X_selected = selector.fit_transform(X_scaled, y)


In [8]:
# Map the selector's boolean support mask back onto the feature column names.
support_mask = selector.get_support()
selected_features = X.columns[support_mask]
print("Selected Features:", list(selected_features))

Selected Features: ['Online_Courses_Completed', 'Assignment_Completion_Rate (%)', 'Exam_Score (%)', 'Self_Reported_Stress_Level', 'Sleep_Hours_per_Night']


In [9]:
# Per-feature chi2 statistic and p-value, as computed by the fitted selector.
chi_scores = selector.scores_
p_values = selector.pvalues_

# Tabulate them by feature name, highest chi2 score first.
chi_df = pd.DataFrame(
    {'Feature': X.columns, 'Chi2 Score': chi_scores, 'p-value': p_values}
).sort_values(by='Chi2 Score', ascending=False)

# Print results
print(chi_df)


                                    Feature   Chi2 Score   p-value
6                            Exam_Score (%)  1611.395861  0.000000
9                Self_Reported_Stress_Level     1.269919  0.736288
11                    Sleep_Hours_per_Night     1.108129  0.775112
3                  Online_Courses_Completed     0.687914  0.876043
5            Assignment_Completion_Rate (%)     0.569463  0.903386
8                   Use_of_Educational_Tech     0.464115  0.926701
10  Time_Spent_on_Social_Media (hours/week)     0.393709  0.941539
7                       Attendance_Rate (%)     0.235070  0.971739
4              Participation_in_Discussions     0.227019  0.973114
1                                    Gender     0.158451  0.984000
0                                       Age     0.064424  0.995734
2                      Study_Hours_per_Week     0.058528  0.996300


In [10]:
# train/test split
# Split the unscaled features first, then fit the scaler and the chi2 selector
# on the training rows only. The exploratory `scaler` / `selector` fitted earlier
# in the notebook saw every row, so reusing their output here would leak
# test-set information into preprocessing and feature choice.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling parameters come from the training rows and are then applied to both parts.
train_scaler = MinMaxScaler().fit(X_train_raw)
X_train_scaled = train_scaler.transform(X_train_raw)
X_test_scaled = train_scaler.transform(X_test_raw)

# Feature ranking also uses training rows only; `k` is the value set in the
# feature-selection cell. The chosen columns may differ from the list printed
# for the full-data selector above.
train_selector = SelectKBest(score_func=chi2, k=k).fit(X_train_scaled, y_train)
X_train = train_selector.transform(X_train_scaled)
X_test = train_selector.transform(X_test_scaled)

In [11]:
# Define models
# RandomForestClassifier and DecisionTreeClassifier are randomized estimators;
# seed them with the same value the train/test split uses so that re-running
# the notebook reproduces the reported scores.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC()
}

In [12]:
# Fit every candidate on the training split and report its accuracy on the
# held-out test split, one line per model.
for name, model in models.items():
    # fit() returns the fitted estimator, so prediction can be chained onto it.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f"{name} Accuracy: {acc:.2f}")

Random Forest Accuracy: 1.00
Logistic Regression Accuracy: 1.00
Decision Tree Accuracy: 1.00
K-Nearest Neighbors Accuracy: 0.90
Support Vector Machine Accuracy: 0.98
