In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("student_performance_datasetss.csv")

# Drop Student_ID since it's not a predictor
df.drop(columns=["Student_ID"], inplace=True)

# Encode categorical features
label_encoders = {}
categorical_columns = [
    "Gender", "Parental_Education_Level", "Extra_Curricular_Activities", "Internet_Access_at_Home", 
    "Socioeconomic_Status", "Health_Issues", "Motivation_Level", "Peer_Influence", "Time_Management_Skills",
    "Family_Support", "Stress_Level", "Learning_Style"
]

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Encode target variable
le_target = LabelEncoder()
df["Final_Performance"] = le_target.fit_transform(df["Final_Performance"])

# Split features and target
X = df.drop(columns=["Final_Performance"])
y = df["Final_Performance"]

# Add Previous GPA
df["Previous_GPA"] = np.random.uniform(2.0, 4.0, len(df))
X["Previous_GPA"] = df["Previous_GPA"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=22)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Simple Prediction Function
def predict_performance(input_data):
    input_df = pd.DataFrame([input_data], columns=X.columns)
    for col in categorical_columns:
        input_df[col] = label_encoders[col].transform(input_df[col])
    input_df = scaler.transform(input_df)
    prediction = model.predict(input_df)
    return le_target.inverse_transform(prediction)[0]

# Example Usage
example_input = {
    "Age": 20,
    "Gender": "Male",
    "Study_Hours_per_Week": 25,
    "Previous_Scores": 15,
    "Class_Participation": 90,
    "Attendance_Rate": 15,
    "Parental_Education_Level": "Higher",
    "Extra_Curricular_Activities": "Yes",
    "Internet_Access_at_Home": "Yes",
    "Socioeconomic_Status": "Medium",
    "Health_Issues": "No",
    "Motivation_Level": "High",
    "Peer_Influence": "Positive",
    "Time_Management_Skills": "Good",
    "Family_Support": "Strong",
    "Stress_Level": "Low",
    "Learning_Style": "Visual",
    "Previous_GPA": 3.5
}

predicted_performance = predict_performance(example_input)
print(f"Predicted Performance: {predicted_performance}")

Accuracy: 0.41
Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.54      0.43        35
           1       0.45      0.29      0.35        31
           2       0.48      0.38      0.43        34

    accuracy                           0.41       100
   macro avg       0.43      0.41      0.40       100
weighted avg       0.43      0.41      0.41       100



ValueError: y contains previously unseen labels: 'Higher'

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv("student_performance_datasetss.csv")

# Drop Student_ID since it's not a predictor
df.drop(columns=["Student_ID"], inplace=True)

# Encode categorical features
label_encoders = {}
categorical_columns = [
    "Gender", "Parental_Education_Level", "Extra_Curricular_Activities", "Internet_Access_at_Home", 
    "Socioeconomic_Status", "Health_Issues", "Motivation_Level", "Peer_Influence", "Time_Management_Skills",
    "Family_Support", "Stress_Level", "Learning_Style"
]

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Encode target variable
le_target = LabelEncoder()
df["Final_Performance"] = le_target.fit_transform(df["Final_Performance"])

# Add Previous GPA
df["Previous_GPA"] = np.random.uniform(2.0, 4.0, len(df))

# Split features and target
X = df.drop(columns=["Final_Performance"])
y = df["Final_Performance"]

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best model
model = grid_search.best_estimator_

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Simple Prediction Function
def predict_performance(input_data):
    input_df = pd.DataFrame([input_data], columns=X.columns)
    for col in categorical_columns:
        input_df[col] = label_encoders[col].transform(input_df[col])
    input_df = scaler.transform(input_df)
    prediction = model.predict(input_df)
    return le_target.inverse_transform(prediction)[0]

# Example Usage
example_input = {
    "Age": 20,
    "Gender": "Male",
    "Study_Hours_per_Week": 25,
    "Previous_Scores": 85,
    "Class_Participation": 90,
    "Attendance_Rate": 95,
    "Parental_Education_Level": "Higher",
    "Extra_Curricular_Activities": "Yes",
    "Internet_Access_at_Home": "Yes",
    "Socioeconomic_Status": "Medium",
    "Health_Issues": "No",
    "Motivation_Level": "High",
    "Peer_Influence": "Positive",
    "Time_Management_Skills": "Good",
    "Family_Support": "Strong",
    "Stress_Level": "Low",
    "Learning_Style": "Visual",
    "Previous_GPA": 3.5
}

predicted_performance = predict_performance(example_input)
print(f"Predicted Performance: {predicted_performance}")

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Optimized Accuracy: 0.33
Classification Report:
              precision    recall  f1-score   support

           0       0.23      0.34      0.28        29
           1       0.36      0.32      0.34        37
           2       0.45      0.33      0.38        39

    accuracy                           0.33       105
   macro avg       0.35      0.33      0.33       105
weighted avg       0.36      0.33      0.34       105



ValueError: y contains previously unseen labels: 'Higher'