In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import (RandomForestClassifier, BaggingClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier, VotingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Load and Preprocess Data
data = pd.read_csv('StudentsPerformance_with_headers.csv')
# Integer-encode every text column. fit_transform refits the encoder on each
# call, so a single LabelEncoder instance can be reused across columns (the
# per-column mappings are not kept).
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

# Define target and features
# Binary target: 1 when GRADE is above 2, otherwise 0.
X = data.drop(columns=['GRADE'])
y = (data['GRADE'] > 2).astype(int)

# Split data into training and testing sets
# The split happens BEFORE any resampling so the test set holds only real,
# unseen rows; stratify keeps the class ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Balance the training set with SMOTE
# Oversampling only the training portion keeps synthetic samples interpolated
# from test rows out of training, which would otherwise inflate the scores.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

# Scale numerical features
# The scaler is fitted on the training data only and reused for the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Evaluation Function
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` computed from the
        model's predictions on ``X_test``.
    """
    predictions = model.predict(X_test)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

# Models and Ensembles

# Support Vector Machine (SVM)
# `gamma` only applies to the 'rbf', 'poly' and 'sigmoid' kernels, so it is
# not passed for the linear kernel. probability=True enables predict_proba,
# which the soft-voting ensemble below needs.
svm = SVC(kernel='linear', C=0.5, probability=True)
svm.fit(X_train, y_train)
svm_metrics = evaluate_model(svm, X_test, y_test)

# Random Forest
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf.fit(X_train, y_train)
rf_metrics = evaluate_model(rf, X_test, y_test)

# XGBoost
xgb = XGBClassifier(n_estimators=150, max_depth=8, learning_rate=0.03, use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
xgb_metrics = evaluate_model(xgb, X_test, y_test)

# Bagging with RandomForest as base estimator
bagging_rf = BaggingClassifier(estimator=RandomForestClassifier(), n_estimators=10, random_state=42)
bagging_rf.fit(X_train, y_train)
bagging_rf_metrics = evaluate_model(bagging_rf, X_test, y_test)

# Bagging with SVM as base estimator
# Same linear SVC configuration as the standalone SVM above.
bagging_svm = BaggingClassifier(estimator=SVC(kernel='linear', C=0.5, probability=True), n_estimators=10, random_state=42)
bagging_svm.fit(X_train, y_train)
bagging_svm_metrics = evaluate_model(bagging_svm, X_test, y_test)


# Extra Trees (Another bagging method)
extra_trees = ExtraTreesClassifier(n_estimators=100, max_depth=10, random_state=42)
extra_trees.fit(X_train, y_train)
extra_trees_metrics = evaluate_model(extra_trees, X_test, y_test)

# AdaBoost with Decision Tree as base estimator
# Leaving `estimator` unset uses scikit-learn's default weak learner, a
# depth-1 decision tree (stump), which is what this heading describes.
ada_boost = AdaBoostClassifier(n_estimators=50, learning_rate=0.5, random_state=42)
ada_boost.fit(X_train, y_train)
ada_boost_metrics = evaluate_model(ada_boost, X_test, y_test)


# Gradient Boosting Classifier
gradient_boosting = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gradient_boosting.fit(X_train, y_train)
gradient_boosting_metrics = evaluate_model(gradient_boosting, X_test, y_test)

# Hybrid Voting Ensemble (Soft Voting)
# Members are passed as (name, estimator) pairs; voting='soft' combines their
# predicted class probabilities rather than their hard labels.
voting_members = [
    ('SVM', svm),
    ('RandomForest', rf),
    ('XGBoost', xgb),
    ('Bagging_RF', bagging_rf),
    ('Bagging_SVM', bagging_svm),
    ('AdaBoost', ada_boost),
    ('GradientBoosting', gradient_boosting),
]
voting_ensemble = VotingClassifier(estimators=voting_members, voting='soft').fit(X_train, y_train)
voting_ensemble_metrics = evaluate_model(voting_ensemble, X_test, y_test)

# Stacking Ensemble with Logistic Regression as final estimator
# Three tree-based learners feed a logistic-regression meta-model.
stacking_members = [
    ('RandomForest', rf),
    ('XGBoost', xgb),
    ('GradientBoosting', gradient_boosting),
]
stacking_ensemble = StackingClassifier(
    estimators=stacking_members,
    final_estimator=LogisticRegression(),
).fit(X_train, y_train)
stacking_ensemble_metrics = evaluate_model(stacking_ensemble, X_test, y_test)

# Display Model Comparison
# Each display label is paired with the (accuracy, precision, recall, f1)
# tuple returned by evaluate_model for that model.
comparison = [
    ("SVM", svm_metrics),
    ("Random Forest", rf_metrics),
    ("XGBoost", xgb_metrics),
    ("Bagging (Random Forest)", bagging_rf_metrics),
    ("Bagging (SVM)", bagging_svm_metrics),
    ("Extra Trees", extra_trees_metrics),
    ("AdaBoost", ada_boost_metrics),
    ("Gradient Boosting", gradient_boosting_metrics),
    ("Hybrid Voting", voting_ensemble_metrics),
    ("Stacking Ensemble", stacking_ensemble_metrics),
]
model_names = [label for label, _ in comparison]
model_metrics = [scores for _, scores in comparison]

print("\nModel Comparison:\n")
for name, (accuracy, precision, recall, f1) in zip(model_names, model_metrics):
    # Scores are fractions in [0, 1]; show them as percentages.
    print(f"{name} - Accuracy: {accuracy*100:.2f}%, Precision: {precision*100:.2f}%, Recall: {recall*100:.2f}%, F1 Score: {f1*100:.2f}%")

# Function for collecting user input and making predictions
# Feature name -> prompt shown to the user. Insertion order is the order in
# which the questions are asked; the keys must match the training columns.
_CGPA_PROMPTS = {
    'Student Age': "Student Age (1: 18-21, 2: 22-25, 3: above 26): ",
    'Sex': "Sex (1: female, 2: male): ",
    'High School Type': "High School Type (1: private, 2: state, 3: other): ",
    'Scholarship': "Scholarship (1: None, 2: 25%, 3: 50%, 4: 75%, 5: Full): ",
    'Additional Work': "Additional Work (1: Yes, 2: No): ",
    'Artistic/Sports Activity': "Artistic/Sports Activity (1: Yes, 2: No): ",
    'Partner': "Partner (1: Yes, 2: No): ",
    'Total Salary': "Total Salary (1: 135-200, 2: 201-270, 3: 271-340, 4: 341-410, 5: above 410): ",
    'Transportation': "Transportation (1: Bus, 2: Car/Taxi, 3: Bicycle, 4: Other): ",
    'Accommodation': "Accommodation (1: Rental, 2: Dormitory, 3: Family, 4: Other): ",
    'Mother’s Education': "Mother's Education (1: primary, 2: secondary, 3: high school, 4: university, 5: MSc, 6: PhD): ",
    'Father’s Education': "Father's Education (1: primary, 2: secondary, 3: high school, 4: university, 5: MSc, 6: PhD): ",
    'Siblings': "Siblings (1: 1, 2: 2, 3: 3, 4: 4, 5: 5+): ",
    'Parental Status': "Parental Status (1: married, 2: divorced, 3: died): ",
    'Mother’s Occupation': "Mother's Occupation (1: retired, 2: housewife, 3: gov. officer, 4: private, 5: self-employed, 6: other): ",
    'Father’s Occupation': "Father's Occupation (1: retired, 2: gov. officer, 3: private, 4: self-employed, 5: other): ",
    'Weekly Study Hours': "Weekly Study Hours (1: None, 2: <5, 3: 6-10, 4: 11-20, 5: >20): ",
    'Non-Scientific Reading': "Non-Scientific Reading (1: None, 2: Sometimes, 3: Often): ",
    'Scientific Reading': "Scientific Reading (1: None, 2: Sometimes, 3: Often): ",
    'Seminar Attendance': "Seminar Attendance (1: Yes, 2: No): ",
    'Impact of Projects': "Impact of Projects (1: positive, 2: negative, 3: neutral): ",
    'Class Attendance': "Class Attendance (1: always, 2: sometimes, 3: never): ",
    'Midterm 1 Prep': "Midterm 1 Prep (1: alone, 2: friends, 3: n/a): ",
    'Midterm 2 Prep': "Midterm 2 Prep (1: close to exam, 2: regular, 3: never): ",
    'Taking Notes': "Taking Notes (1: never, 2: sometimes, 3: always): ",
    'Listening in Class': "Listening in Class (1: never, 2: sometimes, 3: always): ",
    'Discussion Improves Success': "Discussion Improves Success (1: never, 2: sometimes, 3: always): ",
    'Flip-Classroom': "Flip-Classroom (1: not useful, 2: useful, 3: n/a): ",
}


def _read_int(prompt):
    """Ask ``prompt`` repeatedly until the user types a valid integer."""
    while True:
        try:
            return int(input(prompt))
        except ValueError:
            print("Invalid input: please enter a whole number.")


def user_predict_cgpa(model, scaler):
    """Interactively collect one student's answers and print Pass or Fail.

    Args:
        model: Fitted classifier exposing ``predict``; it is the model that
            produces the prediction.
        scaler: Fitted transformer exposing ``transform``, applied to the
            answers before prediction.

    Returns:
        None. The outcome is printed as "Prediction: Pass" when the model
        predicts class 1 and "Prediction: Fail" otherwise.

    The answers are laid out according to the module-level training frame
    ``X`` (its ``columns``), so ``X`` must exist when this is called.
    """
    print("\nPlease enter the following details to predict your CGPA:")

    # Collecting user data (keys must match feature names in training data)
    user_data = {feature: _read_int(prompt) for feature, prompt in _CGPA_PROMPTS.items()}

    # One-row frame in exactly the training column order. Training columns
    # that were not asked about are filled with 0, and keys that are not
    # training columns are dropped.
    # NOTE(review): confirm that 0 is a sensible default for the unasked columns.
    user_input_df = pd.DataFrame([user_data]).reindex(columns=X.columns, fill_value=0)

    # Scale input
    user_input_scaled = scaler.transform(user_input_df)

    # Predict with the model that was passed in (not a module-level global)
    prediction = model.predict(user_input_scaled)
    # predict returns one label per row; there is exactly one row here.
    if prediction[0] == 1:
        print("\nPrediction: Pass")
    else:
        print("\nPrediction: Fail")

        
# Run the interactive prediction, passing the soft-voting ensemble and the
# scaler that was fitted on the training features.
user_predict_cgpa(voting_ensemble, scaler)



Model Comparison:

SVM - Accuracy: 75.00%, Precision: 93.33%, Recall: 66.67%, F1 Score: 77.78%
Random Forest - Accuracy: 75.00%, Precision: 93.33%, Recall: 66.67%, F1 Score: 77.78%
XGBoost - Accuracy: 81.25%, Precision: 94.12%, Recall: 76.19%, F1 Score: 84.21%
Bagging (Random Forest) - Accuracy: 78.12%, Precision: 100.00%, Recall: 66.67%, F1 Score: 80.00%
Bagging (SVM) - Accuracy: 75.00%, Precision: 93.33%, Recall: 66.67%, F1 Score: 77.78%
Extra Trees - Accuracy: 75.00%, Precision: 84.21%, Recall: 76.19%, F1 Score: 80.00%
AdaBoost - Accuracy: 75.00%, Precision: 93.33%, Recall: 66.67%, F1 Score: 77.78%
Gradient Boosting - Accuracy: 78.12%, Precision: 93.75%, Recall: 71.43%, F1 Score: 81.08%
Hybrid Voting - Accuracy: 78.12%, Precision: 93.75%, Recall: 71.43%, F1 Score: 81.08%
Stacking Ensemble - Accuracy: 81.25%, Precision: 94.12%, Recall: 76.19%, F1 Score: 84.21%

Please enter the following details to predict your CGPA:
