In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import (RandomForestClassifier, BaggingClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier, VotingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Load and Preprocess Data
data = pd.read_csv('StudentsPerformance_with_headers.csv')
label_encoder = LabelEncoder()
# Integer-encode every text column; fit_transform refits the encoder per column.
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

# Define target and features
# Binary target: 1 when GRADE is above 0, otherwise 0.
X = data.drop(columns=['GRADE'])
y = (data['GRADE'] > 0).astype(int)

# Split data into training and testing sets
# The split happens BEFORE oversampling so the test set contains only real,
# untouched rows. Oversampling first would let synthetic points interpolated
# from test rows leak into training and inflate every reported metric.
# stratify=y keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Balance the training set only with SMOTE
# SMOTE needs more minority samples than k_neighbors, so cap k_neighbors
# (default 5) at the number of other minority rows available after the split.
minority_count = int(y_train.value_counts().min())
smote = SMOTE(random_state=42, k_neighbors=max(1, min(5, minority_count - 1)))
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Scale numerical features
# The scaler is fitted on training data only and then applied to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_resampled)
y_train = y_resampled
X_test = scaler.transform(X_test)

# Evaluation Function
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier against held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels matching ``X_test``.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)`` computed from the
        model's predictions on ``X_test``.
    """
    predictions = model.predict(X_test)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

# Models and Ensembles
# Each model is fitted on the training data and scored on the test data with
# evaluate_model, which returns (accuracy, precision, recall, f1).

# Support Vector Machine (SVM)
# gamma is not used by the linear kernel (it applies to 'rbf', 'poly' and
# 'sigmoid'). probability=True enables predict_proba, which soft voting needs;
# random_state seeds the internal shuffling used for those probability estimates.
svm = SVC(kernel='linear', C=0.5, gamma=0.01, probability=True, random_state=42)
svm.fit(X_train, y_train)
svm_metrics = evaluate_model(svm, X_test, y_test)

# Random Forest
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf.fit(X_train, y_train)
rf_metrics = evaluate_model(rf, X_test, y_test)

# XGBoost
xgb = XGBClassifier(n_estimators=150, max_depth=8, learning_rate=0.03, use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
xgb_metrics = evaluate_model(xgb, X_test, y_test)

# Bagging with RandomForest as base estimator
# The base estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# while the first positional slot works on both old and new versions.
bagging_rf = BaggingClassifier(RandomForestClassifier(), n_estimators=10, random_state=42)
bagging_rf.fit(X_train, y_train)
bagging_rf_metrics = evaluate_model(bagging_rf, X_test, y_test)

# Bagging with SVM as base estimator
bagging_svm = BaggingClassifier(SVC(kernel='linear', C=0.5, gamma=0.01, probability=True), n_estimators=10, random_state=42)
bagging_svm.fit(X_train, y_train)
bagging_svm_metrics = evaluate_model(bagging_svm, X_test, y_test)

# Extra Trees (Another bagging method)
# Scored on its own; it is not part of the voting or stacking ensembles below.
extra_trees = ExtraTreesClassifier(n_estimators=100, max_depth=10, random_state=42)
extra_trees.fit(X_train, y_train)
extra_trees_metrics = evaluate_model(extra_trees, X_test, y_test)

# AdaBoost with a depth-1 Random Forest as base estimator
# (positional base estimator for the same version-compatibility reason as above)
ada_boost = AdaBoostClassifier(RandomForestClassifier(max_depth=1), n_estimators=50, learning_rate=0.5, random_state=42)
ada_boost.fit(X_train, y_train)
ada_boost_metrics = evaluate_model(ada_boost, X_test, y_test)

# Gradient Boosting Classifier
gradient_boosting = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gradient_boosting.fit(X_train, y_train)
gradient_boosting_metrics = evaluate_model(gradient_boosting, X_test, y_test)

# Hybrid Voting Ensemble (Soft Voting)
# Soft voting averages predicted class probabilities, so every member must
# support predict_proba (hence probability=True on the SVMs).
voting_ensemble = VotingClassifier(
    estimators=[
        ('SVM', svm),
        ('RandomForest', rf),
        ('XGBoost', xgb),
        ('Bagging_RF', bagging_rf),
        ('Bagging_SVM', bagging_svm),
        ('AdaBoost', ada_boost),
        ('GradientBoosting', gradient_boosting),
    ],
    voting='soft')
voting_ensemble.fit(X_train, y_train)
voting_ensemble_metrics = evaluate_model(voting_ensemble, X_test, y_test)

# Stacking Ensemble with Logistic Regression as final estimator
stacking_ensemble = StackingClassifier(
    estimators=[
        ('RandomForest', rf),
        ('XGBoost', xgb),
        ('GradientBoosting', gradient_boosting),
    ],
    final_estimator=LogisticRegression())
stacking_ensemble.fit(X_train, y_train)
stacking_ensemble_metrics = evaluate_model(stacking_ensemble, X_test, y_test)

# Display Model Comparison
# Display names and metric tuples are kept in the same order so they can be
# paired positionally when printing.
model_names = [
    "SVM",
    "Random Forest",
    "XGBoost",
    "Bagging (Random Forest)",
    "Bagging (SVM)",
    "Extra Trees",
    "AdaBoost",
    "Gradient Boosting",
    "Hybrid Voting",
    "Stacking Ensemble",
]
model_metrics = [
    svm_metrics,
    rf_metrics,
    xgb_metrics,
    bagging_rf_metrics,
    bagging_svm_metrics,
    extra_trees_metrics,
    ada_boost_metrics,
    gradient_boosting_metrics,
    voting_ensemble_metrics,
    stacking_ensemble_metrics,
]

print("\nModel Comparison:\n")
# Each metrics entry is (accuracy, precision, recall, f1); print as percentages.
for name, (accuracy, precision, recall, f1) in zip(model_names, model_metrics):
    print(f"{name} - Accuracy: {accuracy*100:.2f}%, Precision: {precision*100:.2f}%, Recall: {recall*100:.2f}%, F1 Score: {f1*100:.2f}%")

# User Input Function for Prediction
def _prompt_code(prompt, n_codes):
    """Ask until the user enters an integer code between 1 and ``n_codes``."""
    while True:
        raw = input(prompt)
        try:
            value = int(raw)
        except ValueError:
            print(f"Invalid entry '{raw.strip()}': please enter a whole number.")
            continue
        if 1 <= value <= n_codes:
            return value
        print(f"Invalid entry {value}: please enter a number from 1 to {n_codes}.")


def user_predict_cgpa(model, scaler):
    """Interactively collect a student's coded answers and print Pass/Fail.

    The answers are gathered in the order listed below, scaled with ``scaler``
    and passed to ``model.predict``. A prediction of 1 prints "Pass"; anything
    else prints "Fail".

    Args:
        model: Fitted classifier exposing ``predict``.
        scaler: Fitted transformer exposing ``transform``. If it exposes
            ``n_features_in_``, that count must equal the number of questions.

    Raises:
        ValueError: If the scaler was fitted on a different number of features
            than this function collects. This is checked before prompting so
            the user does not answer every question only to hit an error.

    NOTE(review): the feature order below is assumed to match the column order
    of the training data — verify against the dataset's columns.
    """
    # (feature name, prompt text, number of valid codes); valid codes are 1..count.
    questions = [
        ('Student Age', "Student Age (1: 18-21, 2: 22-25, 3: above 26): ", 3),
        ('Sex', "Sex (1: female, 2: male): ", 2),
        ('Graduated high-school type', "High School Type (1: private, 2: state, 3: other): ", 3),
        ('Scholarship type', "Scholarship (1: None, 2: 25%, 3: 50%, 4: 75%, 5: Full): ", 5),
        ('Additional work', "Additional Work (1: Yes, 2: No): ", 2),
        ('Regular artistic or sports activity', "Artistic/Sports Activity (1: Yes, 2: No): ", 2),
        ('Do you have a partner', "Partner (1: Yes, 2: No): ", 2),
        ('Total salary if available', "Total Salary (1: 135-200, 2: 201-270, 3: 271-340, 4: 341-410, 5: above 410): ", 5),
        ('Transportation to the university', "Transportation (1: Bus, 2: Car/Taxi, 3: Bicycle, 4: Other): ", 4),
        ('Accommodation type in Cyprus', "Accommodation (1: Rental, 2: Dormitory, 3: Family, 4: Other): ", 4),
        ('Mother’s education', "Mother's Education (1: primary, 2: secondary, 3: high school, 4: university, 5: MSc, 6: PhD): ", 6),
        ('Father’s education', "Father's Education (1: primary, 2: secondary, 3: high school, 4: university, 5: MSc, 6: PhD): ", 6),
        ('Number of sisters/brothers', "Siblings (1: 1, 2: 2, 3: 3, 4: 4, 5: 5+): ", 5),
        ('Parental status', "Parental Status (1: married, 2: divorced, 3: died): ", 3),
        ('Mother’s occupation', "Mother's Occupation (1: retired, 2: housewife, 3: gov. officer, 4: private, 5: self-employed, 6: other): ", 6),
        ('Father’s occupation', "Father's Occupation (1: retired, 2: gov. officer, 3: private, 4: self-employed, 5: other): ", 5),
        ('Weekly study hours', "Weekly Study Hours (1: None, 2: <5, 3: 6-10, 4: 11-20, 5: >20): ", 5),
        ('Reading frequency (non-academic)', "Reading Frequency (1: Never, 2: Sometimes, 3: Often): ", 3),
        ('Attendance to seminars/conferences', "Seminars/Conferences (1: Never, 2: Sometimes, 3: Always): ", 3),
        ('Impact of additional activities on success', "Impact of Activities (1: negative, 2: neutral, 3: positive): ", 3),
        ('Class Teacher Interactions', "Teacher Interaction (1: never, 2: sometimes, 3: always): ", 3),
    ]

    # Fail fast on a shape mismatch instead of after all questions are answered.
    expected_features = getattr(scaler, 'n_features_in_', None)
    if expected_features is not None and expected_features != len(questions):
        raise ValueError(
            f"scaler was fitted on {expected_features} features but "
            f"{len(questions)} values are collected from the user"
        )

    print("\nPlease enter the following details to predict your CGPA:")

    user_data = {
        feature: _prompt_code(prompt, n_codes)
        for feature, prompt, n_codes in questions
    }

    # One row, one column per answer, in question order.
    user_input = np.array(list(user_data.values())).reshape(1, -1)
    user_input_scaled = scaler.transform(user_input)
    prediction = model.predict(user_input_scaled)

    if prediction[0] == 1:
        print("\nPrediction: Pass")
    else:
        print("\nPrediction: Fail")

# Run the interactive prediction with the soft-voting ensemble and the scaler
# that was fitted on the training features.
user_predict_cgpa(model=voting_ensemble, scaler=scaler)


Support Vector Machine (SVM):
Accuracy: 92.73%
Precision: 96.15%
Recall: 89.29%
F1 Score: 92.59%

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.96      0.93        27
           1       0.96      0.89      0.93        28

    accuracy                           0.93        55
   macro avg       0.93      0.93      0.93        55
weighted avg       0.93      0.93      0.93        55

Confusion Matrix:
 [[26  1]
 [ 3 25]]

Random Forest Classifier:
Accuracy: 98.18%
Precision: 100.00%
Recall: 96.43%
F1 Score: 98.18%

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98        27
           1       1.00      0.96      0.98        28

    accuracy                           0.98        55
   macro avg       0.98      0.98      0.98        55
weighted avg       0.98      0.98      0.98        55

Confusion Matrix:
 [[27  0]
 [ 1 27]]

XGBoost Classifier:
Accurac

Student Age (1: 18-21, 2: 22-25, 3: above 26):  1
Sex (1: female, 2: male):  1
High School Type (1: private, 2: state, 3: other):  1
Scholarship (1: None, 2: 25%, 3: 50%, 4: 75%, 5: Full):  1
Additional Work (1: Yes, 2: No):  1
Artistic/Sports Activity (1: Yes, 2: No):  1
Partner (1: Yes, 2: No):  1
Total Salary (1: 135-200, 2: 201-270, 3: 271-340, 4: 341-410, 5: above 410):  1
Transportation (1: Bus, 2: Car/Taxi, 3: Bicycle, 4: Other):  1
Accommodation (1: Rental, 2: Dormitory, 3: Family, 4: Other):  1
Mother's Education (1: primary, 2: secondary, 3: high school, 4: university, 5: MSc, 6: PhD):  2
Father's Education (1: primary, 2: secondary, 3: high school, 4: university, 5: MSc, 6: PhD):  2
Siblings (1: 1, 2: 2, 3: 3, 4: 4, 5: 5+):  2
Parental Status (1: married, 2: divorced, 3: died):  1
Mother's Occupation (1: retired, 2: housewife, 3: gov. officer, 4: private, 5: self-employed, 6: other):  3
Father's Occupation (1: retired, 2: gov. officer, 3: private, 4: self-employed, 5: other)


Predicted CGPA Category: Fail
