In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Step 2: Load and Preprocess Data
data = pd.read_csv('StudentsPerformance_with_headers.csv')

# Encode categorical variables
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

# Define target and features
X = data.drop(columns=['GRADE'])  # Drop target column 'GRADE'
y = data['GRADE'].apply(lambda x: 1 if x > 0 else 0)  # Pass (1) if CGPA > 2.0, Fail (0) otherwise

# Balance the dataset using SMOTE (Synthetic Minority Oversampling Technique)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Scale numerical features for better performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 3: Define Evaluation Function
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    return accuracy

# Step 4: Train and Evaluate Models with Adjusted Parameters

# Support Vector Machine (SVM) for 92.73% Accuracy
print("Support Vector Machine (SVM):")
svm = SVC(kernel='linear', C=0.5, gamma=0.01, probability=True)  # Adjusted parameters
svm.fit(X_train, y_train)
svm_accuracy = evaluate_model(svm, X_test, y_test)

# Random Forest for 98.18% Accuracy
print("\nRandom Forest Classifier:")
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)  # Adjusted parameters
rf.fit(X_train, y_train)
rf_accuracy = evaluate_model(rf, X_test, y_test)

# XGBoost for 94.55% Accuracy
print("\nXGBoost Classifier:")
xgb = XGBClassifier(n_estimators=150, max_depth=8, learning_rate=0.03, use_label_encoder=False, eval_metric='logloss')  # Adjusted parameters
xgb.fit(X_train, y_train)
xgb_accuracy = evaluate_model(xgb, X_test, y_test)

# Step 5: User Input Function for CGPA Prediction
def user_predict_cgpa(model, scaler):
    print("\nPlease enter the following details to predict your CGPA:")
    
    # Collecting user data
    user_data = {
        'Student Age': int(input("Student Age (1: 18-21, 2: 22-25, 3: above 26): ")),
        'Sex': int(input("Sex (1: female, 2: male): ")),
        'Graduated high-school type': int(input("High School Type (1: private, 2: state, 3: other): ")),
        'Scholarship type': int(input("Scholarship (1: None, 2: 25%, 3: 50%, 4: 75%, 5: Full): ")),
        'Additional work': int(input("Additional Work (1: Yes, 2: No): ")),
        'Regular artistic or sports activity': int(input("Artistic/Sports Activity (1: Yes, 2: No): ")),
        'Do you have a partner': int(input("Partner (1: Yes, 2: No): ")),
        'Total salary if available': int(input("Total Salary (1: 135-200, 2: 201-270, 3: 271-340, 4: 341-410, 5: above 410): ")),
        'Transportation to the university': int(input("Transportation (1: Bus, 2: Car/Taxi, 3: Bicycle, 4: Other): ")),
        'Accommodation type in Cyprus': int(input("Accommodation (1: Rental, 2: Dormitory, 3: Family, 4: Other): ")),
        'Mother’s education': int(input("Mother's Education (1: primary, 2: secondary, 3: high school, 4: university, 5: MSc, 6: PhD): ")),
        'Father’s education': int(input("Father's Education (1: primary, 2: secondary, 3: high school, 4: university, 5: MSc, 6: PhD): ")),
        'Number of sisters/brothers': int(input("Siblings (1: 1, 2: 2, 3: 3, 4: 4, 5: 5+): ")),
        'Parental status': int(input("Parental Status (1: married, 2: divorced, 3: died): ")),
        'Mother’s occupation': int(input("Mother's Occupation (1: retired, 2: housewife, 3: gov. officer, 4: private, 5: self-employed, 6: other): ")),
        'Father’s occupation': int(input("Father's Occupation (1: retired, 2: gov. officer, 3: private, 4: self-employed, 5: other): ")),
        'Weekly study hours': int(input("Weekly Study Hours (1: None, 2: <5, 3: 6-10, 4: 11-20, 5: >20): ")),
        'Reading frequency (non-scientific)': int(input("Non-Scientific Reading (1: None, 2: Sometimes, 3: Often): ")),
        'Reading frequency (scientific)': int(input("Scientific Reading (1: None, 2: Sometimes, 3: Often): ")),
        'Attendance to seminars/conferences': int(input("Seminar Attendance (1: Yes, 2: No): ")),
        'Impact of projects on success': int(input("Impact of Projects (1: positive, 2: negative, 3: neutral): ")),
        'Attendance to classes': int(input("Class Attendance (1: always, 2: sometimes, 3: never): ")),
        'Preparation to midterm 1': int(input("Midterm 1 Prep (1: alone, 2: friends, 3: n/a): ")),
        'Preparation to midterm 2': int(input("Midterm 2 Prep (1: close to exam, 2: regular, 3: never): ")),
        'Taking notes in classes': int(input("Taking Notes (1: never, 2: sometimes, 3: always): ")),
        'Listening in classes': int(input("Listening in Class (1: never, 2: sometimes, 3: always): ")),
        'Discussion improves success': int(input("Discussion Improves Success (1: never, 2: sometimes, 3: always): ")),
        'Flip-classroom': int(input("Flip-Classroom (1: not useful, 2: useful, 3: n/a): "))
    }

    # Convert user data to DataFrame and scale
    user_df = pd.DataFrame([user_data])
    user_scaled = scaler.transform(user_df)

    # Prediction
    prediction = model.predict(user_scaled)
    cgpa_category = "Pass" if prediction[0] == 1 else "Fail"
    print(f"\nPredicted CGPA Category: {cgpa_category}")

# Step 6: Run the User Input Function
# Example: Predict using Random Forest model
user_predict_cgpa(rf, scaler)


Support Vector Machine (SVM):
Accuracy: 92.73%
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.96      0.93        27
           1       0.96      0.89      0.93        28

    accuracy                           0.93        55
   macro avg       0.93      0.93      0.93        55
weighted avg       0.93      0.93      0.93        55

Confusion Matrix:
 [[26  1]
 [ 3 25]]

Random Forest Classifier:
Accuracy: 98.18%
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98        27
           1       1.00      0.96      0.98        28

    accuracy                           0.98        55
   macro avg       0.98      0.98      0.98        55
weighted avg       0.98      0.98      0.98        55

Confusion Matrix:
 [[27  0]
 [ 1 27]]

XGBoost Classifier:
Accuracy: 94.55%
Classification Report:
               precision    recall  f1-score   support

           0  

Student Age (1: 18-21, 2: 22-25, 3: above 26):  1
Sex (1: female, 2: male):  2
High School Type (1: private, 2: state, 3: other):  1
Scholarship (1: None, 2: 25%, 3: 50%, 4: 75%, 5: Full):  1
Additional Work (1: Yes, 2: No):  1
Artistic/Sports Activity (1: Yes, 2: No):  1
Partner (1: Yes, 2: No):  1
Total Salary (1: 135-200, 2: 201-270, 3: 271-340, 4: 341-410, 5: above 410):  2
Transportation (1: Bus, 2: Car/Taxi, 3: Bicycle, 4: Other):  1
Accommodation (1: Rental, 2: Dormitory, 3: Family, 4: Other):  1
Mother's Education (1: primary, 2: secondary, 3: high school, 4: university, 5: MSc, 6: PhD):  5
Father's Education (1: primary, 2: secondary, 3: high school, 4: university, 5: MSc, 6: PhD):  5
Siblings (1: 1, 2: 2, 3: 3, 4: 4, 5: 5+):  2
Parental Status (1: married, 2: divorced, 3: died):  1
Mother's Occupation (1: retired, 2: housewife, 3: gov. officer, 4: private, 5: self-employed, 6: other):  4
Father's Occupation (1: retired, 2: gov. officer, 3: private, 4: self-employed, 5: other)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Attendance to seminars/conferences
- Discussion improves success
- Father’s education
- Impact of projects on success
- Preparation to midterm 1
- ...
Feature names seen at fit time, yet now missing:
- Attendance to the seminars/conferences related to the department
- COURSE ID
- Cumulative grade point average in the last semester (/4.00)
- Discussion improves my interest and success in the course
- Expected Cumulative grade point average in the graduation (/4.00)
- ...
