In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

# Purpose: train and evaluate a Gaussian Naive Bayes classifier that predicts
# whether a student's Success_Level is 'High' (target column 'Enroll').

# Load the dataset
file_path = 'Student_Data_ML.csv'
student_data = pd.read_csv(file_path)

# Binarize Success_Level to create the target variable 'Enroll':
# 1 where the value equals 'High', 0 for every other value.
# Vectorized comparison instead of a row-wise .apply(lambda ...).
student_data['Enroll'] = (student_data['Success_Level'] == 'High').astype(int)

# Define the features and the target variable
features = [
    'O_Level_Results', 'A_Level_Results', 'O_Level_Credits', 'A_Level_Credits',
    'Gender', 'Age', 'School_Type', 'District', 'Preferred_Program'
]
# One-hot encode the non-numeric feature columns; drop_first=True drops the
# first level of each encoded column.
X = pd.get_dummies(student_data[features], drop_first=True)

y = student_data['Enroll']

# Split the data into training and testing sets BEFORE imputing, so that the
# imputation statistics below are learned from the training rows only.
# test_size and random_state are unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Handle missing values if any.
# The medians come from the training rows alone and are then applied to both
# partitions; computing them on the full matrix would leak information from
# the held-out test rows into preprocessing.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
# Keep the full matrix imputed as well (with the same training medians) so that
# X stays consistent with X_train / X_test. Reassign rather than inplace=True.
X = X.fillna(train_medians)

# Initialize and train the GaussianNB model
model = GaussianNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output the accuracy and classification report
print(f"Accuracy: {accuracy:.2f}")
print(f"Classification Report:\n{report}")

Accuracy: 0.63
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.93      0.76       194
           1       0.35      0.07      0.11       106

    accuracy                           0.63       300
   macro avg       0.50      0.50      0.44       300
weighted avg       0.54      0.63      0.53       300

