In [3]:
# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 2: Load the dataset
data = pd.read_csv("student_performance.csv")

# Step 3: Explore the data (optional)
print("First 5 rows of the dataset:")
print(data.head())
print()  # blank separator line
print("Dataset info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the summary.
data.info()
print()  # blank separator line

# Step 4: Separate features (X) and target (y)
X = data.drop(columns="Pass")  # Independent variables
y = data["Pass"]               # Target variable

# Step 5: Split into training and testing sets (80% train, 20% test)
# stratify=y keeps the class proportions of "Pass" in both splits, so the
# small test set contains both 0 and 1 without hand-picking a random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Step 6: Build the classifier and fit it on the training split
# (fit() returns the estimator itself, so construction and training chain).
model = LogisticRegression().fit(X_train, y_train)

# Step 7: Predict labels for the held-out test split
y_pred = model.predict(X_test)

# Step 8: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
# labels=[0, 1] pins the row/column order even if a class is absent from the
# test split or from the predictions.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
# zero_division=0 reports 0 for precision/recall of a class that is never
# predicted or never present, instead of emitting an UndefinedMetricWarning.
report = classification_report(y_test, y_pred, labels=[0, 1], zero_division=0)

print("Model Evaluation:")
print("Accuracy:", accuracy)
# Print each multi-line result with its own print() call: passing it as a
# second argument after "...:\n" would insert the separator space in front of
# the first row and misalign it with the rows below.
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(report)

# Step 9: Test with a new student's data (use DataFrame with feature names)
# Key each value by its feature name, then select the columns in the order the
# model was trained on. A missing feature raises KeyError instead of a value
# silently landing in the wrong column if the CSV layout ever changes.
new_student_features = {
    "Hours_Studied": 6,
    "Attendance": 85,
    "Assignments_Submitted": 8,
    "Sleep_Hours": 7,
}
new_student = pd.DataFrame([new_student_features])[X.columns]
prediction = model.predict(new_student)

print("\nNew Student Prediction:")
print("Input:", new_student.to_dict(orient="records"))
print("Predicted Pass (1=Yes, 0=No):", int(prediction[0]))


First 5 rows of the dataset:
   Hours_Studied  Attendance  Assignments_Submitted  Sleep_Hours  Pass
0              5          90                      8            7     1
1              2          60                      4            6     0
2              8          95                      9            8     1
3              1          40                      2            5     0
4              6          85                      8            7     1 

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Hours_Studied          15 non-null     int64
 1   Attendance             15 non-null     int64
 2   Assignments_Submitted  15 non-null     int64
 3   Sleep_Hours            15 non-null     int64
 4   Pass                   15 non-null     int64
dtypes: int64(5)
memory usage: 732.0 bytes
None 

Model Evaluation:
Accuracy: