In [2]:
import pandas as pd
import numpy as np

# for consistent results every time you run
np.random.seed(42)

# number of samples
n = 200

data = {
    "attendance_percent": np.random.randint(40, 100, n),
    "hours_studied_per_week": np.random.randint(0, 30, n),
    "previous_exam_score": np.random.randint(30, 100, n),
    "assignments_submitted": np.random.randint(0, 10, n),
    "participation": np.random.randint(0, 6, n)
}

# simple rule to decide pass/fail
# you can modify this rule later
data["final_exam_result"] = (
    (data["attendance_percent"] > 60) &
    (data["hours_studied_per_week"] > 5) &
    (data["previous_exam_score"] > 50)
).astype(int)

# create dataframe
df = pd.DataFrame(data)

# save to CSV
df.to_csv("student_data.csv", index=False)

print("Sample data:")
print(df.head())

from google.colab import files
files.download('student_data.csv')


Sample data:
   attendance_percent  hours_studied_per_week  previous_exam_score  \
0                  78                       6                   77   
1                  91                      26                   49   
2                  68                      18                   37   
3                  54                      21                   36   
4                  82                      27                   96   

   assignments_submitted  participation  final_exam_result  
0                      4              0                  1  
1                      0              4                  0  
2                      6              4                  0  
3                      6              5                  0  
4                      8              1                  1  


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
import pandas as pd

# load dataset
df = pd.read_csv('student_data.csv')

# show top rows
print(df.head())

# check info
print(df.info())

# check missing values
print(df.isnull().sum())


   attendance_percent  hours_studied_per_week  previous_exam_score  \
0                  78                       6                   77   
1                  91                      26                   49   
2                  68                      18                   37   
3                  54                      21                   36   
4                  82                      27                   96   

   assignments_submitted  participation  final_exam_result  
0                      4              0                  1  
1                      0              4                  0  
2                      6              4                  0  
3                      6              5                  0  
4                      8              1                  1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   at

In [4]:
from sklearn.model_selection import train_test_split

# define X and y
X = df[['attendance_percent', 'hours_studied_per_week', 'previous_exam_score',
        'assignments_submitted', 'participation']]
y = df['final_exam_result']

# split data: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


Training samples: 160
Testing samples: 40


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# create the model
model = LogisticRegression()

# train
model.fit(X_train, y_train)

# predict on test data
y_pred = model.predict(X_test)

# evaluate
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Test Accuracy:", acc)
print("Confusion Matrix:")
print(cm)


Test Accuracy: 0.825
Confusion Matrix:
[[25  3]
 [ 4  8]]


In [7]:
import joblib

# save the trained logistic regression model
joblib.dump(model, 'student_pass_fail_model.pkl')

print("Model saved successfully.")
from google.colab import files
files.download('student_pass_fail_model.pkl')


Model saved successfully.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
import joblib

# load the model
loaded_model = joblib.load('student_pass_fail_model.pkl')

# try a sample prediction
# let’s say: attendance=80, assignments=7, study_hours=4, sleep_hours=6, extracurricular=1
new_student = [[80, 7, 4, 6, 1]]

prediction = loaded_model.predict(new_student)

print("Predicted (0=Fail, 1=Pass):", prediction[0])


Predicted (0=Fail, 1=Pass): 0




In [9]:
import pandas as pd

# column names
cols = ['Attendance', 'Assignments', 'Study_hours', 'Sleep_hours', 'Extracurricular']

# new data as DataFrame
new_student_df = pd.DataFrame([[80, 7, 4, 6, 1]], columns=cols)

prediction = loaded_model.predict(new_student_df)

print("Predicted (0=Fail, 1=Pass):", prediction[0])


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Assignments
- Attendance
- Extracurricular
- Sleep_hours
- Study_hours
Feature names seen at fit time, yet now missing:
- assignments_submitted
- attendance_percent
- hours_studied_per_week
- participation
- previous_exam_score


In [10]:
import pandas as pd

# column names from training
cols = ['attendance_percent', 'assignments_submitted', 'hours_studied_per_week', 'sleep_hours', 'participation']

# new data with same feature names
new_student_df = pd.DataFrame([[80, 7, 4, 6, 1]], columns=cols)

prediction = loaded_model.predict(new_student_df)

print("Predicted (0=Fail, 1=Pass):", prediction[0])


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- sleep_hours
Feature names seen at fit time, yet now missing:
- previous_exam_score


In [11]:
print(x.columns)

NameError: name 'x' is not defined

In [12]:
print(X.columns)

Index(['attendance_percent', 'hours_studied_per_week', 'previous_exam_score',
       'assignments_submitted', 'participation'],
      dtype='object')


In [13]:
import pandas as pd

# same columns, same order
cols = ['attendance_percent', 'hours_studied_per_week', 'previous_exam_score', 'assignments_submitted', 'participation']

# new data sample
# attendance=80%, hours_studied=5/week, previous_exam_score=70,
# assignments_submitted=8, participation=1
new_student_df = pd.DataFrame([[80, 5, 70, 8, 1]], columns=cols)

prediction = loaded_model.predict(new_student_df)

print("Predicted (0=Fail, 1=Pass):", prediction[0])


Predicted (0=Fail, 1=Pass): 0
