# 1. Import Libraries and Other Notebooks

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

In [5]:
# You need to install import_ipynb first: pip install import_ipynb
import import_ipynb

In [6]:
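# Import the data-loading helper from the data_preprocessing notebook.
# Importing a notebook through import_ipynb executes it, so anything that
# notebook prints shows up as output of this cell.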
from data_preprocessing import load_and_prepare_data

Data loaded and prepared successfully!


# 2. Load the Data

In [7]:
# Read the student-performance CSV through the helper imported from the
# preprocessing notebook, then preview the first rows as a sanity check.
filepath = '../sample_dataset/student_performance_dataset.csv'
data = load_and_prepare_data(filepath)
print("--- Data Loaded ---", data.head(), sep="\n")

--- Data Loaded ---
   study_hours_per_day  attendance_percentage  exam_score
0                  0.0                   85.0        56.2
1                  6.9                   97.3       100.0
2                  1.4                   94.8        34.3
3                  1.0                   71.0        26.8
4                  5.0                   90.9        66.4


# 3. Define Features (X) and Target (y)

In [8]:
# Predictors (X): daily study hours and attendance percentage.
# Target (y): the exam score the model learns to predict.
feature_columns = ['study_hours_per_day', 'attendance_percentage']
target_column = 'exam_score'
X = data[feature_columns]
y = data[target_column]

# 4. Split Data for Training and Testing

In [9]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# Fixing random_state at 42 makes the shuffle, and therefore the split, repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

Training set size: 800 samples
Testing set size: 200 samples


# 5. Train the Linear Regression Model

In [10]:
# Fit a linear regression on the training split.
# fit() returns the estimator itself, so it can be chained onto the constructor.
model = LinearRegression().fit(X_train, y_train)
print("--- Model Training Complete ---")

--- Model Training Complete ---


# 6. Evaluate the Model

In [11]:
# Predict exam scores for the held-out test rows, which were not used in fitting.
y_pred = model.predict(X=X_test)

In [12]:
# Score the predictions against the held-out targets:
# R² (coefficient of determination) and the mean absolute error.
r2, mae = (
    r2_score(y_true=y_test, y_pred=y_pred),
    mean_absolute_error(y_true=y_test, y_pred=y_pred),
)

In [13]:
# Report both metrics, formatted to four decimal places, one per line.
evaluation_report = [
    "\n--- Model Evaluation Results ---",
    f"R-squared (R²): {r2:.4f}",
    f"Mean Absolute Error (MAE): {mae:.4f}",
]
print(*evaluation_report, sep="\n")


--- Model Evaluation Results ---
R-squared (R²): 0.6623
Mean Absolute Error (MAE): 7.4299


# 7. Make a Prediction for a New Student

In [14]:
# Final step: use the trained model on a student it has not seen.
# The new data point is a student with 4 study hours per day and 80% attendance.
#
# The model was fitted on a DataFrame, so the input is passed as a one-row
# DataFrame with the same column names, in the same order as X. A bare NumPy
# array carries no column names: each value would be matched to a feature by
# position only, and recent scikit-learn versions warn that
# "X does not have valid feature names".
new_student_data = pd.DataFrame(
    {
        'study_hours_per_day': [4],
        'attendance_percentage': [80],
    }
)
predicted_score = model.predict(new_student_data)



In [15]:
# Show the inputs next to the predicted score (first element of the
# prediction array), formatted to two decimal places.
prediction_report = [
    "\n--- Prediction for a New Student ---",
    "Input: 4 study hours/day, 80% attendance",
    f"Predicted Exam Score: {predicted_score[0]:.2f}",
]
print(*prediction_report, sep="\n")


--- Prediction for a New Student ---
Input: 4 study hours/day, 80% attendance
Predicted Exam Score: 73.39
