In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

# Fix the global NumPy seed so the synthetic sample is identical on every run.
np.random.seed(42)

# --- 1. Generate Synthetic Data ---
# Three independent, uniformly distributed predictors, one value per student.
# The draw order (hours, attendance, internal marks, then noise) determines
# the generated values, so it must stay as written.
data_size = 100
study_hours = np.random.uniform(low=1, high=10, size=data_size)
attendance = np.random.uniform(low=50, high=100, size=data_size)
internal_marks = np.random.uniform(low=10, high=50, size=data_size)

# Feature matrix: one named column per predictor.
X = pd.DataFrame(
    {
        'study_hours': study_hours,
        'attendance': attendance,
        'internal_marks': internal_marks,
    }
)

# Target variable (final score), linear in the predictors:
#   score = 5*hours + 0.5*attendance + 1.5*internal_marks + 5 (intercept) + noise
# The noise is Gaussian with mean 0 and standard deviation 5.
noise = np.random.normal(loc=0, scale=5, size=data_size)
y = 5 * study_hours + 0.5 * attendance + 1.5 * internal_marks + 5 + noise

# --- 2. Setup Model and K-Fold Cross-Validation ---
# Scaling and regression live in one pipeline so the scaler is fitted on each
# fold's training rows only; the held-out rows never influence the scaling
# statistics, which prevents data leakage during cross-validation.
pipeline = make_pipeline(StandardScaler(), LinearRegression())

# Shuffled K-Fold with a fixed seed: fold membership is random but reproducible.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Held-out mean squared error of each fold, in fold order.
mse_scores = []

print(f"Running {n_splits}-Fold Cross-Validation...")

# --- 3. Run K-Fold Cross-Validation Loop ---
# enumerate(..., start=1) supplies the 1-based fold number used in the report,
# replacing a hand-maintained counter.
for fold_count, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # kf.split yields positional row indices. X is a DataFrame, so positional
    # selection goes through .iloc; y is a NumPy array and is indexed directly.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the whole pipeline (scaler + model) on this fold's training rows.
    pipeline.fit(X_train, y_train)

    # Score the fold on rows the pipeline has not seen.
    y_pred = pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)

    print(f"Fold {fold_count}: MSE = {mse:.4f}")

# --- 4. Display Final Results ---
# Summarize the per-fold errors: their mean and their spread across folds.
# The standard deviation uses NumPy's default (ddof=0).
fold_mse = np.asarray(mse_scores)
average_mse = fold_mse.mean()
std_dev_mse = fold_mse.std()

print("\n--- Cross-Validation Results ---")
print(f"Average MSE: {average_mse:.4f}")
print(f"Standard Deviation of MSE: {std_dev_mse:.4f}")

# Optional: refit a fresh pipeline on the full dataset to inspect what it learned.
# fit() returns the fitted pipeline, so construction and fitting are chained.
final_model = make_pipeline(StandardScaler(), LinearRegression()).fit(X, y)

# make_pipeline names each step after its lowercased class name; look the
# regression step up once and read both fitted attributes from it.
regressor = final_model.named_steps['linearregression']
coefficients = regressor.coef_
intercept = regressor.intercept_

print("\n--- Final Model (Trained on all data) ---")
print(f"Intercept: {intercept:.4f}")
# The regressor sees standardized inputs, so each coefficient is expressed per
# unit of the scaled feature rather than per raw unit.
print("Coefficients (for scaled features):")
for feature, coef in zip(X.columns, coefficients):
    print(f"  {feature}: {coef:.4f}")



Running 5-Fold Cross-Validation...
Fold 1: MSE = 52.5395
Fold 2: MSE = 18.6411
Fold 3: MSE = 16.9352
Fold 4: MSE = 10.1730
Fold 5: MSE = 34.9811

--- Cross-Validation Results ---
Average MSE: 26.6540
Standard Deviation of MSE: 15.2950

--- Final Model (Trained on all data) ---
Intercept: 115.0091
Coefficients (for scaled features):
  study_hours: 13.9355
  attendance: 7.5952
  internal_marks: 17.9981
