In [None]:
# ============================================================================
# CELL 8: Create Results DataFrame
# ============================================================================
# Tabulates, for every sample, the observed value, the manually computed
# prediction, the residual and the worked-out regression equation.
# Reads X_values, Y_values, Y_pred_manual, residuals and beta, all of which
# are defined outside this cell.

print("\n" + "=" * 70)
print("COMPLETE RESULTS TABLE")
print("=" * 70)

results_df = pd.DataFrame({
    'X': X_values,
    'Actual Y': Y_values,
    'Predicted Y': Y_pred_manual,
    'Residual (ε)': residuals,
    # Fixed four-decimal formatting keeps floating-point noise (values such
    # as 0.9999999999999998) out of the worked equation and matches the
    # precision used by the other printed coefficients in this file.
    'Equation': [
        f"{beta[0]:.4f} + {beta[1]:.4f}×{x} = {beta[0] + beta[1] * x:.4f}"
        for x in X_values
    ],
})

# Concatenate rather than pass two arguments to print(): the default
# separator would put a single space in front of the header row only and
# push it out of alignment with the data rows.
print("\n" + results_df.to_string(index=False))


# ============================================================================
# CELL 9: scikit-learn Implementation with Train/Test Split
# ============================================================================
# Splits the data, fits a scikit-learn LinearRegression on the training part
# and prints its coefficients next to the manually computed ones (beta).

print("\n" + "=" * 70, "SCIKIT-LEARN IMPLEMENTATION WITH TRAIN/TEST SPLIT", "=" * 70, sep="\n")

# The estimator is fed a 2D feature matrix, hence the single-column reshape.
X_sklearn = X_values.reshape(-1, 1)
Y_sklearn = Y_values

print(f"\nOriginal Dataset Size: {len(X_sklearn)} samples")

# Hold out test_size=0.4 of the samples with a fixed random_state.
# NOTE(review): nominally a 60/40 split, but sample counts are rounded, so
# on a very small dataset the real shares may differ - see the sizes
# printed below.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_sklearn,
    Y_sklearn,
    test_size=0.4,
    random_state=42,
)

print(
    f"Training Set Size: {len(X_train)} samples",
    f"Test Set Size: {len(X_test)} samples",
    "\nTraining Data:",
    f"X_train = {X_train.ravel()}",
    f"Y_train = {Y_train}",
    "\nTest Data:",
    f"X_test = {X_test.ravel()}",
    f"Y_test = {Y_test}",
    sep="\n",
)

# Fit on the training split only; fit() returns the estimator itself.
model = LinearRegression().fit(X_train, Y_train)
sk_intercept = model.intercept_
sk_slope = model.coef_[0]

# Coefficients learned by scikit-learn.
print("\n" + "-" * 70, "MODEL COEFFICIENTS (from sklearn)", "-" * 70, sep="\n")
print(f"Intercept (β₀): {sk_intercept:.4f}")
print(f"Slope (β₁): {sk_slope:.4f}")
print(f"\n sklearn Equation: y = {sk_intercept:.4f} + {sk_slope:.4f}x")

# Side-by-side with the manual coefficients.
# NOTE(review): beta is computed outside this cell; if it was fitted on the
# full dataset, a non-zero difference here reflects the different training
# data as well as numerical error - confirm.
print("\n" + "-" * 70, "COMPARISON: Manual vs sklearn", "-" * 70, sep="\n")
print(
    f"Manual Intercept: {beta[0]:.4f}",
    f"sklearn Intercept: {sk_intercept:.4f}",
    f"Difference: {abs(beta[0] - sk_intercept):.10f}",
    sep="\n",
)
print()
print(
    f"Manual Slope: {beta[1]:.4f}",
    f"sklearn Slope: {sk_slope:.4f}",
    f"Difference: {abs(beta[1] - sk_slope):.10f}",
    sep="\n",
)


# ============================================================================
# CELL 10: Predictions and Evaluation
# ============================================================================
# Scores the fitted model on the training and the test split with R², MSE,
# RMSE and MAE, then shows both sets of numbers side by side.

print("\n" + "=" * 70, "MODEL EVALUATION", "=" * 70, sep="\n")

# Model output for each split.
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

# Training-split metrics; RMSE is the square root of MSE.
r2_train = r2_score(Y_train, Y_train_pred)
mse_train = mean_squared_error(Y_train, Y_train_pred)
rmse_train = np.sqrt(mse_train)
mae_train = mean_absolute_error(Y_train, Y_train_pred)

# Test-split metrics, computed the same way.
r2_test = r2_score(Y_test, Y_test_pred)
mse_test = mean_squared_error(Y_test, Y_test_pred)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(Y_test, Y_test_pred)

# One labelled line per metric, first for the training split, then for test.
metric_labels = (
    "R² Score",
    "Mean Squared Error (MSE)",
    "Root Mean Squared Error (RMSE)",
    "Mean Absolute Error (MAE)",
)
for split_heading, split_metrics in (
    ("\n TRAINING SET METRICS:", (r2_train, mse_train, rmse_train, mae_train)),
    ("\n TEST SET METRICS:", (r2_test, mse_test, rmse_test, mae_test)),
):
    print(split_heading)
    for metric_name, metric_value in zip(metric_labels, split_metrics):
        print(f"{metric_name}: {metric_value:.4f}")

# Same numbers as a table: one row per metric, one column per split.
eval_df = pd.DataFrame({
    'Metric': ['R²', 'MSE', 'RMSE', 'MAE'],
    'Training Set': [r2_train, mse_train, rmse_train, mae_train],
    'Test Set': [r2_test, mse_test, rmse_test, mae_test],
})

print("\n" + "-" * 70, "METRICS COMPARISON TABLE", "-" * 70, sep="\n")
print(eval_df.to_string(index=False))


# ============================================================================
# CELL 11: Visualization 1 - Training and Test Data with Regression Line
# ============================================================================
# Draws both data splits, the fitted sklearn line, the test-set predictions
# and a dashed connector from each test point to its prediction.

plt.figure(figsize=(12, 8))

# Observed samples, one scatter series per split (training first, then test).
for split_x, split_y, split_color, split_label in (
    (X_train, Y_train, 'red', 'Training Data'),
    (X_test, Y_test, 'green', 'Test Data'),
):
    plt.scatter(split_x, split_y, color=split_color, s=150, alpha=0.7,
                edgecolors='black', linewidth=2, label=split_label, zorder=5)

# Fitted line evaluated on 100 evenly spaced inputs over [0.5, 4.5].
X_line = np.linspace(0.5, 4.5, 100).reshape(-1, 1)
Y_line = model.predict(X_line)
plt.plot(X_line, Y_line, color='blue', linewidth=3, label='Regression Line', zorder=3)

# Model output at the test inputs.
plt.scatter(X_test, Y_test_pred, color='orange', s=100, marker='x',
            linewidth=3, label='Test Predictions', zorder=4)

# Vertical dashed segment per test sample: actual value to predicted value.
for x_row, y_actual, y_hat in zip(X_test, Y_test, Y_test_pred):
    plt.plot([x_row, x_row], [y_actual, y_hat], 'k--', alpha=0.3, linewidth=1)

plt.title(
    f'Linear Regression with Train/Test Split\nEquation: y = {model.intercept_:.2f} + {model.coef_[0]:.2f}x',
    fontsize=16,
    fontweight='bold',
)
plt.xlabel('X (Independent Variable)', fontsize=14, fontweight='bold')
plt.ylabel('Y (Dependent Variable)', fontsize=14, fontweight='bold')
plt.legend(fontsize=12, loc='upper left')
plt.grid(True, alpha=0.3)

# Test-set metrics in a box anchored to the lower-right corner of the axes.
textstr = '\n'.join([
    'Test Set Metrics:',
    f'R² = {r2_test:.4f}',
    f'MSE = {mse_test:.4f}',
    f'RMSE = {rmse_test:.4f}',
])
props = {'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.8}
plt.text(0.98, 0.02, textstr, transform=plt.gca().transAxes, fontsize=11,
         verticalalignment='bottom', horizontalalignment='right', bbox=props)

plt.tight_layout()
plt.show()

print("\nVisualization 1 complete!")


# ============================================================================
# CELL 12: Visualization 2 - Residual Plot
# ============================================================================
# Two side-by-side panels plotting residuals against predicted values, one
# for the training split (left) and one for the test split (right).

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Residual = actual value minus predicted value.
train_residuals = Y_train - Y_train_pred
test_residuals = Y_test - Y_test_pred

# Both panels share the same layout; only data, colour and title differ.
for panel, fitted, resid, point_color, panel_title in (
    (ax1, Y_train_pred, train_residuals, 'red', 'Residual Plot - Training Set'),
    (ax2, Y_test_pred, test_residuals, 'green', 'Residual Plot - Test Set'),
):
    panel.scatter(fitted, resid, color=point_color, s=100,
                  alpha=0.7, edgecolors='black', linewidth=2)
    # Zero line as the reference for over- and under-prediction.
    panel.axhline(y=0, color='blue', linestyle='--', linewidth=2)
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.set_xlabel('Predicted Values', fontsize=12, fontweight='bold')
    panel.set_ylabel('Residuals', fontsize=12, fontweight='bold')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nVisualization 2 complete!")


# ============================================================================
# CELL 13: Visualization 3 - Actual vs Predicted
# ============================================================================
# Two side-by-side panels plotting predicted against actual Y, training split
# on the left and test split on the right, each annotated with its R².

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Both panels share the same layout; only data, colour, title and R² differ.
for panel, actual, predicted, point_color, panel_title, score in (
    (ax1, Y_train, Y_train_pred, 'red', 'Actual vs Predicted - Training Set', r2_train),
    (ax2, Y_test, Y_test_pred, 'green', 'Actual vs Predicted - Test Set', r2_test),
):
    panel.scatter(actual, predicted, color=point_color, s=150,
                  alpha=0.7, edgecolors='black', linewidth=2)

    # Diagonal y = x across the range of actual values: points on it are
    # predicted exactly.
    lowest, highest = actual.min(), actual.max()
    panel.plot([lowest, highest], [lowest, highest],
               'b--', linewidth=2, label='Perfect Prediction')

    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.set_xlabel('Actual Y', fontsize=12, fontweight='bold')
    panel.set_ylabel('Predicted Y', fontsize=12, fontweight='bold')
    panel.legend(fontsize=11)
    panel.grid(True, alpha=0.3)

    # R² annotation in the upper-left corner of the panel.
    textstr = f'R² = {score:.4f}'
    panel.text(0.05, 0.95, textstr, transform=panel.transAxes, fontsize=12,
               verticalalignment='top',
               bbox={'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.8})

plt.tight_layout()
plt.show()

print("\n Visualization 3 complete!")


# ============================================================================
# CELL 14: Visualization 4 - Complete Dataset with Manual Calculation
# ============================================================================
# Plots every original sample, the manually computed predictions, the line
# defined by beta, a dotted connector per sample, and a coordinate label on
# each original point.

plt.figure(figsize=(12, 8))

# Original observations.
plt.scatter(X_values, Y_values, color='purple', s=200, alpha=0.6,
            edgecolors='black', linewidth=2, label='Original Data', zorder=5)

# Predictions from the manual calculation, drawn as diamonds.
plt.scatter(X_values, Y_pred_manual, color='orange', s=150, marker='D',
            alpha=0.8, edgecolors='black', linewidth=2,
            label='Manual Predictions', zorder=4)

# Line y = beta[0] + beta[1] * x evaluated on 100 points over [0, 5].
X_line_extended = np.linspace(0, 5, 100)
Y_line_extended = beta[0] + beta[1] * X_line_extended
plt.plot(X_line_extended, Y_line_extended, color='red', linewidth=3,
         label='Regression Line (Manual)', linestyle='--', zorder=3)

# Dotted vertical segment per sample: observed value to manual prediction.
for x_obs, y_obs, y_fit in zip(X_values, Y_values, Y_pred_manual):
    plt.plot([x_obs, x_obs], [y_obs, y_fit],
             'gray', linestyle=':', linewidth=2, alpha=0.5)

plt.title(
    f'Linear Regression - Manual Matrix Calculation\ny = {beta[0]:.2f} + {beta[1]:.2f}x  |  R² = {R2:.4f}',
    fontsize=16,
    fontweight='bold',
)
plt.xlabel('X (Independent Variable)', fontsize=14, fontweight='bold')
plt.ylabel('Y (Dependent Variable)', fontsize=14, fontweight='bold')
plt.legend(fontsize=12, loc='upper left')
plt.grid(True, alpha=0.3)

# Label each original point with its coordinates, offset 10 points up-right.
for x, y in zip(X_values, Y_values):
    plt.annotate(
        f'({x}, {y})',
        xy=(x, y),
        xytext=(10, 10),
        textcoords='offset points',
        fontsize=10,
        bbox={'boxstyle': 'round,pad=0.3', 'facecolor': 'yellow', 'alpha': 0.7},
    )

plt.tight_layout()
plt.show()

print("\n Visualization 4 complete!")


# ============================================================================
# CELL 15: Summary Report
# ============================================================================
# Prints the closing report: dataset sizes, manual vs. sklearn coefficients,
# the regression equation, R² per data subset and a short verdict.
# beta, R2 and MAE are defined outside this cell (presumably by the
# manual-calculation cells - confirm).

print("\n" + "=" * 70)
print("FINAL SUMMARY REPORT")
print("=" * 70)

# Report the shares the split actually produced instead of a hard-coded
# 60/40: train_test_split rounds sample counts, so on a small dataset the
# real proportions can differ from the nominal ones.
split_total = len(X_train) + len(X_test)
train_share = 100 * len(X_train) / split_total
test_share = 100 * len(X_test) / split_total

print("\n DATASET INFORMATION:")
print(f"   Total Samples: {len(X_values)}")
print(f"   Training Samples: {len(X_train)} ({train_share:.0f}%)")
print(f"   Test Samples: {len(X_test)} ({test_share:.0f}%)")

print("\n MODEL COEFFICIENTS:")
print("   Manual Calculation:")
print(f"      β₀ (Intercept) = {beta[0]:.4f}")
print(f"      β₁ (Slope) = {beta[1]:.4f}")
print("   sklearn Calculation:")
print(f"      β₀ (Intercept) = {model.intercept_:.4f}")
print(f"      β₁ (Slope) = {model.coef_[0]:.4f}")

print("\n REGRESSION EQUATION:")
print(f"   y = {beta[0]:.2f} + {beta[1]:.2f}x")

print("\n MODEL PERFORMANCE:")
print(f"   Full Dataset R²: {R2:.4f}")
print(f"   Training Set R²: {r2_train:.4f}")
print(f"   Test Set R²: {r2_test:.4f}")

print("\n KEY INSIGHTS:")
# R2 is a computed float, so compare with a tight absolute tolerance: an
# exact fit can evaluate to 0.9999999999999999 and would miss `== 1.0`.
if np.isclose(R2, 1.0, rtol=0.0, atol=1e-9):
    print("  Perfect linear fit on full dataset")
    print("  All predictions match actual values exactly")
    print(" Zero residual error")
else:
    print(f"  Model explains {R2*100:.2f}% of variance")
    print(f"  Average prediction error: {MAE:.4f}")

# Verdict thresholds on the test-set R²: >= 0.9 excellent, >= 0.7 good.
if r2_test >= 0.9:
    print(" Excellent performance on test set")
elif r2_test >= 0.7:
    print(" Good performance on test set")
else:
    print("   Model may need improvement")

print("\n" + "=" * 70)
print("ANALYSIS COMPLETE!")
print("=" * 70)