# SHAP Value Calculation for Linear Regression

# In [None]:

# SHAP Value Calculation for Linear Regression

## **Step 1: Load Data and Train Model**

import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load the Boston Housing data into a feature frame and a target series.
# NOTE(review): scikit-learn deprecated load_boston in 1.0 and removed it in
# 1.2, so this cell presumably needs scikit-learn < 1.2 — confirm the pinned
# version before running.
boston = load_boston()
X = pd.DataFrame(data=boston.data, columns=boston.feature_names)
y = pd.Series(data=boston.target, name="MEDV")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training split (fit() returns the estimator).
linear_model = LinearRegression().fit(X_train, y_train)

# Pick the test row whose prediction will be explained.
instance_idx = 0
instance = X_test.iloc[instance_idx]

## **Step 2: Calculate Baseline Value**

# Baseline = average of the model's predictions over the training rows.
# Every SHAP contribution below is measured relative to this value.
train_predictions = linear_model.predict(X_train)
baseline_value = train_predictions.mean()
print(f"Baseline Value (Mean Prediction): {baseline_value:.2f}")

## **Step 3: Get Feature Coefficients and Means**

# Per-feature (column-wise) means of the training data, indexed by feature name.
feature_means = X_train.mean(axis=0)

# Fitted weights of the linear model, in the same order as the columns.
coefficients = linear_model.coef_

## **Step 4: Calculate SHAP Values**

# For a linear model, each feature's SHAP value is its coefficient times the
# distance of the instance's value from that feature's training mean.
# Columns and coefficients are paired positionally.
shap_values = {
    feature: (instance[feature] - feature_means[feature]) * coefficient
    for feature, coefficient in zip(X_test.columns, coefficients)
}

# Report the contribution of every feature, one per line.
print("\nSHAP Values for Each Feature:")
for feature, value in shap_values.items():
    print(f"{feature}: {value:.2f}")

## **Step 5: Verify the Prediction**

# Additivity: baseline plus the sum of all SHAP values should reproduce the
# model's prediction for this instance.
prediction = baseline_value + sum(shap_values.values())

# Ask the model directly so the reconstruction can actually be verified.
# iloc[[idx]] keeps a one-row DataFrame, which is the 2-D input predict() expects.
model_prediction = linear_model.predict(X_test.iloc[[instance_idx]])[0]

# Display the reconstructed prediction, the model's own prediction, and the
# ground-truth target. The target is shown for context only; it will generally
# differ from the prediction by the model's error.
print(f"\nPrediction for Instance {instance_idx}: {prediction:.2f}")
print(f"Model Prediction: {model_prediction:.2f}")
print(f"Actual Value: {y_test.iloc[instance_idx]:.2f}")

# Flag a broken decomposition instead of letting it pass silently.
if not np.isclose(prediction, model_prediction):
    print("Warning: baseline + SHAP values does not match the model prediction.")

## **Step 6: Interpret Results**

# Key points:
# 1. The baseline value represents the model's average prediction across the training set.
# 2. Positive SHAP values increase the prediction above the baseline.
# 3. Negative SHAP values decrease the prediction below the baseline.
