<a href="https://colab.research.google.com/github/Prashantthhh/explainable_ai_2081_39/blob/main/assignement_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
data = pd.DataFrame({
    "Flyers_100s": [1, 2, 3, 1, 2],
    "Cars_Washed": [12, 22, 29, 14, 24]
})

X = data[["Flyers_100s"]]
y = data["Cars_Washed"]

model = LinearRegression()
model.fit(X, y)

coef = model.coef_[0]
intercept = model.intercept_
print(f"Coefficient: {coef:.2f}, Intercept: {intercept:.2f}")

baseline = y.mean()
print(f"Baseline (mean y): {baseline:.2f}")


predictions = model.predict(X)

shap_values = predictions - baseline


check = np.isclose(predictions, baseline + shap_values).all()
print(f"Check Prediction = Baseline + SHAP? {check}")


comparison = np.where(predictions > y, "Over",
                      np.where(predictions < y, "Under", "Exact"))


results = pd.DataFrame({
    "Flyers_100s": X["Flyers_100s"],
    "Actual": y,
    "Predicted": predictions.round(2),
    "Baseline": baseline,
    "SHAP_Value": shap_values.round(2),
    "Over/Under": comparison
})

print(results)



Coefficient: 8.29, Intercept: 5.29
Baseline (mean y): 20.20
Check Prediction = Baseline + SHAP? True
   Flyers_100s  Actual  Predicted  Baseline  SHAP_Value Over/Under
0            1      12      13.57      20.2       -6.63       Over
1            2      22      21.86      20.2        1.66      Under
2            3      29      30.14      20.2        9.94       Over
3            1      14      13.57      20.2       -6.63      Under
4            2      24      21.86      20.2        1.66      Under


In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

data = pd.DataFrame({
    "Chargers": [5, 3, 4, 2, 5],
    "PeakHour": [1, 0, 1, 0, 0],
    "Sessions": [80, 40, 70, 30, 60]
})

X = data[["Chargers", "PeakHour"]]
y = data["Sessions"]


model = LinearRegression()
model.fit(X, y)

coef_chargers, coef_peak = model.coef_
intercept = model.intercept_
print(f"Coefficients -> Chargers: {coef_chargers:.2f}, PeakHour: {coef_peak:.2f}")
print(f"Intercept: {intercept:.2f}")


baseline = y.mean()
print(f"Baseline (mean y): {baseline:.2f}")


predictions = model.predict(X)


mean_X = X.mean()
shap_chargers = (X["Chargers"] - mean_X["Chargers"]) * coef_chargers
shap_peak = (X["PeakHour"] - mean_X["PeakHour"]) * coef_peak


check = np.isclose(predictions, baseline + shap_chargers + shap_peak).all()
print(f"Check Prediction Equation: {check}")

comparison = np.where(predictions > y, "Over",
                      np.where(predictions < y, "Under", "Exact"))


results = pd.DataFrame({
    "Chargers": X["Chargers"],
    "PeakHour": X["PeakHour"],
    "Actual": y,
    "Predicted": predictions.round(2),
    "Baseline": baseline,
    "SHAP_Chargers": shap_chargers.round(2),
    "SHAP_PeakHour": shap_peak.round(2),
    "Over/Under": comparison
})

print("\nFinal Results Table:\n")
print(results)

Coefficients -> Chargers: 10.00, PeakHour: 20.00
Intercept: 10.00
Baseline (mean y): 56.00
Check Prediction Equation: True

Final Results Table:

   Chargers  PeakHour  Actual  Predicted  Baseline  SHAP_Chargers  \
0         5         1      80       80.0      56.0           12.0   
1         3         0      40       40.0      56.0           -8.0   
2         4         1      70       70.0      56.0            2.0   
3         2         0      30       30.0      56.0          -18.0   
4         5         0      60       60.0      56.0           12.0   

   SHAP_PeakHour Over/Under  
0           12.0      Exact  
1           -8.0      Exact  
2           12.0      Exact  
3           -8.0       Over  
4           -8.0      Exact  


In [3]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

diabetes = load_diabetes()
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.Series(diabetes.target, name="disease_progression")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = LinearRegression()
model.fit(X_train, y_train)

coefficients = pd.Series(model.coef_, index=X.columns)
intercept = model.intercept_
print("Model Coefficients:\n", coefficients)
print(f"Intercept: {intercept:.2f}")

baseline = y_train.mean()
print(f"Baseline (mean training y): {baseline:.2f}")


predictions = model.predict(X_test)


mean_X_train = X_train.mean()
shap_values_df = (X_test - mean_X_train) * coefficients

verification = np.allclose(predictions, baseline + shap_values_df.sum(axis=1))
print(f"Prediction verification: {verification}")

comparison = np.where(predictions > y_test, "Over",
                      np.where(predictions < y_test, "Under", "Exact"))


results = pd.DataFrame({
    "Actual": y_test.values,
    "Predicted": predictions.round(2),
    "Baseline": baseline,
    "Total_SHAP_Sum": shap_values_df.sum(axis=1).round(2),
    "Over/Under": comparison
}, index=X_test.index)


results = pd.concat([results, shap_values_df.round(2)], axis=1)

print("\nSample Results with SHAP values:\n")
print(results.head())

Model Coefficients:
 age     37.904021
sex   -241.964362
bmi    542.428759
bp     347.703844
s1    -931.488846
s2     518.062277
s3     163.419983
s4     275.317902
s5     736.198859
s6      48.670657
dtype: float64
Intercept: 151.35
Baseline (mean training y): 153.74
Prediction verification: True

Sample Results with SHAP values:

     Actual  Predicted    Baseline  Total_SHAP_Sum Over/Under   age    sex  \
287   219.0     139.55  153.736544          -14.19      Under  1.66  10.85   
211    70.0     179.52  153.736544           25.78       Over  3.45  10.85   
72    202.0     134.04  153.736544          -19.70      Under  2.35 -12.22   
321   230.0     291.42  153.736544          137.68       Over  3.59  10.85   
73    111.0     123.79  153.736544          -29.95       Over  0.42 -12.22   

       bmi     bp      s1     s2     s3     s4     s5    s6  
287  -4.31  -5.97 -116.97  65.28   3.30   9.34  22.98 -0.35  
211  19.08   7.20   22.73  -8.21   0.29 -10.98 -17.47 -1.15  
72   -3.14 

In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

data = pd.DataFrame({
    "studytime": [2, 3, 1, 4, 2],
    "absences": [4, 2, 10, 0, 6],
    "failures": [0, 1, 2, 0, 1],
    "health": [5, 3, 4, 2, 4],
    "G3": [15, 14, 10, 18, 12]
})

X = data.drop(columns=["G3"])
y = data["G3"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)

coefficients = pd.Series(model.coef_, index=X.columns)
intercept = model.intercept_
print("Model Coefficients:\n", coefficients)
print(f"Intercept: {intercept:.2f}")

baseline = y_train.mean()
print(f"Baseline (mean training G3): {baseline:.2f}")


predictions = model.predict(X_test)


mean_X_train = X_train.mean()
shap_values_df = (X_test - mean_X_train) * coefficients


verification = np.allclose(predictions, baseline + shap_values_df.sum(axis=1))
print(f"Prediction verification: {verification}")


comparison = np.where(predictions > y_test, "Over",
                      np.where(predictions < y_test, "Under", "Exact"))

results = pd.DataFrame({
    "Actual": y_test.values,
    "Predicted": predictions.round(2),
    "Baseline": baseline,
    "Total_SHAP_Sum": shap_values_df.sum(axis=1).round(2),
    "Over/Under": comparison
}, index=X_test.index)


results = pd.concat([results, shap_values_df.round(2)], axis=1)

print("\nFinal Results with SHAP values:\n")
print(results)

Model Coefficients:
 studytime    0.142235
absences    -0.725689
failures    -0.220610
health       0.062409
dtype: float64
Intercept: 17.31
Baseline (mean training G3): 14.33
Prediction verification: True

Final Results with SHAP values:

   Actual  Predicted   Baseline  Total_SHAP_Sum Over/Under  studytime  \
1      14      16.25  14.333333            1.91       Over       0.09   
4      12      13.27  14.333333           -1.07       Over      -0.05   

   absences  failures  health  
1      1.94     -0.07   -0.04  
4     -0.97     -0.07    0.02  
