<a href="https://colab.research.google.com/github/Shivamani162/EAI_lab/blob/main/Lab_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Question 1**

In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

uploads = [1, 2, 3, 1, 2]
views = [500, 750, 950, 550, 800]

X = np.array(uploads).reshape(-1, 1)
y = np.array(views)

model = LinearRegression()
model.fit(X, y)
slope = model.coef_[0]
intercept = model.intercept_

baseline = np.mean(y)
predictions = model.predict(X)
shap_values = predictions - baseline

df = pd.DataFrame({
    "Videos Uploaded": uploads,
    "Actual Views": y,
    "Predicted Views": predictions.round(2),
    "SHAP Value": shap_values.round(2),
    "Over/Under": ["Over" if p > a else "Under" if p < a else "Exact"
                   for p, a in zip(predictions, y)]
})

print("\n--- Linear Regression Implementation ---")
print(f"Coefficient (slope): {slope:.2f}")
print(f"Intercept: {intercept:.2f}")

print("\n--- Baseline ---")
print(f"Mean Weekly Views: {baseline:.2f}")

print("\n--- SHAP Values & Predictions ---")
print(df.to_string(index=False))

print("\n--- Explanation of Input Influence ---")
for idx, row in df.iterrows():
    effect = row["SHAP Value"]
    sign = "increase" if effect > 0 else "decrease"
    print(f"{row['Videos Uploaded']} videos → {sign} of {abs(effect):.2f} views from baseline.")

print("\n--- Accuracy Metrics ---")
r2 = r2_score(y, predictions)
mae = mean_absolute_error(y, predictions)
print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error: {mae:.2f}")

print("\n--- Trend Analysis ---")
if slope > 0:
    print(f"Weekly views increase by about {slope:.2f} for each additional video uploaded.")
elif slope < 0:
    print(f"Weekly views decrease by about {abs(slope):.2f} for each additional video uploaded.")
else:
    print("Number of videos uploaded has no linear effect on weekly views.")

print("\n--- SHAP Interpretation Insights ---")
print("SHAP values show how much the number of uploaded videos shifts the prediction away from the baseline.")
print("Positive SHAP → more views than baseline; Negative SHAP → fewer views than baseline.")
print("The farther SHAP is from zero, the stronger the effect of that input on the prediction.")



--- Linear Regression Implementation ---
Coefficient (slope): 217.86
Intercept: 317.86

--- Baseline ---
Mean Weekly Views: 710.00

--- SHAP Values & Predictions ---
 Videos Uploaded  Actual Views  Predicted Views  SHAP Value Over/Under
               1           500           535.71     -174.29       Over
               2           750           753.57       43.57       Over
               3           950           971.43      261.43       Over
               1           550           535.71     -174.29      Under
               2           800           753.57       43.57      Under

--- Explanation of Input Influence ---
1 videos → decrease of 174.29 views from baseline.
2 videos → increase of 43.57 views from baseline.
3 videos → increase of 261.43 views from baseline.
1 videos → decrease of 174.29 views from baseline.
2 videos → increase of 43.57 views from baseline.

--- Accuracy Metrics ---
R² Score: 0.9700
Mean Absolute Error: 24.29

--- Trend Analysis ---
Weekly views increas

**Question 2**

In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

notifications = [5, 7, 4, 6, 3]
delivery_time = [30, 25, 35, 20, 40]
retention = [75, 85, 70, 90, 65]

X = np.array(list(zip(notifications, delivery_time)))
y = np.array(retention)

model = LinearRegression()
model.fit(X, y)
coef_notif, coef_time = model.coef_
intercept = model.intercept_

baseline = np.mean(y)

predictions = model.predict(X)
shap_total = predictions - baseline
shap_notif = shap_total * (coef_notif / (abs(coef_notif) + abs(coef_time)))
shap_time = shap_total * (coef_time / (abs(coef_notif) + abs(coef_time)))

df = pd.DataFrame({
    "Notifications": notifications,
    "Delivery Time": delivery_time,
    "Actual Retention": y,
    "Predicted Retention": predictions.round(2),
    "SHAP (Notifications)": shap_notif.round(2),
    "SHAP (Delivery Time)": shap_time.round(2),
    "Verification": np.round(baseline + shap_notif + shap_time, 2),
    "Over/Under": ["Over" if p > a else "Under" if p < a else "Exact"
                   for p, a in zip(predictions, y)]
})

print("\n--- Model Coefficients ---")
print(f"Notifications Coefficient: {coef_notif:.4f}")
print(f"Delivery Time Coefficient: {coef_time:.4f}")
print(f"Intercept: {intercept:.4f}")

print("\n--- Baseline Value ---")
print(f"Mean Retention: {baseline:.2f}%")

print("\n--- Predictions & SHAP Values ---")
print(df.to_string(index=False))

print("\n--- Interpretation ---")
for idx, row in df.iterrows():
    print(
        f"Row {idx+1}: Notifications {row['Notifications']} "
        f"→ SHAP {row['SHAP (Notifications)']:+.2f}, "
        f"Delivery Time {row['Delivery Time']} "
        f"→ SHAP {row['SHAP (Delivery Time)']:+.2f}. "
        f"Predicted {row['Predicted Retention']}% vs Actual {row['Actual Retention']}% "
        f"→ {row['Over/Under']} prediction."
    )



--- Model Coefficients ---
Notifications Coefficient: 0.7895
Delivery Time Coefficient: -1.1579
Intercept: 107.7895

--- Baseline Value ---
Mean Retention: 77.00%

--- Predictions & SHAP Values ---
 Notifications  Delivery Time  Actual Retention  Predicted Retention  SHAP (Notifications)  SHAP (Delivery Time)  Verification Over/Under
             5             30                75                77.00                  0.00                 -0.00         77.00       Over
             7             25                85                84.37                  2.99                 -4.38         75.61      Under
             4             35                70                70.42                 -2.67                  3.91         78.24       Over
             6             20                90                89.37                  5.01                 -7.35         74.66      Under
             3             40                65                63.84                 -5.33                  7.8

**Question 3**

In [5]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

data = load_diabetes()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

baseline = np.mean(y_train)

predictions = model.predict(X_test)
shap_total = predictions - baseline
coefs = model.coef_
shap_values = shap_total.reshape(-1, 1) * (coefs / sum(abs(coefs)))

df = pd.DataFrame(X_test)
for i, col in enumerate(X_test.columns):
    df[f"SHAP {col}"] = shap_values[:, i]
df["Predicted"] = predictions.round(2)
df["Actual"] = y_test
df["Verification"] = np.round(baseline + shap_values.sum(axis=1), 2)
df["Over/Under"] = ["Over" if p > a else "Under" if p < a else "Exact"
                    for p, a in zip(predictions, y_test)]

print("\n--- Diabetes Results (first 5 rows) ---")
print(df.head().to_string(index=False))

print("\n--- Interpretation ---")
for idx, row in df.head().iterrows():
    print(f"Row {idx+1}:")
    for col in X_test.columns:
        print(f"  {col}: SHAP {row[f'SHAP {col}']:+.2f}")
    print(f"Predicted {row['Predicted']} vs Actual {row['Actual']} → {row['Over/Under']} prediction.")



--- Diabetes Results (first 5 rows) ---
     age       sex       bmi        bp        s1        s2        s3        s4        s5        s6  SHAP age  SHAP sex  SHAP bmi   SHAP bp    SHAP s1   SHAP s2   SHAP s3   SHAP s4   SHAP s5   SHAP s6  Predicted  Actual  Verification Over/Under
0.045341 -0.044642 -0.006206 -0.015999  0.125019  0.125198  0.019187  0.034309  0.032432 -0.005220 -0.139942  0.893335 -2.002653 -1.283726   3.439067 -1.912691 -0.603348 -1.016477 -2.718054 -0.179693     139.55   219.0        148.21      Under
0.092564 -0.044642  0.036907  0.021872 -0.024960 -0.016658  0.000779 -0.039493 -0.022517 -0.021788  0.254268 -1.623144  3.638718  2.332465  -6.248609  3.475263  1.096253  1.846886  4.938566  0.326492     179.52    70.0        163.77       Over
0.063504  0.050680 -0.004050 -0.012556  0.103003  0.048790  0.056003 -0.002592  0.084492 -0.017646 -0.194274  1.240168 -2.780173 -1.782127   4.774267 -2.655284 -0.837595 -1.411119 -3.773325 -0.249457     134.04   202.0        1

**Question 4**

In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

try:
    df = pd.read_csv("student_performance.csv")
except FileNotFoundError:
    df = pd.DataFrame({
        "studytime": [2, 3, 1, 4, 2],
        "absences": [4, 2, 10, 0, 6],
        "G1": [15, 14, 10, 18, 12],
        "G2": [14, 15, 9, 19, 13],
        "final_score": [15, 16, 8, 20, 12]
    })

X = df.drop(columns=["final_score"])
y = df["final_score"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

baseline = np.mean(y_train)

predictions = model.predict(X_test)
shap_total = predictions - baseline
coefs = model.coef_
shap_values = shap_total.reshape(-1, 1) * (coefs / sum(abs(coefs)))

df_results = X_test.copy()
for i, col in enumerate(X_test.columns):
    df_results[f"SHAP {col}"] = shap_values[:, i]
df_results["Predicted"] = predictions.round(2)
df_results["Actual"] = y_test
df_results["Verification"] = np.round(baseline + shap_values.sum(axis=1), 2)
df_results["Over/Under"] = ["Over" if p > a else "Under" if p < a else "Exact"
                            for p, a in zip(predictions, y_test)]

print("\n--- Student Performance Results ---")
print(df_results.to_string(index=False))

print("\n--- Interpretation ---")
for idx, row in df_results.iterrows():
    print(f"Row {idx+1}:")
    for col in X_test.columns:
        print(f"  {col}: SHAP {row[f'SHAP {col}']:+.2f}")
    print(f"Predicted {row['Predicted']} vs Actual {row['Actual']} → {row['Over/Under']} prediction.")



--- Student Performance Results ---
 studytime  absences  G1  G2  SHAP studytime  SHAP absences   SHAP G1   SHAP G2  Predicted  Actual  Verification Over/Under
         3         2  14  15       -0.020585      -0.617544  0.545497  0.360234      15.88      16         14.60      Under
         2         6  12  13        0.027836       0.835088 -0.737661 -0.487135      12.25      12         13.97       Over

--- Interpretation ---
Row 2:
  studytime: SHAP -0.02
  absences: SHAP -0.62
  G1: SHAP +0.55
  G2: SHAP +0.36
Predicted 15.88 vs Actual 16 → Under prediction.
Row 5:
  studytime: SHAP +0.03
  absences: SHAP +0.84
  G1: SHAP -0.74
  G2: SHAP -0.49
Predicted 12.25 vs Actual 12 → Over prediction.
