<a href="https://colab.research.google.com/github/Shivamani162/EAI_lab/blob/main/Lab_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Question 1**

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

uploads = [1, 2, 3, 1, 2]
views = [500, 750, 950, 550, 800]

X = np.array(uploads).reshape(-1, 1)
y = np.array(views)

model = LinearRegression()
model.fit(X, y)
slope = model.coef_[0]
intercept = model.intercept_

baseline = np.mean(y)
predictions = model.predict(X)
shap_values = predictions - baseline

df = pd.DataFrame({
    "Videos Uploaded": uploads,
    "Actual Views": y,
    "Predicted Views": predictions.round(2),
    "SHAP Value": shap_values.round(2),
    "Over/Under": ["Over" if p > a else "Under" if p < a else "Exact"
                   for p, a in zip(predictions, y)]
})

print("\n--- Linear Regression Implementation ---")
print(f"Coefficient (slope): {slope:.2f}")
print(f"Intercept: {intercept:.2f}")

print("\n--- Baseline ---")
print(f"Mean Weekly Views: {baseline:.2f}")

print("\n--- SHAP Values & Predictions ---")
print(df.to_string(index=False))

print("\n--- Explanation of Input Influence ---")
for idx, row in df.iterrows():
    effect = row["SHAP Value"]
    sign = "increase" if effect > 0 else "decrease"
    print(f"{row['Videos Uploaded']} videos → {sign} of {abs(effect):.2f} views from baseline.")

print("\n--- Accuracy Metrics ---")
r2 = r2_score(y, predictions)
mae = mean_absolute_error(y, predictions)
print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error: {mae:.2f}")

print("\n--- Trend Analysis ---")
if slope > 0:
    print(f"Weekly views increase by about {slope:.2f} for each additional video uploaded.")
elif slope < 0:
    print(f"Weekly views decrease by about {abs(slope):.2f} for each additional video uploaded.")
else:
    print("Number of videos uploaded has no linear effect on weekly views.")

print("\n--- SHAP Interpretation Insights ---")
print("SHAP values show how much the number of uploaded videos shifts the prediction away from the baseline.")
print("Positive SHAP → more views than baseline; Negative SHAP → fewer views than baseline.")
print("The farther SHAP is from zero, the stronger the effect of that input on the prediction.")



--- Linear Regression Implementation ---
Coefficient (slope): 217.86
Intercept: 317.86

--- Baseline ---
Mean Weekly Views: 710.00

--- SHAP Values & Predictions ---
 Videos Uploaded  Actual Views  Predicted Views  SHAP Value Over/Under
               1           500           535.71     -174.29       Over
               2           750           753.57       43.57       Over
               3           950           971.43      261.43       Over
               1           550           535.71     -174.29      Under
               2           800           753.57       43.57      Under

--- Explanation of Input Influence ---
1 videos → decrease of 174.29 views from baseline.
2 videos → increase of 43.57 views from baseline.
3 videos → increase of 261.43 views from baseline.
1 videos → decrease of 174.29 views from baseline.
2 videos → increase of 43.57 views from baseline.

--- Accuracy Metrics ---
R² Score: 0.9700
Mean Absolute Error: 24.29

--- Trend Analysis ---
Weekly views increas

**Question 2**

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

notifications = [5, 7, 4, 6, 3]
delivery_time = [30, 25, 35, 20, 40]
retention = [75, 85, 70, 90, 65]

X = np.array(list(zip(notifications, delivery_time)))
y = np.array(retention)

model = LinearRegression()
model.fit(X, y)
coef_notif, coef_time = model.coef_
intercept = model.intercept_

baseline = np.mean(y)

predictions = model.predict(X)
shap_total = predictions - baseline
shap_notif = shap_total * (coef_notif / (abs(coef_notif) + abs(coef_time)))
shap_time = shap_total * (coef_time / (abs(coef_notif) + abs(coef_time)))

df = pd.DataFrame({
    "Notifications": notifications,
    "Delivery Time": delivery_time,
    "Actual Retention": y,
    "Predicted Retention": predictions.round(2),
    "SHAP (Notifications)": shap_notif.round(2),
    "SHAP (Delivery Time)": shap_time.round(2),
    "Verification": np.round(baseline + shap_notif + shap_time, 2),
    "Over/Under": ["Over" if p > a else "Under" if p < a else "Exact"
                   for p, a in zip(predictions, y)]
})

print("\n--- Model Coefficients ---")
print(f"Notifications Coefficient: {coef_notif:.4f}")
print(f"Delivery Time Coefficient: {coef_time:.4f}")
print(f"Intercept: {intercept:.4f}")

print("\n--- Baseline Value ---")
print(f"Mean Retention: {baseline:.2f}%")

print("\n--- Predictions & SHAP Values ---")
print(df.to_string(index=False))

print("\n--- Interpretation ---")
for idx, row in df.iterrows():
    print(
        f"Row {idx+1}: Notifications {row['Notifications']} "
        f"→ SHAP {row['SHAP (Notifications)']:+.2f}, "
        f"Delivery Time {row['Delivery Time']} "
        f"→ SHAP {row['SHAP (Delivery Time)']:+.2f}. "
        f"Predicted {row['Predicted Retention']}% vs Actual {row['Actual Retention']}% "
        f"→ {row['Over/Under']} prediction."
    )



--- Model Coefficients ---
Notifications Coefficient: 0.7895
Delivery Time Coefficient: -1.1579
Intercept: 107.7895

--- Baseline Value ---
Mean Retention: 77.00%

--- Predictions & SHAP Values ---
 Notifications  Delivery Time  Actual Retention  Predicted Retention  SHAP (Notifications)  SHAP (Delivery Time)  Verification Over/Under
             5             30                75                77.00                  0.00                 -0.00         77.00       Over
             7             25                85                84.37                  2.99                 -4.38         75.61      Under
             4             35                70                70.42                 -2.67                  3.91         78.24       Over
             6             20                90                89.37                  5.01                 -7.35         74.66      Under
             3             40                65                63.84                 -5.33                  7.8

**Question 3**

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

df = pd.read_csv("/content/diabetes.csv")

X = df.drop(columns=["Outcome"])
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

coefficients = model.coef_
intercept = model.intercept_
feature_names = X.columns

baseline = np.mean(y_train)

y_pred = model.predict(X_test)
shap_values = pd.DataFrame(index=X_test.index, columns=feature_names)

for feature in feature_names:
    shap_values[feature] = model.coef_[list(feature_names).index(feature)] * (X_test[feature] - np.mean(X_train[feature]))

total_shap = shap_values.sum(axis=1)
verification = np.isclose(y_pred, baseline + total_shap)

comparison = ["Over" if p > a else "Under" if p < a else "Exact"
              for p, a in zip(y_pred, y_test)]

df_results = X_test.copy()
df_results["Actual Outcome"] = y_test
df_results["Predicted Outcome"] = y_pred.round(4)
df_results["Baseline"] = baseline
for feature in feature_names:
    df_results[f"SHAP {feature}"] = shap_values[feature].round(4)
df_results["Total SHAP"] = total_shap.round(4)
df_results["Verification"] = verification
df_results["Prediction Type"] = comparison

print("Multiple Linear Regression Implementation")
print(f"Intercept: {intercept:.4f}")
for name, coef in zip(feature_names, coefficients):
    print(f"{name}: {coef:.4f}")

print("Baseline Value")
print(f"Baseline: {baseline:.4f}")

print("SHAP Values and Predictions Table")
print(df_results.head())

print("Explanation of Feature Influence")
for idx in df_results.index[:5]:
    print(f"Record {idx}:")
    for feature in feature_names:
        contrib = "increased" if df_results.loc[idx, f"SHAP {feature}"] > 0 else "decreased"
        print(f"  {feature} {contrib} prediction by {abs(df_results.loc[idx, f'SHAP {feature}']):.4f}")
    print(f"Predicted={df_results.loc[idx, 'Predicted Outcome']}, "
          f"Actual={df_results.loc[idx, 'Actual Outcome']}, "
          f"Type={df_results.loc[idx, 'Prediction Type']}")

print("Model Accuracy")
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")

print("SHAP Interpretation Insights")
print("SHAP values represent how much each feature contributed to the prediction relative to the baseline.")
print("Positive values indicate an increase in predicted disease progression score, negative values indicate a decrease.")


Multiple Linear Regression Implementation
Intercept: -0.9488
Pregnancies: 0.0105
Glucose: 0.0056
BloodPressure: -0.0023
SkinThickness: 0.0005
Insulin: -0.0003
BMI: 0.0150
DiabetesPedigreeFunction: 0.1113
Age: 0.0065
Baseline Value
Baseline: 0.3469
SHAP Values and Predictions Table
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
668            6       98             58             33      190  34.0   
324            2      112             75             32        0  35.7   
624            2      108             64              0        0  30.8   
690            8      107             80              0        0  24.6   
473            7      136             90              0        0  29.9   

     DiabetesPedigreeFunction  Age  Actual Outcome  Predicted Outcome  ...  \
668                     0.430   43               0             0.3355  ...   
324                     0.148   21               0             0.2381  ...   
624                     0.158   21     

**Question 4**

In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

df = pd.read_csv("/content/Student_Performance.csv")

df["Extracurricular Activities"] = df["Extracurricular Activities"].map({"Yes": 1, "No": 0})

X = df.drop(columns=["Performance Index"])
y = df["Performance Index"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

intercept = model.intercept_
coefficients = pd.Series(model.coef_, index=X.columns)

baseline = np.mean(y_train)

y_pred = model.predict(X_test)

shap_df = pd.DataFrame(index=X_test.index, columns=X.columns, dtype=float)
for feature in X.columns:
    shap_df[feature] = model.coef_[list(X.columns).index(feature)] * (X_test[feature] - X_train[feature].mean())

total_shap = shap_df.sum(axis=1)
verification = np.isclose(y_pred, baseline + total_shap)

over_under = ["Over" if p > a else "Under" if p < a else "Exact" for p, a in zip(y_pred, y_test)]

df_results = X_test.copy()
df_results["Actual Score"] = y_test
df_results["Predicted Score"] = np.round(y_pred, 2)
df_results["Baseline"] = baseline
for col in X.columns:
    df_results[f"SHAP {col}"] = np.round(shap_df[col], 4)
df_results["Total SHAP"] = np.round(total_shap, 4)
df_results["Verification (Baseline + SHAPs)"] = np.round(baseline + total_shap, 4)
df_results["Over/Under"] = over_under

print("Multiple Linear Regression Implementation")
print("Intercept:", float(intercept))
print("Coefficients:")
for name, val in coefficients.items():
    print(f"  {name}: {val}")

print("Baseline (mean of training final scores)")
print("Baseline:", float(baseline))

print("SHAP values and predictions (first 10 rows)")
print(df_results.head(10).to_string())

print("Explanation of contributions for each test record (first 10 rows)")
for idx in df_results.index[:10]:
    print(f"Record index {idx}:")
    for col in X.columns:
        val = df_results.loc[idx, f"SHAP {col}"]
        direction = "increased" if val > 0 else "decreased" if val < 0 else "no change"
        print(f"  {col}: {direction} prediction by {abs(val):.4f}")
    print(f"  Predicted Score: {df_results.loc[idx, 'Predicted Score']}, Actual Score: {df_results.loc[idx, 'Actual Score']}, {df_results.loc[idx, 'Over/Under']} prediction")

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Model accuracy on test set")
print("R2 score:", r2)
print("MAE:", mae)
print("RMSE:", rmse)

Multiple Linear Regression Implementation
Intercept: -33.836543807141744
Coefficients:
  Hours Studied: 2.8589522305235064
  Previous Scores: 1.0155197942880816
  Extracurricular Activities: 0.5817128541736178
  Sleep Hours: 0.47967675981703317
  Sample Question Papers Practiced: 0.19039415655051053
Baseline (mean of training final scores)
Baseline: 55.39971428571429
SHAP values and predictions (first 10 rows)
      Hours Studied  Previous Scores  Extracurricular Activities  Sleep Hours  Sample Question Papers Practiced  Actual Score  Predicted Score   Baseline  SHAP Hours Studied  SHAP Previous Scores  SHAP Extracurricular Activities  SHAP Sleep Hours  SHAP Sample Question Papers Practiced  Total SHAP  Verification (Baseline + SHAPs) Over/Under
6252              5               69                           0            8                                 2          51.0            54.75  55.399714              0.0441               -0.6119                          -0.2852            0.69