# In [None]:  (notebook cell marker left over from the Jupyter/Colab export)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import shap
import xgboost as xgb

# Seed NumPy's global RNG so the simulated outcome drawn later in this
# script is reproducible from run to run.
np.random.seed(42)

# Load dataset (Colab-only: opens a file-picker widget in the notebook)
from google.colab import files
uploaded = files.upload()  # Mapping keyed by uploaded filename

# Fail with a clear message instead of an opaque StopIteration when the
# upload dialog is cancelled or no file is selected.
if not uploaded:
    raise RuntimeError("No file was uploaded; please upload a CSV dataset.")

# Read the first uploaded file (assumed to be a CSV). The filename is passed
# to read_csv as a path, so this relies on Colab having written the upload
# into the current working directory.
df = pd.read_csv(next(iter(uploaded)))

# Treat the first column as GPA and coerce it to numeric; entries that
# cannot be parsed become NaN.
df = df.rename(columns={df.columns[0]: "GPA"})
df["GPA"] = pd.to_numeric(df["GPA"], errors="coerce")

# Drop rows without a usable GPA. Otherwise `NaN >= cutoff` evaluates to
# False, silently labelling those students as controls, and their NaN
# outcome propagates into every later step that uses Next_Term_GPA.
df = df.dropna(subset=["GPA"]).copy()

# Define Dean's List cutoff
cutoff = 3.5

# Create Running Variable (GPA centered at cutoff)
df["Running_Var"] = df["GPA"] - cutoff

# Define treatment assignment (1 = Dean's List, 0 = Not on Dean's List)
df["Dean_List"] = (df["GPA"] >= cutoff).astype(int)

# Simulate next-term GPA (outcome variable) with a small treatment effect.
# The effect is added ONLY for Dean's List students, so the outcome has a
# genuine jump at the cutoff. Adding the same shift to every student (as a
# non-zero noise mean would) produces no discontinuity to estimate.
treatment_effect = 0.1  # true jump in next-term GPA at the cutoff
noise_sd = 0.3          # standard deviation of the idiosyncratic noise
df["Next_Term_GPA"] = (
    df["GPA"]
    + treatment_effect * df["Dean_List"]
    + np.random.normal(0.0, noise_sd, len(df))
)

### PART 1: VISUALIZE REGRESSION DISCONTINUITY DESIGN (RD PLOT) ###
plt.figure(figsize=(10, 6))

# Raw observations: centered GPA against the next-term outcome.
running_var = df["Running_Var"]
sns.scatterplot(x=running_var, y=df["Next_Term_GPA"], alpha=0.5, label="Students")

# One straight-line fit per side of the cutoff (no scatter, no CI band).
fit_specs = [
    (running_var < 0, "red", "Below Threshold (Control)"),
    (running_var >= 0, "blue", "Above Threshold (Treatment)"),
]
for side_mask, line_color, legend_label in fit_specs:
    side = df[side_mask]
    sns.regplot(
        x=side["Running_Var"],
        y=side["Next_Term_GPA"],
        scatter=False,
        color=line_color,
        ci=None,
        label=legend_label,
    )

# Mark the threshold (0 on the centered scale) and finish the figure.
plt.axvline(0, linestyle="--", color="black", label="GPA 3.5 Cutoff")
plt.title("Regression Discontinuity: Effect of Dean’s List on Next-Term GPA")
plt.xlabel("GPA - 3.5 (Centered Running Variable)")
plt.ylabel("Next Term GPA")
plt.legend()
plt.grid()
plt.show()

### PART 2: LOCAL LINEAR REGRESSION (RD ESTIMATION) ###
# Keep only observations within the bandwidth around the cutoff
# (centered running variable in [-0.5, +0.5], both ends inclusive).
bandwidth = 0.5
df_near_cutoff = df[df["Running_Var"].between(-bandwidth, bandwidth)]

# Split the window into the two sides of the cutoff.
is_control = df_near_cutoff["Dean_List"] == 0
is_treated = df_near_cutoff["Dean_List"] == 1

# A line cannot be fitted to fewer than two points; fail early with a clear
# message rather than an obscure linear-algebra error from statsmodels.
n_control = int(is_control.sum())
n_treated = int(is_treated.sum())
if min(n_control, n_treated) < 2:
    raise ValueError(
        "Not enough observations near the cutoff for RD estimation "
        f"(control: {n_control}, treatment: {n_treated})."
    )

# Define X and Y for RD estimation. has_constant="add" guarantees an
# intercept column even if Running_Var happens to be constant in the window.
X = sm.add_constant(df_near_cutoff["Running_Var"], has_constant="add")
y = df_near_cutoff["Next_Term_GPA"]

# Fit separate regressions for control (below 3.5) and treatment (above 3.5).
# Because the running variable is centered, each intercept is that side's
# predicted outcome exactly at the cutoff.
model_control = sm.OLS(y[is_control], X[is_control]).fit()
model_treatment = sm.OLS(y[is_treated], X[is_treated]).fit()

# Pooled local-linear model with a separate slope on each side. The
# coefficient on Dean_List equals the difference between the two intercepts
# above, i.e. the estimated jump at the cutoff, and this fit also yields a
# standard error and confidence interval for it.
rd_design = pd.DataFrame(
    {
        "const": 1.0,
        "Dean_List": df_near_cutoff["Dean_List"],
        "Running_Var": df_near_cutoff["Running_Var"],
        "Dean_List_x_Running_Var": (
            df_near_cutoff["Dean_List"] * df_near_cutoff["Running_Var"]
        ),
    }
)
# HC1 = heteroskedasticity-robust standard errors.
model_rd = sm.OLS(y, rd_design).fit(cov_type="HC1")
rd_estimate = model_rd.params["Dean_List"]
rd_std_err = model_rd.bse["Dean_List"]
rd_ci_low, rd_ci_high = model_rd.conf_int().loc["Dean_List"]

# Print RD estimates
print("Regression Discontinuity Estimates:")
print("Control Group (Below 3.5 GPA):")
print(model_control.summary())
print("\nTreatment Group (Above 3.5 GPA):")
print(model_treatment.summary())
print(
    f"\nEstimated discontinuity at the cutoff: {rd_estimate:.4f} "
    f"(robust SE {rd_std_err:.4f}, "
    f"95% CI [{rd_ci_low:.4f}, {rd_ci_high:.4f}])"
)

### PART 3: MACHINE LEARNING ANALYSIS WITH SHAP ###
# Select Features for ML
features = ["GPA", "Dean_List"]
target = "Next_Term_GPA"

# Prepare data
X = df[features]
y = df[target]

# Train XGBoost model (fixed random_state for reproducibility)
model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, random_state=42)
model.fit(X, y)

# Compute SHAP values for interpretability
explainer = shap.Explainer(model)
shap_values = explainer(X)

# SHAP Summary Plot (Feature Importance)
# summary_plot defaults to plot_size="auto", which resizes the current
# figure and discards the figsize requested here. Passing plot_size
# explicitly keeps the intended dimensions.
shap_fig_size = (10, 6)
plt.figure(figsize=shap_fig_size)
# show=False leaves the figure open so the title can be added before display.
shap.summary_plot(shap_values, X, plot_type="bar", plot_size=shap_fig_size, show=False)
plt.title("SHAP Feature Importance: Predictors of Next-Term GPA")
plt.show()

### PART 4: CORRELATION HEATMAP ###
# Pairwise correlations among the predictors and the outcome.
heatmap_columns = ["GPA", "Dean_List", "Next_Term_GPA"]
correlation_matrix = df[heatmap_columns].corr()

# Render the matrix as an annotated heatmap (two-decimal cell labels).
heatmap_style = {
    "annot": True,
    "cmap": "coolwarm",
    "fmt": ".2f",
    "linewidths": 0.5,
}
plt.figure(figsize=(6, 5))
sns.heatmap(correlation_matrix, **heatmap_style)
plt.title("Correlation Between Predictors and Next-Term GPA")
plt.show()


