In [None]:
pip install pandas numpy nltk seaborn transformers torch xgboost scikit-learn matplotlib

In [None]:
import os
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from transformers import BertTokenizer, BertModel
import torch
from sklearn.linear_model import LinearRegression

In [None]:
# Location of the house-data CSV, relative to the notebook's working directory
DATA_PATH = os.path.join("data", "kc_house_data.csv")

# Read the raw table into a DataFrame
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [None]:
# ==========================
# STEP 1: Preprocess Structured Data
# ==========================

categorical_cols = ["zipcode"]

# Fail before mutating anything if an input column is absent. This typically
# means the cell already ran on this `df` (it drops `zipcode` and rescales
# columns in place), in which case `df` has to be reloaded first.
required_cols = ["price", "sqft_living", "sqft_lot", "sqft_above", "sqft_basement", *categorical_cols]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"Missing input columns {missing_cols}; reload `df` before re-running this cell.")

# Parse the `date` column; values that cannot be parsed become NaT instead of raising
if "date" in df.columns:
    df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Handle missing values: fill every numeric column with its own median
numeric_cols = df.select_dtypes(include=["number"]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Feature engineering
# NOTE: price_per_sqft is computed from `price`, so it leaks the target if it is
# ever used as an input to a model that predicts `price`.
df["price_per_sqft"] = df["price"] / df["sqft_living"]

# Normalize numerical columns
# NOTE(review): the scaler is fit on every row of `df`, i.e. before any
# train/test split is made.
num_cols = ["sqft_living", "sqft_lot", "sqft_above", "sqft_basement", "price_per_sqft"]
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# One-hot encode categorical features
encoder = OneHotEncoder(sparse_output=False, drop="first")
encoded = encoder.fit_transform(df[categorical_cols])
# Name the dummy columns after the feature and category instead of 0..k-1, and
# reuse df's index so the concat lines up row-for-row whatever index df carries.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)
df = df.drop(columns=categorical_cols)
df = pd.concat([df, encoded_df], axis=1)

In [None]:
# ==========================
# STEP 2: Preprocess Textual Data
# ==========================

# `stopwords` backs the stop-word list below; `punkt` / `punkt_tab` hold the
# tokenizer data that word_tokenize() loads (newer NLTK releases look for
# `punkt_tab`, older ones for `punkt`). With quiet=True a resource id that is
# unavailable is reported through the return value rather than an exception.
for nltk_resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(nltk_resource, quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalise a free-text value before it is embedded.

    Keeps only ASCII letters and whitespace, lowercases the result, tokenizes
    it with ``word_tokenize`` and drops every token found in ``stop_words``.

    Args:
        text: The raw value. Non-string values are converted with ``str()``;
            ``None``, ``pd.NA`` and float NaN are treated as missing.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` for missing
        input.
    """
    # A missing cell would otherwise be stringified ("None", "nan", "<NA>") and
    # survive the cleaning as a bogus token.
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return ""
    text = re.sub(r"[^a-zA-Z\s]", "", str(text))  # Remove special characters
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Clean the free-text property descriptions when the data has a 'description' column
if "description" in df.columns:
    df["description_clean"] = df["description"].apply(clean_text)

# Load the pretrained BERT tokenizer and encoder used to embed the text
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def text_to_embedding(text):
    """Embed text with BERT by mean-pooling the final hidden states.

    Padding positions are excluded from the average through the attention
    mask, so a short text inside a batch is not diluted by pad tokens. A single
    string produces no padding, so its result is the plain mean over its tokens.

    Args:
        text: A string, or a list of strings embedded together as one padded
            batch.

    Returns:
        A NumPy array with one row per input text (one row for a single
        string).
    """
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        output = model(**tokens)
    hidden = output.last_hidden_state
    # Weight of 1 for real tokens and 0 for padding, broadcast over the hidden axis
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # never divide by zero
    return ((hidden * mask).sum(dim=1) / token_counts).numpy()

# Embed every cleaned description (one model forward pass per row)
if "description_clean" in df.columns:
    df["text_embedding"] = df["description_clean"].apply(text_to_embedding)

In [None]:
# ==========================
# STEP 3: Build Predictive Models
# ==========================

# Split Data
# Columns computed from `price` would hand the model the answer:
# `price_per_sqft` is price / sqft_living, and `treatment` / `predicted_price`
# are built from price by the causal-inference step (they are only present if
# that cell ran earlier in this kernel).
TARGET_DERIVED_COLS = ["price_per_sqft", "treatment", "predicted_price"]
X = df.drop(columns=["price"]).drop(columns=TARGET_DERIVED_COLS, errors="ignore")
y = df["price"]

# Turn datetimes into integer seconds (assumes nanosecond-resolution timestamps)
for col in X.select_dtypes(include=["datetime64"]).columns:
    X[col] = X[col].astype(np.int64) // 10**9

# Unpack per-row embedding arrays into one numeric column per dimension; an
# estimator cannot consume a column whose cells are arrays.
if "text_embedding" in X.columns:
    embedding_matrix = np.vstack([np.asarray(vec).reshape(1, -1) for vec in X["text_embedding"]])
    embedding_df = pd.DataFrame(
        embedding_matrix,
        index=X.index,
        columns=[f"text_emb_{i}" for i in range(embedding_matrix.shape[1])],
    )
    X = pd.concat([X.drop(columns=["text_embedding"]), embedding_df], axis=1)

# Keep numeric/boolean features only: raw and cleaned text columns cannot be fit directly
X = X.select_dtypes(include=["number", "bool"])
# scikit-learn requires every feature name to be a string; cast once, before splitting
X.columns = X.columns.astype(str)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Train XGBoost
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Report regression metrics for one model
def evaluate(y_true, y_pred, model_name):
    """Print RMSE, MAE and R² for a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label that prefixes the printed report.

    Returns:
        None. The metrics are only printed.
    """
    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    report = f"{model_name} Performance:\n RMSE: {rmse}, MAE: {mae}, R²: {r2}\n"
    print(report)

# Score both fitted models on the held-out split
for _model_label, _test_preds in (("Random Forest", y_pred_rf), ("XGBoost", y_pred_xgb)):
    evaluate(y_test, _test_preds, _model_label)

In [None]:
# ==========================
# STEP 4: Causal Inference (Regression Discontinuity)
# ==========================

PRICE_THRESHOLD = 500000

# Flag rows whose price is at or above the threshold
df["treatment"] = (df["price"] >= PRICE_THRESHOLD).astype(int)

# NOTE(review): price is both the only regressor and the target here, so the
# fit reproduces its input, and `treatment` is used by neither the model nor
# the plot. A regression-discontinuity design needs a running variable that is
# distinct from the outcome -- TODO confirm the intended variables.
price_matrix = df[["price"]].values.reshape(-1, 1)
rdd_model = LinearRegression()
rdd_model.fit(price_matrix, df["price"])
df["predicted_price"] = rdd_model.predict(price_matrix)

# Plot Regression Discontinuity: observed points, fitted line and the cutoff
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=df["price"], y=df["price"], alpha=0.5, ax=ax)
ax.plot(df["price"], df["predicted_price"], color="red", linewidth=2)
ax.axvline(x=PRICE_THRESHOLD, color="black", linestyle="--", label="Policy Threshold")
ax.set_xlabel("Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Regression Discontinuity Analysis")
ax.legend()
plt.show()