In [1]:
!pip install scikit-learn==1.5.2

Collecting scikit-learn==1.5.2
  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.5.2


In [11]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

housing = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv')

In [6]:
# Load housing data
housing_data = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv')

housing_data['date'] = pd.to_datetime(housing_data['date'], format='%Y%m%dT%H%M%S', errors='coerce')

# Drop ID column if present
housing_data.drop(columns=['id'], inplace=True, errors='ignore')

# Feature Engineering
housing_data['house_age'] = 2025 - housing_data['yr_built']
housing_data['was_renovated'] = (housing_data['yr_renovated'] > 0).astype(int)
housing_data.drop(columns=['date', 'yr_built', 'yr_renovated'], inplace=True)

# One-hot encode categorical variable (zipcode)
housing_data = pd.get_dummies(housing_data, columns=['zipcode'])

# Split into features and target
X = housing_data.drop(columns=['price'])
y = housing_data['price']

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train model
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
model.fit(X_train_scaled, y_train)

# Predictions and confidence interval
y_pred = model.predict(X_val_scaled)
residuals = y_val - y_pred
std_residual = np.std(residuals)
confidence_interval = 1.96 * std_residual
y_pred_lower = y_pred - confidence_interval
y_pred_upper = y_pred + confidence_interval

# Evaluation metrics
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)

# Load holdout test datasets
test_full = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing_holdout_test.csv')
test_mini = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing_holdout_test_mini.csv')

# Function to prepare test set
all_predictions = {}
for test_set, name in zip([test_full, test_mini], ['Full Test Set', 'Mini Test Set']):
    test_set['house_age'] = 2025 - test_set['yr_built']
    test_set['was_renovated'] = (test_set['yr_renovated'] > 0).astype(int)
    test_set.drop(columns=['id', 'date', 'yr_built', 'yr_renovated'], inplace=True)
    test_set = pd.get_dummies(test_set, columns=['zipcode'])
    test_set = test_set.reindex(columns=X.columns, fill_value=0)
    test_scaled = scaler.transform(test_set)
    test_pred = model.predict(test_scaled)
    print(f"\nSample Predictions for {name}:")
    print(pd.DataFrame({"Predicted Price": test_pred[:5]}))
    all_predictions[name] = test_pred

# Save predictions from full test set to CSV with correct format
submission = pd.DataFrame({'price': all_predictions['Full Test Set']})
submission.to_csv('team8-module3-predictions.csv', index=False)



# Final evaluation summary
eval_results = {
    "Mean Absolute Error": mae,
    "Root Mean Squared Error": rmse,
    "R² Score": r2,
    "Confidence Interval ±": confidence_interval
}
print("\nEvaluation Results:\n", eval_results)


Sample Predictions for Full Test Set:
   Predicted Price
0     3.768997e+05
1     8.468454e+05
2     1.167513e+06
3     2.138142e+06
4     6.901859e+05

Sample Predictions for Mini Test Set:
   Predicted Price
0       446557.250
1       665407.125
2       191530.500
3       349510.250
4       465175.250

Evaluation Results:
 {'Mean Absolute Error': np.float64(69574.790515625), 'Root Mean Squared Error': np.float64(138383.31716643626), 'R² Score': 0.8730264553614253, 'Confidence Interval ±': 271017.6367705162}
