In [19]:
from google.colab import files
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pickle
import warnings
warnings.filterwarnings('ignore')

import io  # local import: wraps the uploaded bytes so pandas can read them

# Ask the user for the dataset. files.upload() returns a dict mapping each
# uploaded file's name to its raw bytes.
print("Upload your AmesHousing.csv file:")
uploaded = files.upload()

# Read the CSV straight from the uploaded bytes instead of a hard-coded path:
# when a file of the same name already exists, Colab saves the new upload
# under a different name (e.g. "AmesHousing (1).csv"), so reading
# 'AmesHousing.csv' from disk could silently load a stale copy.
csv_names = [fname for fname in uploaded if fname.lower().endswith('.csv')]
if csv_names:
    df = pd.read_csv(io.BytesIO(uploaded[csv_names[0]]))
else:
    # No CSV in this upload; fall back to a copy already present in the
    # working directory (the original behaviour).
    df = pd.read_csv('AmesHousing.csv')
print(f"\n✓ Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

print("\n" + "="*60)
print("analysing data")
print("="*60)

# Drop the 'Order' and 'PID' columns when present (presumably row
# identifiers -- confirm against the dataset), then split off the target.
df = df.drop(['Order', 'PID'], axis=1, errors='ignore')
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Impute by assigning the filled column back. The previous
# `X[col].fillna(..., inplace=True)` form is chained in-place assignment: it
# raises a FutureWarning on pandas 2.x and leaves X unchanged under
# Copy-on-Write.
# NOTE(review): medians are computed over all rows before the train/test
# split below, so test rows influence the imputed values -- consider fitting
# the imputation on the training split only.
for col in numeric_features:
    X[col] = X[col].fillna(X[col].median())

# Missing categorical values become their own 'None' category.
for col in categorical_features:
    X[col] = X[col].fillna('None')

# Engineered features, added only when their source columns exist.
if 'Total Bsmt SF' in X.columns and 'Gr Liv Area' in X.columns:
    X['Total_SF'] = X['Total Bsmt SF'] + X['Gr Liv Area']

if 'Year Built' in X.columns and 'Yr Sold' in X.columns:
    X['House_Age'] = X['Yr Sold'] - X['Year Built']

# One-hot encode the categorical columns, dropping the first level of each.
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)
print(f"✓ Features after encoding: {X.shape[1]}")

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\n✓ Train: {X_train.shape[0]} | Test: {X_test.shape[0]}")

print("\n" + "="*60)
print("training phase of models")
print("="*60)

# Candidate regressors, keyed by the display name used in the printed report.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'XGBoost': XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=42, n_jobs=-1)
}

# results maps model name -> {'RMSE', 'MAE', 'R2'} measured on the test split.
results = {}
best_model = None
# Start from -inf rather than 0: R² can be negative, and with a 0 baseline a
# run where every model scores <= 0 would leave best_model as None and break
# the later code that calls best_model.predict.
best_score = float('-inf')
best_name = ""

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = {'RMSE': rmse, 'MAE': mae, 'R2': r2}

    print(f"  RMSE: ${rmse:,.2f}")
    print(f"  MAE: ${mae:,.2f}")
    print(f"  R² Score: {r2:.4f}")

    # Keep the model with the highest test-set R².
    if r2 > best_score:
        best_score = r2
        best_model = model
        best_name = name

print(f"\n{'=' * 60}")
print("comparison between models")
print("=" * 60)

# One row per model, one column per metric, best R² first.
comparison_df = (
    pd.DataFrame(results)
    .transpose()
    .sort_values(by='R2', ascending=False)
)
print("\n" + comparison_df.to_string())

# Re-score the selected model on the test split for the summary below.
y_pred_best = best_model.predict(X_test)
r2_best = r2_score(y_test, y_pred_best)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))

print(f"\n{'=' * 60}")
print(f" BEST MODEL: {best_name}")
print("=" * 60)
print(f"RMSE: ${rmse_best:,.2f}")
print(f"MAE: ${results[best_name]['MAE']:,.2f}")
print(f"R² Score: {r2_best:.4f}")
print("=" * 60)

print(f"\n{'=' * 60}")
print("samples to check model's performance")
print("=" * 60)

# Side-by-side view of the first ten test rows: actual vs. predicted price.
predictions_df = pd.DataFrame({
    'Actual Price': y_test.values[:10],
    'Predicted Price': y_pred_best[:10],
})

# Signed error (actual minus predicted) in dollars and as a percentage of
# the actual price.
predictions_df['Error ($)'] = predictions_df['Actual Price'].sub(predictions_df['Predicted Price'])
predictions_df['Error (%)'] = predictions_df['Error ($)'].div(predictions_df['Actual Price']).mul(100)

for i, row in predictions_df.iterrows():
    abs_error = abs(row['Error ($)'])
    abs_pct = abs(row['Error (%)'])

    # Grade the prediction by absolute percentage error: <5%, <10%, otherwise.
    if abs_pct < 5:
        verdict = "   Excellent prediction!"
    elif abs_pct < 10:
        verdict = "    Good prediction"
    else:
        verdict = "    Needs improvement"

    print(f"\n House #{i+1}:")
    print(f"   Actual Price:    ${row['Actual Price']:>10,.0f}")
    print(f"   Predicted Price: ${row['Predicted Price']:>10,.0f}")
    print(f"   Error:           ${abs_error:>10,.0f} ({abs_pct:>5.2f}%)")
    print(verdict)

print(f"\n{'=' * 60}")
print("OVERALL PREDICTION STATISTICS")
print("=" * 60)

# Per-row errors of the best model over the whole test split.
actual_values = y_test.values
all_predictions = pd.DataFrame({
    'Actual': actual_values,
    'Predicted': y_pred_best,
    'Error': actual_values - y_pred_best
})

# Absolute error in dollars, and the same as a percentage of the actual price.
all_predictions['Abs_Error'] = all_predictions['Error'].abs()
all_predictions['Percent_Error'] = all_predictions['Abs_Error'].div(all_predictions['Actual']).mul(100)

mean_abs_error = all_predictions['Abs_Error'].mean()
print(f"\nTotal houses tested: {len(y_test)}")
print(f"\nAverage Prediction Error: ${mean_abs_error:,.2f}")

print(f"\n{'=' * 60}")
# Serialize the selected model to the working directory with pickle.
with open('housing_price_model.pkl', mode='wb') as model_file:
    pickle.dump(best_model, model_file)
print("  saving  as 'housing_price_model.pkl'")
print("=" * 60)

# Ask Colab to download the pickled model file through the browser.
files.download('housing_price_model.pkl')


Upload your AmesHousing.csv file:


Saving datasets_ames_housing.py to datasets_ames_housing (2).py

✓ Dataset loaded: 2930 rows, 82 columns

analysing data
✓ Features after encoding: 278

✓ Train: 2344 | Test: 586

training phase of models

Training Linear Regression...
  RMSE: $35,193.69
  MAE: $16,368.47
  R² Score: 0.8455

Training Random Forest...
  RMSE: $25,340.41
  MAE: $15,505.09
  R² Score: 0.9199

Training XGBoost...
  RMSE: $22,944.12
  MAE: $14,179.39
  R² Score: 0.9343

comparison between models

                           RMSE           MAE        R2
XGBoost            22944.115760  14179.392578  0.934340
Random Forest      25340.410502  15505.094795  0.919909
Linear Regression  35193.691886  16368.466164  0.845514

 BEST MODEL: XGBoost
RMSE: $22,944.12
MAE: $14,179.39
R² Score: 0.9343

samples to check model's performance

 House #1:
   Actual Price:    $   161,000
   Predicted Price: $   170,241
   Error:           $     9,241 ( 5.74%)
    Good prediction

 House #2:
   Actual Price:    $   116,000
   Pr

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>