# Trying Different Models

In [4]:
import pandas as pd
! pip install xgboost
#numpy==1.26.4

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/29/22/e3ff2dfafe862a91733dfa0aecdb4794aa1d9a18e09a14e118bde0cbc2db/xgboost-3.0.2-py3-none-win_amd64.whl.metadata
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/150.0 MB 1.6 MB/s eta 0:01:33
   ---------------------------------------- 0.4/150.0 MB 3.0 MB/s eta 0:00:50
   ---------------------------------------- 0.5/150.0 MB 3.5 MB/s eta 0:00:43
   ---------------------------------------- 1.0/150.0 MB 5.0 MB/s eta 0:00:30
   ---------------------------------------- 1.3/150.0 MB 4.9 MB/s eta 0:00:31
   ---------------------------------------- 1.7/150.0 MB 5.8 MB/s eta 0:00:26
    ---------------------------------

In [6]:
# Load housing_cleaned.csv from the sibling data directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/housing_cleaned.csv')

# Preview the first five rows (five is the default for head()).
df.head(n=5)

Unnamed: 0,price,city,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,sqft_basement,pct_basement,house_age,was_renovated,renovation_age,sqft_living15,sqft_lot15
0,231300.0,Seattle,2,1.0,1180,5650,1.0,3,0,0.0,70,0,70,1340,5650
1,538000.0,Seattle,3,2.25,2570,7242,2.0,3,400,0.155642,74,1,34,1690,7639
2,180000.0,Kenmore,2,1.0,770,10000,1.0,3,0,0.0,92,0,92,2720,8062
3,604000.0,Seattle,4,3.0,1960,5000,1.0,5,910,0.464286,60,0,60,1360,5000
4,510000.0,Sammamish,3,2.0,1680,8080,1.0,3,0,0.0,38,0,38,1800,7503


In [8]:
# Separate the target column (price) from the predictor columns.
y = df['price']
X = df.drop(columns=['price'])

# Hold out 20% of the rows for testing. The split is stratified on the city
# column and uses a fixed seed so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=X['city'],
)

In [14]:
# Column groups; each group is routed to its own preprocessing step below.
categorical = ['city']
binary = ['was_renovated']

# Every float64/int64 column of X except the binary flag counts as numerical.
numerical = list(
    X.select_dtypes(include=['float64', 'int64']).columns.drop(binary)
)

preprocessor = ColumnTransformer(
    transformers=[
        # Standardise the numerical features.
        ('num', StandardScaler(), numerical),
        # One-hot encode city, dropping the first category.
        ('cat', OneHotEncoder(drop='first'), categorical),
        # Pass the binary flag through unchanged.
        ('bin', 'passthrough', binary),
    ],
)

In [16]:
# Candidate regressors, all left at their default hyperparameters. The ones
# given a random_state are seeded with 42.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'Support Vector Machine': SVR(),
    'k-Nearest Neighbors': KNeighborsRegressor(),
}

# Collects one record of train/test RMSE per model.
results = []

for model_name, model in models.items():
    # Chain the preprocessing with the current estimator. The same
    # `preprocessor` object is handed to every pipeline.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipeline.fit(X_train, y_train)

    # Predict on both partitions so train and test error can be compared.
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)

    train_rmse = root_mean_squared_error(y_true=y_train, y_pred=y_pred_train)
    test_rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred_test)

    results.append(
        {'Model': model_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse}
    )

# One row per model, in the order the models were declared.
results_df = pd.DataFrame(data=results)

results_df

Unnamed: 0,Model,Train RMSE,Test RMSE
0,Linear Regression,208844.726508,197167.777538
1,Random Forest,68120.380641,175523.39202
2,XGBoost,100873.660769,170416.144392
3,Support Vector Machine,380435.578294,365782.006885
4,k-Nearest Neighbors,162638.299886,197473.365736
