## Model for predicting house prices
This notebook documents the process of building, training, and evaluating a machine learning model for predicting house prices using various input features.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression

In [51]:
df = pd.read_csv('../Data/cleaned_train.csv')
df['SalePrice_Log'] = np.log(df['SalePrice'])

In [52]:
numeric_features = df.select_dtypes(include=['int64', 'float64']).drop(['SalePrice', 'SalePrice_Log'], axis=1).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

In [None]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [54]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features), 
    ('cat', categorical_transformer, categorical_features)    
])


In [55]:
X = df.drop(['SalePrice', 'SalePrice_Log'], axis=1)
y = df['SalePrice_Log']

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
models = {
    'Linear Regression' : LinearRegression(),
    'ridge regression' : Ridge(alpha=1.0),
    'Lasso regression' : Lasso(alpha=0.01),
    'Random Forest' : RandomForestRegressor(n_estimators=100, random_state=42),
    'XG Boost' : XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

In [58]:
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred_log = pipeline.predict(X_test)
    y_pred = np.expm1(y_pred_log)
    y_test_actual = np.expm1(y_test)
    
    rmse = mean_squared_error(y_test_actual, y_pred, squared=False)
    r2 = r2_score(y_test_actual, y_pred)
    
    print(f"{name} have RMSE: {rmse:.2f}, R2: {r2:.4f}")



Linear Regression have RMSE: 23446.18, R2: 0.9283
ridge regression have RMSE: 23516.82, R2: 0.9279




Lasso regression have RMSE: 31470.64, R2: 0.8709




Random Forest have RMSE: 29142.32, R2: 0.8893
XG Boost have RMSE: 26821.94, R2: 0.9062




In [65]:
r2_train = pipeline.score(X_train, y_train)
r2_test = pipeline.score(X_test, y_test)
print(f"Train R2: {r2_train:.4f}, Test R2: {r2_test:.4f}")

from sklearn.model_selection import cross_val_score

cv_score = cross_val_score(pipeline, X_train, y_train, scoring='r2', cv=5)
print("Mean CV R2:", np.mean(cv_score))



Train R2: 0.9948, Test R2: 0.8869
Mean CV R2: 0.8712052300341119


In [60]:
pipeline_linear = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression)
])

In [61]:
pipeline_linear.fit(X_train, y_train)
y_pred_lin = pipeline_linear.predict(X_test)

AttributeError: 'csr_matrix' object has no attribute '_validate_params'