<a href="https://colab.research.google.com/github/SerDavidE/HousePricePredict/blob/main/Predicting_Housing_Prices_Phase_2_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression, RFECV, SelectFromModel

In [None]:
# Location of the training data.
# NOTE(review): the path sits under /content/drive, so it presumably needs
# Google Drive to be mounted in Colab first - the mount step is not shown here.
file_path = '/content/drive/MyDrive/Colab Notebooks/Chapter_7_Supervised_Machine_Learning/train.csv'

# Read the CSV into the working DataFrame used by the following cells
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Null count per column, and the same count expressed as a percentage of all rows
missing_data = df.isnull().sum()
missing_percent = missing_data.div(len(df)).mul(100)

# Put both measures side by side, keep only the columns that actually have
# missing values, and list the most affected columns first
missing_info = (
    pd.DataFrame({'Missing Values': missing_data, 'Percentage': missing_percent})
    .loc[lambda frame: frame['Missing Values'] > 0]
    .sort_values(by='Percentage', ascending=False)
)

# The summary table is the cell output
missing_info

Unnamed: 0,Missing Values,Percentage
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
FireplaceQu,690,47.260274
LotFrontage,259,17.739726
GarageType,81,5.547945
GarageYrBlt,81,5.547945
GarageFinish,81,5.547945
GarageQual,81,5.547945


In [None]:
from sklearn.impute import SimpleImputer

# Drop the columns that are almost entirely empty (each is more than 80% missing
# in the summary table above).
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second execution no longer raises KeyError because the columns
# are already gone.
df = df.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence'], errors='ignore')

# NOTE(review): both imputers below are fit on the full dataset, i.e. before the
# train/test split performed later in the notebook, so test rows contribute to
# the fill values.

# Numerical columns: fill gaps with the column median
numerical_cols_with_na = ['LotFrontage', 'GarageYrBlt', 'MasVnrArea']
numerical_imputer = SimpleImputer(strategy='median')
df[numerical_cols_with_na] = numerical_imputer.fit_transform(df[numerical_cols_with_na])

# Categorical columns: fill gaps with the most frequent value of the column
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_cols_with_na = ['FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                            'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1', 'BsmtCond', 'BsmtQual',
                            'MasVnrType', 'Electrical']
df[categorical_cols_with_na] = categorical_imputer.fit_transform(df[categorical_cols_with_na])

# Largest per-column null count; 0 means every missing value has been handled
df.isnull().sum().max()

0

In [None]:
# Object-dtype columns are the ones treated as categorical
categorical_cols = df.select_dtypes(include=['object']).columns

# Replace each categorical column with indicator columns; drop_first=True omits
# the first level of every column
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)

# Preview the encoded frame (first five rows)
df_encoded.head(5)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,0,1,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,0,1,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,0,1,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,0,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,0,1,0,0,0,1,0


In [None]:
from sklearn.preprocessing import StandardScaler

# 'Id' is not a feature for modeling. Reassign with errors='ignore' (instead of
# dropping in place) so that re-running this cell does not raise KeyError once
# the column has already been removed.
df_encoded = df_encoded.drop(columns=['Id'], errors='ignore')

# Separate the features (X) from the target variable (y)
X = df_encoded.drop(columns=['SalePrice'])
y = df_encoded['SalePrice']

# Standardize every feature column.
# NOTE(review): the scaler is fit on all rows, before the train/test split done
# later in the notebook, so test rows contribute to the means and variances.
scaler = StandardScaler()

# Keep X's index as well as its column names so X_scaled stays row-aligned with y
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Display the first few rows of scaled data
X_scaled.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.073375,-0.220875,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,-0.288653,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
1,-0.872563,0.46032,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,-0.288653,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
2,0.073375,-0.084636,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,-0.288653,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
3,0.309859,-0.44794,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57075,-0.499274,-0.288653,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,-2.138345,-0.305995
4,0.073375,0.641972,0.375148,1.374795,-0.5172,0.951632,0.733308,1.366489,0.463568,-0.288653,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split
# the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=42,
)

# Shapes of the four pieces, shown as a sanity check on the split
((X_train.shape, X_test.shape), (y_train.shape, y_test.shape))

(((1022, 236), (438, 236)), ((1022,), (438,)))

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Learn the variance mask on the training split only, then apply that same mask
# to both splits.
# NOTE(review): X_train comes from the standardized frame, so per-feature
# variances are presumably close to 1 and a 0.1 threshold may remove few or no
# columns - confirm this is the intended filter.
selector = VarianceThreshold(threshold=0.1).fit(X_train)
X_train_var = selector.transform(X_train)
X_test_var = selector.transform(X_test)

# Random Forest on the retained features (fit() returns the estimator itself)
rf_var = RandomForestRegressor(random_state=42).fit(X_train_var, y_train)

# Score on the held-out split: R-squared and root mean squared error
y_pred_var = rf_var.predict(X_test_var)
r2_var = r2_score(y_test, y_pred_var)
rmse_var = np.sqrt(mean_squared_error(y_test, y_pred_var))

(rmse_var, r2_var)

(26886.888320011, 0.8964036901339321)

In [None]:
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression
from sklearn.pipeline import Pipeline

# Select the 10 features with the highest univariate F-score against the target.
# Columns that are constant within the training split are removed first:
# f_regression cannot score them and emits a RuntimeWarning from its norm
# computation when they are present. Such columns would not be picked by
# SelectKBest anyway, so the selected features are expected to be unchanged.
selector = Pipeline([
    ('drop_constant', VarianceThreshold(threshold=0.0)),
    ('kbest', SelectKBest(score_func=f_regression, k=10)),
])
X_train_kbest = selector.fit_transform(X_train, y_train)
X_test_kbest = selector.transform(X_test)

# Train Random Forest model on the selected features
rf_kbest = RandomForestRegressor(random_state=42)
rf_kbest.fit(X_train_kbest, y_train)

# Predictions and evaluation on the held-out split
y_pred_kbest = rf_kbest.predict(X_test_kbest)
rmse_kbest = np.sqrt(mean_squared_error(y_test, y_pred_kbest))
r2_kbest = r2_score(y_test, y_pred_kbest)

rmse_kbest, r2_kbest

  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)


(28323.277942629054, 0.8850390807435304)

In [None]:
from sklearn.feature_selection import RFECV

# Recursive feature elimination with 5-fold cross-validation, removing one
# feature per step and ranking features with a Random Forest.
# n_jobs=-1 runs the cross-validation folds in parallel on all available cores;
# the estimator is seeded, so the selected features do not change - only the
# wall-clock time of this (expensive) cell does.
selector = RFECV(
    estimator=RandomForestRegressor(random_state=42),
    step=1,
    cv=5,
    n_jobs=-1,
)
X_train_rfecv = selector.fit_transform(X_train, y_train)
X_test_rfecv = selector.transform(X_test)

# Train Random Forest model on the selected features
rf_rfecv = RandomForestRegressor(random_state=42)
rf_rfecv.fit(X_train_rfecv, y_train)

# Predictions and evaluation on the held-out split
y_pred_rfecv = rf_rfecv.predict(X_test_rfecv)
rmse_rfecv = np.sqrt(mean_squared_error(y_test, y_pred_rfecv))
r2_rfecv = r2_score(y_test, y_pred_rfecv)

rmse_rfecv, r2_rfecv

(26307.17536556314, 0.900822846550383)

In [None]:
from sklearn.feature_selection import SelectFromModel

# Fit a seeded Random Forest inside SelectFromModel on the training split, then
# use the resulting feature mask on both splits
selector = SelectFromModel(RandomForestRegressor(random_state=42)).fit(X_train, y_train)
X_train_sfm = selector.transform(X_train)
X_test_sfm = selector.transform(X_test)

# Fresh Random Forest trained on the selected features only
# (fit() returns the estimator itself)
rf_sfm = RandomForestRegressor(random_state=42).fit(X_train_sfm, y_train)

# Score on the held-out split: R-squared and root mean squared error
y_pred_sfm = rf_sfm.predict(X_test_sfm)
r2_sfm = r2_score(y_test, y_pred_sfm)
rmse_sfm = np.sqrt(mean_squared_error(y_test, y_pred_sfm))

(rmse_sfm, r2_sfm)

(27142.810346107588, 0.8944221475485599)

# Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA


def _rf_on_pca(X_tr, X_te, y_tr, y_te):
    """Project a feature set onto its principal components and score a Random Forest.

    PCA is fit on ``X_tr`` only and then applied to ``X_te``; all components are
    kept (``n_components=None``). A Random Forest seeded with ``random_state=42``
    is trained on the transformed training data and evaluated on the transformed
    test data.

    Parameters
    ----------
    X_tr, X_te : training / test feature matrices (same columns).
    y_tr, y_te : training / test targets.

    Returns
    -------
    tuple
        ``(pca, X_tr_pca, X_te_pca, model, y_pred, rmse, r2)`` - the fitted PCA,
        both transformed matrices, the fitted forest, its test predictions, and
        the test RMSE and R-squared.
    """
    pca = PCA(n_components=None)
    X_tr_pca = pca.fit_transform(X_tr)
    X_te_pca = pca.transform(X_te)

    model = RandomForestRegressor(random_state=42)
    model.fit(X_tr_pca, y_tr)

    y_pred = model.predict(X_te_pca)
    rmse = np.sqrt(mean_squared_error(y_te, y_pred))
    r2 = r2_score(y_te, y_pred)
    return pca, X_tr_pca, X_te_pca, model, y_pred, rmse, r2


# VarianceThreshold features with PCA
(pca_var, X_train_var_pca, X_test_var_pca,
 rf_var_pca, y_pred_var_pca, rmse_var_pca, r2_var_pca) = _rf_on_pca(
    X_train_var, X_test_var, y_train, y_test)

# SelectKBest features with PCA
(pca_kbest, X_train_kbest_pca, X_test_kbest_pca,
 rf_kbest_pca, y_pred_kbest_pca, rmse_kbest_pca, r2_kbest_pca) = _rf_on_pca(
    X_train_kbest, X_test_kbest, y_train, y_test)

# SelectFromModel features with PCA
(pca_sfm, X_train_sfm_pca, X_test_sfm_pca,
 rf_sfm_pca, y_pred_sfm_pca, rmse_sfm_pca, r2_sfm_pca) = _rf_on_pca(
    X_train_sfm, X_test_sfm, y_train, y_test)

# RMSE and R-squared for each of the three feature sets, in the order above
rmse_var_pca, r2_var_pca, rmse_kbest_pca, r2_kbest_pca, rmse_sfm_pca, r2_sfm_pca

(30805.960812119247,
 0.8640019389148932,
 31829.40621435379,
 0.8548154933616181,
 29184.30519528542,
 0.8779432178886466)

## Pipeline for Random Forest with RFECV

In [None]:
# Feature selection step: recursive feature elimination with 5-fold CV, one
# feature removed per step, ranked by a seeded Random Forest.
# n_jobs=-1 runs the cross-validation folds in parallel on all available cores;
# the selection itself is unchanged, only this expensive fit gets faster.
selector = RFECV(
    estimator=RandomForestRegressor(random_state=42),
    step=1,
    cv=5,
    n_jobs=-1,
)

# Final regressor.
# NOTE(review): these are described as the best hyperparameters, but the search
# that produced them is not shown here - confirm their origin.
rf_best = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42
)

# Chain selection and regression so both are fit on the training data only
pipeline = Pipeline([
    ('feature_selector', selector),
    ('regressor', rf_best)
])

# Fit the pipeline to training data
pipeline.fit(X_train, y_train)

# Make predictions on test data
y_pred = pipeline.predict(X_test)

# Evaluate the model on the held-out split
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Square Error: {rmse}")
print(f"R-squared: {r2}")

Root Mean Square Error: 26864.35084951752
R-squared: 0.8965772929842624


## Pipeline for Random Forest with VarianceThreshold

In [None]:
# Feature selection step: drop columns whose variance is not above 0.1
selector = VarianceThreshold(threshold=0.1)

# Regression step: seeded Random Forest with the settings used for the pipelines
rf_best = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
)

# Selection followed by regression, fit as a single estimator on the training
# split (Pipeline.fit returns the pipeline itself)
pipeline_var = Pipeline(steps=[
    ('feature_selector', selector),
    ('regressor', rf_best),
]).fit(X_train, y_train)

# Predict the held-out split.
# Note: y_pred_var, rmse_var and r2_var reuse the names from the standalone
# VarianceThreshold cell and overwrite those earlier values.
y_pred_var = pipeline_var.predict(X_test)

# Held-out metrics
r2_var = r2_score(y_test, y_pred_var)
rmse_var = np.sqrt(mean_squared_error(y_test, y_pred_var))

print(f"Root Mean Square Error for VarianceThreshold: {rmse_var}")
print(f"R-squared for VarianceThreshold: {r2_var}")

Root Mean Square Error for VarianceThreshold: 26883.34033239119
R-squared for VarianceThreshold: 0.8964310294205781


In [None]:
# Show the VarianceThreshold pipeline; as the last expression of the cell it is
# rendered as the cell output
pipeline_var

## Pipeline for Random Forest with SelectFromModel

In [None]:
# Feature selection step: keep the features a seeded Random Forest rates as
# important enough (SelectFromModel with its default threshold)
selector = SelectFromModel(RandomForestRegressor(random_state=42))

# Regression step: seeded Random Forest with the settings used for the pipelines
rf_best = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
)

# Selection followed by regression, fit as a single estimator on the training
# split (Pipeline.fit returns the pipeline itself)
pipeline_sfm = Pipeline(steps=[
    ('feature_selector', selector),
    ('regressor', rf_best),
]).fit(X_train, y_train)

# Predict the held-out split.
# Note: y_pred_sfm, rmse_sfm and r2_sfm reuse the names from the standalone
# SelectFromModel cell and overwrite those earlier values.
y_pred_sfm = pipeline_sfm.predict(X_test)

# Held-out metrics
r2_sfm = r2_score(y_test, y_pred_sfm)
rmse_sfm = np.sqrt(mean_squared_error(y_test, y_pred_sfm))

print(f"Root Mean Square Error for SelectFromModel: {rmse_sfm}")
print(f"R-squared for SelectFromModel: {r2_sfm}")

Root Mean Square Error for SelectFromModel: 27472.00202870519
R-squared for SelectFromModel: 0.8918456929183491


**Results of the 3 pipelines:**

Root Mean Square Error for RFECV: 26864.35084951752

R-squared for RFECV: 0.8965772929842624

Root Mean Square Error for VarianceThreshold: 26883.34033239119

R-squared for VarianceThreshold: 0.8964310294205781

Root Mean Square Error for SelectFromModel: 27472.00202870519

R-squared for SelectFromModel: 0.8918456929183491
