In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [3]:
# Absolute location of the Melbourne housing CSV on the runtime's filesystem.
file_path = "/content/Melbourne_housing_FULL.csv"
# Load the entire CSV into a DataFrame; every later cell derives from this frame.
melbourne_data = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Rows with a missing Price have no target value, so they are dropped up front.
melbourne_data_clean = melbourne_data.dropna(subset=["Price"])
# Target vector: the Price column. Feature frame: every other column.
y_full = melbourne_data_clean["Price"]
X_full = melbourne_data_clean.drop(columns=["Price"])

In [5]:
# Select numeric columns only.
# Selecting by inclusion ('number') rather than by excluding 'object' keeps
# boolean, datetime, categorical and dedicated string-dtype columns out of the
# feature frame as well; the mean imputation applied later is only meaningful
# for numeric data.
X_numeric = X_full.select_dtypes(include='number')

In [6]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# partition repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric,
    y_full,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Replace missing numeric values with per-column means.
# The means are learned from the training split only and then applied unchanged
# to both splits, so no statistic is computed from the test rows.
imputer = SimpleImputer(strategy="mean")
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)


In [8]:
# Step 1: Baseline - train on all imputed features, before any variance filtering.
model = RandomForestRegressor(random_state=0)  # fixed seed for repeatable results
model.fit(X_train_imputed, y_train)
y_pred_initial = model.predict(X_test_imputed)

# Baseline test-set metrics, reported next to the filtered-model metrics later on.
mae_initial = mean_absolute_error(y_test, y_pred_initial)
r2_initial = r2_score(y_test, y_pred_initial)



In [9]:
# Step 2: Remove low-variance features using a 0.01 variance cutoff.
# The selector is fitted on the training split and then applied to both splits.
# No scaling happens before this step, so the cutoff is compared against the
# variance of the raw imputed values.
variance_threshold = VarianceThreshold(threshold=0.01)  # tune the cutoff if needed
variance_threshold.fit(X_train_imputed)
X_train_var_filtered = variance_threshold.transform(X_train_imputed)
X_test_var_filtered = variance_threshold.transform(X_test_imputed)

In [10]:
# Step 3: Re-fit the existing `model` estimator on the variance-filtered
# features, then score it on the filtered test split.
y_pred_filtered = model.fit(X_train_var_filtered, y_train).predict(X_test_var_filtered)
mae_after_removal = mean_absolute_error(y_test, y_pred_filtered)
r2_final = r2_score(y_test, y_pred_filtered)

In [11]:
# Report mean absolute error for the baseline and the variance-filtered model.
print(
    "MAE before removing low-variance features:",
    mae_initial,
)
print(
    "MAE after removing low-variance features:",
    mae_after_removal,
)

MAE before removing low-variance features: 177684.04652441444
MAE after removing low-variance features: 180584.93886446857


In [12]:
# Report R2 for the baseline and the variance-filtered model.
print(
    "R2 before removing low-variance features:",
    r2_initial,
)
print(
    "R2 after removing low-variance features:",
    r2_final,
)

R2 before removing low-variance features: 0.7692490565510182
R2 after removing low-variance features: 0.7635169483457628
