## Missing Values Ratio

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
import numpy as np

melbourne_data = pd.read_csv('melbourne_housing_raw.csv')

# Filter out columns with more than 20% missing values
missing_values_ratio = melbourne_data.isnull().mean() * 100
features_to_remove = missing_values_ratio[missing_values_ratio > 20].index
reduced_data = melbourne_data.drop(columns=features_to_remove)

# Print the features removed and the shape of the reduced dataset
print(f"Features removed due to missing values: {features_to_remove}")
print(f"Reduced dataset shape: {reduced_data.shape}")

# Check if 'Price' is still in the reduced dataset
if 'Price' in reduced_data.columns:
    # Step 2: Prepare features (X) and target (y)
    X = reduced_data.drop(columns=['Price']).values  # Features
    y = reduced_data['Price'].values  # Target variable

    # Step 3: Split dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Step 4: Train Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Step 5: Make predictions and evaluate using Mean Squared Error
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error after removing features with >20% missing values: {mse}")
else:
    print("The target variable 'Price' was removed due to missing values. No model can be trained.")


Features removed due to missing values: Index(['Price', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
       'YearBuilt', 'Lattitude', 'Longtitude'],
      dtype='object')
Reduced dataset shape: (34857, 11)
The target variable 'Price' was removed due to missing values. No model can be trained.


## High Correlation Filter

In [None]:
# Drop non-numeric columns and keep only numeric types for correlation calculation
numeric_data = melbourne_data.select_dtypes(include=['float64', 'int64'])

# Calculate the correlation matrix on numeric data
correlation_matrix = numeric_data.corr()

# Identify highly correlated features (correlation > 0.85)
threshold = 0.85
features_to_remove = set()  # Using a set to avoid duplicates

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > threshold:
            # Add the feature with higher index to the set
            features_to_remove.add(correlation_matrix.columns[i])

# Remove the identified features from the original dataset
reduced_data = melbourne_data.drop(columns=features_to_remove)

# Print the features removed and the shape of the reduced dataset
print(f"Features removed due to high correlation: {features_to_remove}")
print(f"Reduced dataset shape: {reduced_data.shape}")

# Check if 'Price' is still in the reduced dataset
if 'Price' not in reduced_data.columns:
    print("The target variable 'Price' was removed due to high correlation. No model can be trained.")
else:
    # Drop rows with missing values
    reduced_data = reduced_data.dropna()

    # One-hot encode categorical variables
    reduced_data = pd.get_dummies(reduced_data, drop_first=True)  # Drop first to avoid dummy variable trap

    # Prepare features (X) and target (y)
    X = reduced_data.drop(columns=['Price']).values  # Features
    y = reduced_data['Price'].values  # Target variable

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # Train the Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions and evaluate using Mean Squared Error
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error after removing features with high correlation: {mse}")


Features removed due to high correlation: {'Bedroom2'}
Reduced dataset shape: (34857, 19)
Mean Squared Error after removing features with high correlation: 4.554932281711646e+22


## Low Variance Filter

In [None]:
# Filter out low variance features
# Calculate variance for each numeric feature
numeric_features = melbourne_data.select_dtypes(include=['float64', 'int64']).columns
variances = melbourne_data[numeric_features].var()

# Select features with variance above a threshold
low_variance_threshold = 0.01
features_to_keep = variances[variances > low_variance_threshold].index

# Keep only the features with high variance
reduced_data = melbourne_data[features_to_keep]

# Check if 'Price' is in the remaining features
if 'Price' not in reduced_data.columns:
    print("The target variable 'Price' was removed due to low variance. No model can be trained.")
else:
    # Drop rows with missing values
    reduced_data = reduced_data.dropna()

    # Prepare features (X) and target (y)
    X = reduced_data.drop(columns=['Price']).values  # Features
    y = reduced_data['Price'].values  # Target variable

    # Split dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # Train the Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions and evaluate using Mean Squared Error
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error after removing low variance features: {mse}")


Mean Squared Error after removing low variance features: 152644792220.06802


## Forward Feature Selection

In [None]:
# Preprocess the data
# Drop rows with missing values
melbourne_data = melbourne_data.dropna()

# Select only numeric features for the regression model
numeric_features = melbourne_data.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Check if 'Price' is in numeric features
if 'Price' in numeric_features:
    numeric_features.remove('Price')  # Remove target variable from features

# Prepare features (X) and target (y)
X = melbourne_data[numeric_features].values  # Features
y = melbourne_data['Price'].values  # Target variable

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Create a Linear Regression model
model = LinearRegression()

# Implement Forward Feature Selection
sfs = SequentialFeatureSelector(model,
                                 n_features_to_select='auto',
                                 direction='forward',
                                 scoring='neg_mean_squared_error',
                                 cv=5)

# Fit SFS to the training data
sfs.fit(X_train, y_train)

# Get selected feature indices
selected_features_indices = sfs.get_support(indices=True)

# Get selected feature names
selected_features = [numeric_features[i] for i in selected_features_indices]

print(f"Selected features: {selected_features}")

# Step 5: Train the model with selected features
X_train_selected = sfs.transform(X_train)
X_test_selected = sfs.transform(X_test)

model.fit(X_train_selected, y_train)

# Make predictions and evaluate using Mean Squared Error
y_pred = model.predict(X_test_selected)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error with selected features: {mse}")


Selected features: ['Rooms', 'Distance', 'Bathroom', 'BuildingArea', 'YearBuilt', 'Lattitude']
Mean Squared Error with selected features: 161414427141.72278


## Backward Feature Elimination

In [None]:
# Preprocess the data
# Drop rows with missing values
melbourne_data = melbourne_data.dropna()

# Select only numeric features for the regression model
numeric_features = melbourne_data.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Check if 'Price' is in numeric features
if 'Price' in numeric_features:
    numeric_features.remove('Price')  # Remove target variable from features

# Prepare features (X) and target (y)
X = melbourne_data[numeric_features].values  # Features
y = melbourne_data['Price'].values  # Target variable

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Create a Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Backward Feature Elimination
remaining_features = numeric_features.copy()

while len(remaining_features) > 1:  # Ensure at least one feature remains
    # Fit the model
    model.fit(X_train[:, :len(remaining_features)], y_train)

    # Get feature importances
    importances = model.feature_importances_

    # Find the least important feature
    least_important_index = np.argmin(importances)

    # Remove the least important feature
    least_important_feature = remaining_features[least_important_index]
    remaining_features.remove(least_important_feature)

    # Train the model on the reduced feature set
    X_train_reduced = X_train[:, :len(remaining_features)]
    X_test_reduced = X_test[:, :len(remaining_features)]

    # Check if we still have features to train on
    if X_train_reduced.shape[1] == 0:  # If no features left, break the loop
        break

    model.fit(X_train_reduced, y_train)

    # Make predictions and evaluate the model
    y_pred = model.predict(X_test_reduced)
    mse = mean_squared_error(y_test, y_pred)

    print(f"Removed feature: {least_important_feature}, MSE: {mse}")

# Remaining features after elimination
print(f"Remaining features: {remaining_features}")


Removed feature: Bedroom2, MSE: 65056356982.440735
Removed feature: Bathroom, MSE: 68518665251.77405
Removed feature: Car, MSE: 69139475783.0304
Removed feature: Landsize, MSE: 74938031952.27202
Removed feature: BuildingArea, MSE: 82540180649.84428
Removed feature: Longtitude, MSE: 114323355240.92207
Removed feature: YearBuilt, MSE: 107082506931.72218
Removed feature: Lattitude, MSE: 110240429939.68059
Removed feature: Propertycount, MSE: 108820786828.80994
Removed feature: Rooms, MSE: 176984255433.44705
Removed feature: Distance, MSE: 311765920534.43243
Remaining features: ['Postcode']


## Random Forest

In [None]:
# Preprocess the data
# Drop rows with missing values
melbourne_data = melbourne_data.dropna()

# Select only numeric features for the regression model
numeric_features = melbourne_data.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Check if 'Price' is in numeric features
if 'Price' in numeric_features:
    numeric_features.remove('Price')  # Remove target variable from features

# Prepare features (X) and target (y)
X = melbourne_data[numeric_features].values  # Features
y = melbourne_data['Price'].values  # Target variable

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Create and train a Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Look at feature importances
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': numeric_features,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("Feature importances:\n", feature_importance_df)

# Remove the least important feature and retrain the model
least_important_feature = feature_importance_df.tail(1)['Feature'].values[0]
remaining_features = numeric_features.copy()
remaining_features.remove(least_important_feature)

# Prepare new feature sets
X_train_reduced = X_train[:, :len(remaining_features)]
X_test_reduced = X_test[:, :len(remaining_features)]

# Retrain the model on the reduced feature set
model_reduced = RandomForestRegressor(n_estimators=100, random_state=42)
model_reduced.fit(X_train_reduced, y_train)

# Make predictions and evaluate the reduced model
y_pred_reduced = model_reduced.predict(X_test_reduced)
mse_reduced = mean_squared_error(y_test, y_pred_reduced)

print(f"Removed feature: {least_important_feature}, MSE after removal: {mse_reduced}")


Feature importances:
           Feature  Importance
7    BuildingArea    0.368577
1        Distance    0.156417
8       YearBuilt    0.117948
9       Lattitude    0.078950
2        Postcode    0.078624
6        Landsize    0.069979
10     Longtitude    0.060464
11  Propertycount    0.018377
4        Bathroom    0.015139
0           Rooms    0.013745
5             Car    0.012759
3        Bedroom2    0.009021
Removed feature: Bedroom2, MSE after removal: 65056356982.440735
