In [None]:
import numpy as np
import pandas as pd

In [None]:
# Path of the raw housing CSV; kept in one place so it is easy to repoint.
raw_csv_path = '/content/drive/MyDrive/Colab Notebooks/Data/melbourne_housing_raw.csv'

df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,2,h,,SS,Jellis,3/9/16,2.5,3067.0,2.0,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,2,h,1480000.0,S,Biggin,3/12/16,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,2,h,1035000.0,S,Biggin,4/2/16,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,3,u,,VB,Rounds,4/2/16,2.5,3067.0,3.0,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,3,h,1465000.0,SP,Biggin,4/3/17,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode every text column of df in place.
#
# A separate LabelEncoder is fitted per column and kept in `label_encoders`
# (keyed by column name), so the codes can be mapped back to the original
# labels later via label_encoders[column].inverse_transform(...). Refitting a
# single shared encoder would only retain the mapping of the last column.
#
# Re-running this cell is harmless: once encoded, a column is no longer a
# text column and is skipped.
#
# NOTE(review): fit_transform replaces every entry of the column, so missing
# labels in these columns are no longer visible to later isnull() checks --
# confirm that treating "missing" as its own category is intended.
label_encoders = {}
for column in df.columns:
    # Cover both classic object columns and pandas' dedicated string dtypes.
    is_text_column = (
        pd.api.types.is_object_dtype(df[column])
        or pd.api.types.is_string_dtype(df[column])
    )
    if not is_text_column:
        continue
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

In [None]:
# Share of missing entries per column, expressed as a percentage.
missing_ratio = df.isna().mean().mul(100)
print(missing_ratio)

Suburb            0.000000
Rooms             0.000000
Type              0.000000
Price            21.832057
Method            0.000000
SellerG           0.000000
Date              0.000000
Distance          0.002869
Postcode          0.002869
Bedroom2         23.573457
Bathroom         23.599277
Car              25.039447
Landsize         33.881286
BuildingArea     60.576068
YearBuilt        55.386293
CouncilArea       0.000000
Lattitude        22.882061
Longtitude       22.882061
Regionname        0.000000
Propertycount     0.008607
dtype: float64


In [None]:
# Target is the 'Price' column; every other column is a candidate predictor.
y = df['Price']
X = df.drop(columns='Price')

# Missing Values Ratio

In [None]:
# Maximum tolerated share of missing values per predictor, in percent.
threshold_missing = 20

# Boolean mask indexed by column name; True marks columns that are kept.
within_threshold = missing_ratio <= threshold_missing
X_missing = X.loc[:, within_threshold]
X_missing.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Date,Distance,Postcode,CouncilArea,Regionname,Propertycount
0,0,2,0,6,155,59,2.5,3067.0,31,2,4019.0
1,0,2,0,2,33,55,2.5,3067.0,31,2,4019.0
2,0,2,0,2,33,64,2.5,3067.0,31,2,4019.0
3,0,3,2,7,296,64,2.5,3067.0,31,2,4019.0
4,0,3,0,5,33,65,2.5,3067.0,31,2,4019.0


In [None]:
# Rows without a recorded Price carry no target information. Filling them with
# the mean price would train and evaluate every model on fabricated labels, so
# those rows are dropped rather than imputed.
has_price = df['Price'].notna()
X_missing = X_missing.loc[has_price]
y = df.loc[has_price, 'Price']

# Remaining gaps in the predictors are filled with the column mean.
# NOTE(review): the means are computed before the train/test split, so the
# held-out rows contribute to them -- consider imputing after the split.
X_missing = X_missing.fillna(X_missing.mean())

assert len(X_missing) == len(y), "predictors and target must cover the same rows"

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_missing,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline: a random forest on every column that survived the missing-values filter.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# R-squared on the held-out rows (shown as the cell output).
baseline_predictions = model.predict(X_test)
r2_score(y_test, baseline_predictions)

0.45248582186884756

# High Correlation Filter

In [None]:
correlation_threshold = 0.85

# Absolute pairwise correlations, computed on the training split only.
corr_matrix = X_train.corr().abs()

# Keep the strict upper triangle so every pair of columns is inspected once
# and a column is never compared with itself.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_triangle = corr_matrix.where(upper_mask)

# A column is dropped when it correlates above the threshold with any column
# that precedes it in the frame.
exceeds_threshold = (upper_triangle > correlation_threshold).any(axis=0)
to_drop = exceeds_threshold[exceeds_threshold].index.tolist()

X_train_filtered = X_train.drop(columns=to_drop)
X_test_filtered = X_test.drop(columns=to_drop)

model_filtered = RandomForestRegressor(n_estimators=100, random_state=42)
model_filtered.fit(X_train_filtered, y_train)

y_pred_filtered = model_filtered.predict(X_test_filtered)
r2_filtered = r2_score(y_test, y_pred_filtered)

print(f"R-squared with High Correlation Filter: {r2_filtered}")

R-squared with High Correlation Filter: 0.4510181503776536


# Low Variance Filter

In [None]:
variance_threshold = 0.05

# Per-column variance on the training split.
# NOTE(review): the variances are taken on the raw, unscaled columns, so the
# cut-off depends on each column's units -- confirm whether the features
# should be normalised before applying this threshold.
variances = X_train.var()

is_low_variance = variances < variance_threshold
low_variance_features = variances.loc[is_low_variance].index

X_train_filtered_variance = X_train.drop(columns=low_variance_features)
X_test_filtered_variance = X_test.drop(columns=low_variance_features)

model_filtered_variance = RandomForestRegressor(n_estimators=100, random_state=42)
model_filtered_variance.fit(X_train_filtered_variance, y_train)

y_pred_filtered_variance = model_filtered_variance.predict(X_test_filtered_variance)
r2_filtered_variance = r2_score(y_test, y_pred_filtered_variance)

print(f"R-squared with Low Variance Filter: {r2_filtered_variance}")

R-squared with Low Variance Filter: 0.4510181503776536


# Forward Feature Selection

In [None]:
def forward_feature_selection(X_train, y_train, X_test, y_test):
    """Greedy forward selection of columns for a random forest regressor.

    Starting from an empty set, each round fits one forest per remaining
    column (current selection plus that column) and scores it with R-squared
    on ``X_test``/``y_test``. The best candidate of the round is added only if
    it beats the best score reached so far; otherwise the search stops.

    Note that ``X_test``/``y_test`` drive the selection itself, so the
    returned score is measured on the same data that chose the features.

    Returns:
        (selected, score): the chosen column names in the order they were
        added, and the R-squared of that selection (0 when nothing was added).
    """
    def _evaluate(columns):
        # Fit on the training rows and score on the evaluation rows.
        forest = RandomForestRegressor(n_estimators=100, random_state=42)
        forest.fit(X_train[columns], y_train)
        return r2_score(y_test, forest.predict(X_test[columns]))

    remaining = list(X_train.columns)
    selected = []
    incumbent_score = 0

    while remaining:
        round_score, round_pick = 0, None

        # Only candidates scoring above 0 can win the round; ties keep the
        # earliest column.
        for candidate in remaining:
            trial_score = _evaluate(selected + [candidate])
            if trial_score > round_score:
                round_score, round_pick = trial_score, candidate

        # Stop as soon as a round fails to improve on the incumbent.
        if not round_score > incumbent_score:
            break

        incumbent_score = round_score
        selected.append(round_pick)
        remaining.remove(round_pick)

    return selected, incumbent_score

# Run the greedy forward search on the existing train/test split.
best_features, best_r2 = forward_feature_selection(X_train, y_train, X_test, y_test)

print(f"Best Features: {best_features}")
print(f"Best R-squared with Forward Feature Selection: {best_r2}")

# Refit a forest on the chosen columns only and score it on the held-out rows.
X_train_forward = X_train[best_features]
X_test_forward = X_test[best_features]

model_forward = RandomForestRegressor(n_estimators=100, random_state=42)
model_forward.fit(X_train_forward, y_train)
y_pred_forward = model_forward.predict(X_test_forward)
r2_forward = r2_score(y_test, y_pred_forward)

print(f"R-squared using selected features from Forward Feature Selection: {r2_forward}")

# Backward Feature Elimination

In [None]:
def backward_feature_elimination(X_train, y_train, X_test, y_test):
    """Greedy backward elimination of columns for a random forest regressor.

    The full column set is scored first and serves as the baseline. Each
    round then tries removing every remaining column in turn; the removal
    giving the highest R-squared is applied only if it improves on the best
    score reached so far. The search stops when no removal helps or when a
    single column is left (a model cannot be fitted on zero columns).

    Note that ``X_test``/``y_test`` drive the elimination itself, so the
    returned score is measured on the same data that chose the features.

    Args:
        X_train, y_train: rows used to fit each candidate model.
        X_test, y_test: rows used to score each candidate model.

    Returns:
        (features, score): the surviving column names in their original
        order, and the R-squared obtained with exactly that set. For a frame
        without columns the result is ``([], 0)``.
    """
    def _score(columns):
        # Fit on the training rows and score on the evaluation rows.
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train[columns], y_train)
        return r2_score(y_test, model.predict(X_test[columns]))

    features = list(X_train.columns)
    if not features:
        return features, 0

    # Baseline: without it, the first round would always discard a column,
    # even when the full set is the best-performing one.
    best_score = _score(features)

    while len(features) > 1:
        worst_feature = None

        for feature in features:
            current_features = [f for f in features if f != feature]
            score = _score(current_features)

            if score > best_score:
                best_score = score
                worst_feature = feature

        # Compare against None explicitly: a column label such as 0 or ''
        # is falsy but still a perfectly valid feature to remove.
        if worst_feature is None:
            break
        features.remove(worst_feature)

    return features, best_score

# Run the greedy backward search on the existing train/test split.
best_features_backward, best_r2_backward = backward_feature_elimination(
    X_train, y_train, X_test, y_test
)

print(f"Best Features Backward: {best_features_backward}")
print(f"Best R-squared with Backward Feature Elimination: {best_r2_backward}")

# Refit a forest on the surviving columns only and score it on the held-out rows.
X_train_backward = X_train[best_features_backward]
X_test_backward = X_test[best_features_backward]

model_backward = RandomForestRegressor(n_estimators=100, random_state=42)
model_backward.fit(X_train_backward, y_train)
y_pred_backward = model_backward.predict(X_test_backward)
r2_backward = r2_score(y_test, y_pred_backward)

print(f"R-squared using selected features from Backward Feature Elimination: {r2_backward}")

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a forest on all training columns to read off its feature importances.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# One importance value per training column, in column order.
feature_importances = rf_model.feature_importances_

# Pair each column name with its importance, most important first.
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

print("Feature Importances for Predicting Housing Prices:")
print(feature_importance_df)