In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings emitted by the modelling cells below.
warnings.simplefilter("ignore")

In [2]:
# Load the housing table; the relative path is resolved against the kernel's working directory.
csv_path = 'melb_data.csv'
df = pd.read_csv(csv_path)

In [3]:
# Frequency of each distinct value in the categorical `Type` column.
type_counts = df['Type'].value_counts()
type_counts

Unnamed: 0_level_0,count
Type,Unnamed: 1_level_1
h,9449
u,3017
t,1114


In [4]:
# Number of missing entries per column (`isna` is the same check as `isnull`).
df.isna().sum()

Unnamed: 0,0
Suburb,0
Address,0
Rooms,0
Type,0
Price,0
Method,0
SellerG,0
Date,0
Distance,0
Postcode,0


In [5]:
# Columns kept for modelling: six predictors followed by the target.
# 'Lattitude' / 'Longtitude' are spelled exactly as the dataset's headers spell them.
columns = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
    'Type',
    'Price',  # regression target, kept last
]

In [6]:
# Narrow the frame to the modelling columns, in the order given by `columns`.
df = df.loc[:, columns]

In [7]:
# Bare last expression: the notebook renders the current contents of `df` as the cell output.
df

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude,Type,Price
0,2,1.0,202.0,-37.79960,144.99840,h,1480000.0
1,2,1.0,156.0,-37.80790,144.99340,h,1035000.0
2,3,2.0,134.0,-37.80930,144.99440,h,1465000.0
3,3,2.0,94.0,-37.79690,144.99690,h,850000.0
4,4,1.0,120.0,-37.80720,144.99410,h,1600000.0
...,...,...,...,...,...,...,...
13575,4,2.0,652.0,-37.90562,145.16761,h,1245000.0
13576,3,2.0,333.0,-37.85927,144.87904,h,1031000.0
13577,3,2.0,436.0,-37.85274,144.88738,h,1170000.0
13578,4,1.0,866.0,-37.85908,144.89299,h,2500000.0


In [46]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.model_selection import cross_val_score


In [31]:
# Feature matrix: every column except the target, selected by name instead of the
# positional slice 0:6 so a change in column order cannot leak 'Price' into X.
# Column order is preserved, so downstream positional or named selectors see the same layout.
X = df.loc[:, df.columns != 'Price']

In [41]:
# Regression target as a 1-D Series; it is passed to the estimators as-is (no reshape call).
y = df.loc[:, 'Price']

AttributeError: 'Series' object has no attribute 'reshape'

In [33]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Per-column preprocessing applied before every model.
# Columns are addressed by name rather than by position, so the transformer keeps
# targeting the right features even if the column order of X changes. This requires
# X to be a DataFrame that carries these column names.
t1 = ColumnTransformer([
    # Rescale land size into the [0, 1] range.
    ('Scaling', MinMaxScaler(), ['Landsize']),
    # Standardise the coordinates (names spelled as in the dataset).
    ('scaler', StandardScaler(), ['Lattitude', 'Longtitude']),
    # One-hot encode the categorical column; a category unseen during fit is encoded
    # as all zeros instead of raising while transforming a validation fold.
    ('ohe', OneHotEncoder(handle_unknown='ignore'), ['Type']),
], remainder='passthrough')  # remaining columns are forwarded unchanged

In [35]:
def create_pipeline(algo):
  """Build a two-step Pipeline: the shared preprocessor `t1`, then `algo`.

  Parameters
  ----------
  algo : estimator
      Model placed as the final step. The step is named 'classifier', although the
      notebook passes regressors to it.

  Returns
  -------
  Pipeline
      Unfitted pipeline with steps 't1' and 'classifier'. The module-level `t1`
      object itself is used, not a copy.
  """
  steps = [('t1', t1), ('classifier', algo)]
  return Pipeline(steps)

In [50]:
from sklearn import ensemble, gaussian_process, linear_model, neighbors, svm, tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,mean_squared_error

# Shared seed so every estimator that draws random numbers gives the same
# cross-validation scores on a fresh Restart & Run All.
RANDOM_STATE = 42

# Candidate regressors to compare, all at library defaults apart from the seed.
# Estimators without a seed argument below are left exactly as they were.
algorithms = [
    # Ensemble Methods
    ensemble.AdaBoostRegressor(random_state=RANDOM_STATE),
    ensemble.BaggingRegressor(random_state=RANDOM_STATE),
    ensemble.ExtraTreesRegressor(random_state=RANDOM_STATE),
    ensemble.GradientBoostingRegressor(random_state=RANDOM_STATE),
    ensemble.RandomForestRegressor(random_state=RANDOM_STATE),

    # Gaussian Processes
    # NOTE(review): the recorded run shows a strongly negative R² for this model.
    gaussian_process.GaussianProcessRegressor(),

    # GLM
    linear_model.LinearRegression(),
    linear_model.RidgeCV(),
    linear_model.LassoCV(),
    linear_model.ElasticNetCV(),
    linear_model.SGDRegressor(random_state=RANDOM_STATE),

    # Nearest Neighbor
    neighbors.KNeighborsRegressor(),

    # SVM
    svm.SVR(),
    svm.NuSVR(),
    svm.LinearSVR(random_state=RANDOM_STATE),

    # Trees
    tree.DecisionTreeRegressor(random_state=RANDOM_STATE),
    tree.ExtraTreeRegressor(random_state=RANDOM_STATE),

    # XGBoost
    XGBRegressor(random_state=RANDOM_STATE),
]


In [56]:
from sklearn.model_selection import cross_validate

# Short metric name -> scorer string; fold results are read back as 'test_<name>'.
scoring = dict(r2='r2', mae='neg_mean_absolute_error')

# 5-fold cross-validation on the training split for each candidate model,
# reporting the fold-averaged R² and MAE.
for algo in algorithms:
    pipeline = create_pipeline(algo)
    scores = cross_validate(
        pipeline,
        X_train,
        y_train,
        scoring=scoring,
        cv=5,
        n_jobs=-1,
    )

    mean_r2 = scores['test_r2'].mean()
    # The scorer reports negated MAE; abs() turns the mean back into a positive error.
    mean_mae = abs(scores['test_mae'].mean())

    report = [
        f"Results for {algo.__class__.__name__}:",
        f"  Mean R² Score: {mean_r2:.4f}",
        f"  Mean MAE: {mean_mae:.4f}",
        "-" * 50,
    ]
    print("\n".join(report))


Results for AdaBoostRegressor:
  Mean R² Score: 0.2629
  Mean MAE: 435118.6535
--------------------------------------------------
Results for BaggingRegressor:
  Mean R² Score: 0.7588
  Mean MAE: 184490.2187
--------------------------------------------------
Results for ExtraTreesRegressor:
  Mean R² Score: 0.7607
  Mean MAE: 179278.8348
--------------------------------------------------
Results for GradientBoostingRegressor:
  Mean R² Score: 0.7256
  Mean MAE: 205461.1756
--------------------------------------------------
Results for RandomForestRegressor:
  Mean R² Score: 0.7726
  Mean MAE: 176169.3325
--------------------------------------------------
Results for GaussianProcessRegressor:
  Mean R² Score: -45645.1393
  Mean MAE: 10813733.9515
--------------------------------------------------
Results for LinearRegression:
  Mean R² Score: 0.3856
  Mean MAE: 331548.8443
--------------------------------------------------
Results for RidgeCV:
  Mean R² Score: 0.3855
  Mean MAE: 331550.

In [62]:
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor

# Base models: three tree-based regressors sharing the same size and seed.
tree_kwargs = dict(n_estimators=150, random_state=42)
rf = RandomForestRegressor(**tree_kwargs)
et = ExtraTreesRegressor(**tree_kwargs)
xgb = XGBRegressor(**tree_kwargs)

# Combine the three base models in a single voting ensemble.
base_estimators = [('rf', rf), ('et', et), ('xgb', xgb)]
voting_regressor = VotingRegressor(estimators=base_estimators)

In [63]:
from sklearn.model_selection import cross_validate

# Preprocessing followed by the voting ensemble.
preprocess_step = ('preprocessing', t1)
ensemble_step = ('voting_regressor', voting_regressor)
pipeline = Pipeline(steps=[preprocess_step, ensemble_step])

# Fit on the full training split so a trained pipeline is available afterwards.
pipeline.fit(X_train, y_train)

# 5-fold cross-validation of the same pipeline on the training split.
scoring = dict(r2='r2', mae='neg_mean_absolute_error')
scores = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
)

# Fold-averaged metrics; the MAE scorer is negated, hence abs().
mean_r2 = scores['test_r2'].mean()
mean_mae = abs(scores['test_mae'].mean())
print(f"Mean R² Score: {mean_r2:.4f}")
print(f"Mean MAE: {mean_mae:.4f}")

Mean R² Score: 0.7878
Mean MAE: 169983.2822
