In [829]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.impute import SimpleImputer



In [830]:
# Load the dataset
housing = pd.read_csv("dataset.csv")
target_name = "Price"
data = housing.drop(columns=target_name)
target = housing[target_name]

print(f"Dataset size: {data.shape}")

Dataset size: (13580, 20)


### Columns description
**Rooms**: Number of rooms
**Price**: Price in dollars
**Method**: S - property sold; SP - property sold prior; PI - property passed in; PN - sold prior not disclosed; SN - sold not disclosed; NB - no bid; VB - vendor bid; W - withdrawn prior to auction; SA - sold after auction; SS - sold after auction price not disclosed. N/A - price or highest bid not available.
**Type**: br - bedroom(s); h - house,cottage,villa, semi,terrace; u - unit, duplex; t - townhouse; dev site - development site; o res - other residential.
**SellerG**: Real Estate Agent
**Date**: Date sold
**Distance**: Distance from CBD
**Regionname**: General Region (West, North West, North, North east …etc)
**Propertycount**: Number of properties that exist in the suburb.
**Bedroom2**: Scraped # of Bedrooms (from different source)
**Bathroom**: Number of Bathrooms
**Car**: Number of carspots
**Landsize**: Land Size
**BuildingArea**: Building Size
**CouncilArea**: Governing council for the area

In [831]:
numerical_features = ["Rooms", "Distance", "Propertycount", "Bedroom2", "Bathroom", "Car", "Landsize", "BuildingArea"]

categorical_features = ["Type", "SellerG", "Regionname", "CouncilArea"]

data = data[numerical_features + categorical_features]
data.head()

Unnamed: 0,Rooms,Distance,Propertycount,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Type,SellerG,Regionname,CouncilArea
0,2,2.5,4019.0,2.0,1.0,1.0,202.0,,h,Biggin,Northern Metropolitan,Yarra
1,2,2.5,4019.0,2.0,1.0,0.0,156.0,79.0,h,Biggin,Northern Metropolitan,Yarra
2,3,2.5,4019.0,3.0,2.0,0.0,134.0,150.0,h,Biggin,Northern Metropolitan,Yarra
3,3,2.5,4019.0,3.0,2.0,1.0,94.0,,h,Biggin,Northern Metropolitan,Yarra
4,4,2.5,4019.0,3.0,1.0,2.0,120.0,142.0,h,Nelson,Northern Metropolitan,Yarra


In [832]:
# Building area has missing values we check the percentage of missing values
data["BuildingArea"].isna().mean() * 100

np.float64(47.49631811487481)

In [833]:
# We can see that 47% of the values are missing, if we drop the rows with missing values we will lose a lot of data
# We can use the mean value to fill the missing values
data["BuildingArea"] = data["BuildingArea"].fillna(data["BuildingArea"].mean())
data.head()

Unnamed: 0,Rooms,Distance,Propertycount,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Type,SellerG,Regionname,CouncilArea
0,2,2.5,4019.0,2.0,1.0,1.0,202.0,151.96765,h,Biggin,Northern Metropolitan,Yarra
1,2,2.5,4019.0,2.0,1.0,0.0,156.0,79.0,h,Biggin,Northern Metropolitan,Yarra
2,3,2.5,4019.0,3.0,2.0,0.0,134.0,150.0,h,Biggin,Northern Metropolitan,Yarra
3,3,2.5,4019.0,3.0,2.0,1.0,94.0,151.96765,h,Biggin,Northern Metropolitan,Yarra
4,4,2.5,4019.0,3.0,1.0,2.0,120.0,142.0,h,Nelson,Northern Metropolitan,Yarra


In [834]:
# Landsize and BuildingArea seem to have a long tail distribution
data[["Landsize", "BuildingArea"]].describe()

Unnamed: 0,Landsize,BuildingArea
count,13580.0,13580.0
mean,558.416127,151.96765
std,3990.669241,392.002962
min,0.0,0.0
25%,177.0,122.0
50%,440.0,151.96765
75%,651.0,151.96765
max,433014.0,44515.0


In [835]:
# If we remove the outliers we can see the distribution more clearly but we will lose a lot of data so we will keep the outliers

In [836]:
# We check all the columns with missing values
data.isna().mean() * 100

Rooms             0.000000
Distance          0.000000
Propertycount     0.000000
Bedroom2          0.000000
Bathroom          0.000000
Car               0.456554
Landsize          0.000000
BuildingArea      0.000000
Type              0.000000
SellerG           0.000000
Regionname        0.000000
CouncilArea      10.081001
dtype: float64

In [837]:
# We can see that Car column has missing values, we can use the most frequent value to fill the missing values
data["Car"].value_counts()

data["Car"] = data["Car"].fillna(data["Car"].mode()[0])

data.isna().mean() * 100

Rooms             0.000000
Distance          0.000000
Propertycount     0.000000
Bedroom2          0.000000
Bathroom          0.000000
Car               0.000000
Landsize          0.000000
BuildingArea      0.000000
Type              0.000000
SellerG           0.000000
Regionname        0.000000
CouncilArea      10.081001
dtype: float64

#### We create a pipeline to apply the transformations to the data

In [838]:
numerical_features_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
], verbose=True)

categorical_features_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder())
], verbose=True)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_features_transformer, numerical_features),
        #("cat", categorical_features_transformer, categorical_features)
    ]
)

# We also need to define the classifier, which in this case is a Linear Regression model
classifier = LinearRegression()

model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", classifier)
])

# Now we have a pipeline that can apply the transformations to the data

#### Splitting the data into training and testing sets

In [839]:
data.shape

(13580, 12)

In [840]:
target.shape

(13580,)

In [841]:
test_size = 0.2 # 20% of the data will be used for testing
X_train, X_test, y_train, y_test = train_test_split(data[numerical_features], target, test_size=test_size)
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")
print(f"Training target size: {y_train.shape}")
print(f"Testing target size: {y_test.shape}")

Training set size: (10864, 8)
Testing set size: (2716, 8)
Training target size: (10864,)
Testing target size: (2716,)


#### Applying the transformations to the training and testing sets

In [842]:
model.fit(X_train, y_train)

[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s


#### Evaluating the model

In [843]:
X_train.head()

Unnamed: 0,Rooms,Distance,Propertycount,Bedroom2,Bathroom,Car,Landsize,BuildingArea
8211,3,9.9,7485.0,3.0,1.0,1.0,488.0,151.96765
1555,4,7.8,8920.0,4.0,2.0,1.0,652.0,189.0
8437,2,11.2,5457.0,2.0,1.0,0.0,269.0,124.0
10433,4,22.7,11806.0,4.0,2.0,4.0,702.0,140.0
7814,4,8.0,9264.0,4.0,2.0,2.0,639.0,151.96765


In [844]:
y_train.head()

8211      806000.0
1555     2650000.0
8437     1260000.0
10433     805000.0
7814     1720000.0
Name: Price, dtype: float64

In [845]:
from sklearn.model_selection import GridSearchCV

# param_grid = [
#     {"preprocessor__num__imputer__strategy": ["mean"]},
#     # {"preprocessor__cat__imputer__strategy": ["most_frequent"]}
# ]
#
# cv = GridSearchCV(model, param_grid, cv=5, scoring="neg_mean_squared_error",
#                            verbose=2, n_jobs=8)
#
# cv.fit(X_train, y_train)

cv = cross_validate(model, X_train, y_train, cv=5, scoring="neg_mean_squared_error", n_jobs=8, verbose=2)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done   5 out of   5 | elapsed:    0.0s finished


In [846]:
# We can see the best parameters
# cv.best_params_
cv


{'fit_time': array([0.01012707, 0.01164865, 0.01112938, 0.00904012, 0.01155949]),
 'score_time': array([0.00452256, 0.00451851, 0.00451922, 0.0050602 , 0.00351214]),
 'test_score': array([-2.83456742e+11, -2.84345466e+11, -2.15663247e+11, -6.80348310e+11,
        -2.10033329e+11])}