## Imports

In [10]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

## Processing data

In [11]:
df = pd.read_csv("resale_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223415 entries, 0 to 223414
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   month                223415 non-null  object 
 1   town                 223415 non-null  object 
 2   flat_type            223415 non-null  object 
 3   block                223415 non-null  object 
 4   street_name          223415 non-null  object 
 5   storey_range         223415 non-null  object 
 6   floor_area_sqm       223415 non-null  float64
 7   flat_model           223415 non-null  object 
 8   lease_commence_date  223415 non-null  int64  
 9   remaining_lease      223415 non-null  object 
 10  resale_price         223415 non-null  float64
dtypes: float64(2), int64(1), object(8)
memory usage: 18.7+ MB


In [12]:
df["month"] = pd.to_datetime(df["month"], format="%Y-%m")

## Baseline Linear Regression

To predict the resale price

In [None]:
df["log_price"] = np.log(df["resale_price"])

#features
X = df[["town","flat_type","storey_range","floor_area_sqm", "remaining_lease"]]

#predictors
y = df[["log_price"]]

cat_cols = df["town","flat_type","storey_range"]
num_cols = df["floor_area_sqm", "remaining_lease"]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols)
    ]
)

model = Pipeline(steps=[("preprocess", preprocess),("regressor", LinearRegression())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)
pred = model.predict(X_test)

mae = mean_absolute_error(y_test, pred)
rmse = mean_squared_error(y_test, pred, squared=False)

print("MAE:", mae)
print("RMSE:", rmse)

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed