In [None]:
#Importing necessary libraries for data handling and model building
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [None]:
# Read the raw records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="Dataset.csv")

In [None]:
# Discard every record that has no resale price (the value used as target later)
df = df.dropna(subset=['resale_price'])

In [None]:
# Restrict the data to 3 ROOM, 4 ROOM and 5 ROOM flats
df = df.loc[
    df['flat_type'].isin(['3 ROOM', '4 ROOM', '5 ROOM'])
]

In [None]:
# Define input features (X) and target variable (y)
X = df[['floor_area_sqm', 'flat_type', 'town']]
y = df['resale_price']
# Preview the first rows. `head` must be *called*: a bare `df.head` only
# evaluates to the bound method, so the cell shows its repr instead of a
# short preview of the frame.
df.head()

<bound method NDFrame.head of             month        town flat_type block       street_name storey_range  \
0        1/1/2000  ANG MO KIO    3 ROOM   170  ANG MO KIO AVE 4     07 TO 09   
1        1/1/2000  ANG MO KIO    3 ROOM   174  ANG MO KIO AVE 4     04 TO 06   
2        1/1/2000  ANG MO KIO    3 ROOM   216  ANG MO KIO AVE 1     07 TO 09   
3        1/1/2000  ANG MO KIO    3 ROOM   215  ANG MO KIO AVE 1     07 TO 09   
4        1/1/2000  ANG MO KIO    3 ROOM   218  ANG MO KIO AVE 1     07 TO 09   
...           ...         ...       ...   ...               ...          ...   
458998  1/12/2014      YISHUN    5 ROOM   612      YISHUN ST 61     04 TO 06   
458999  1/12/2014      YISHUN    5 ROOM   713      YISHUN ST 71     01 TO 03   
459000  1/12/2014      YISHUN    5 ROOM   757      YISHUN ST 72     01 TO 03   
459001  1/12/2014      YISHUN    5 ROOM   819      YISHUN ST 81     01 TO 03   
459002  1/12/2014      YISHUN    5 ROOM   816      YISHUN ST 81     10 TO 12   

        f

In [None]:
# Group the model inputs by kind: one numeric column, two categorical columns
numeric_features = [
    'floor_area_sqm',
]
categorical_features = [
    'flat_type',
    'town',
]

In [None]:
# One-hot encoder for the categorical columns, configured with
# handle_unknown='ignore'
categorical_transformer = OneHotEncoder(
    handle_unknown="ignore",
)

In [None]:
# Column-wise preprocessing: encode the categorical columns with the one-hot
# transformer and hand the numeric columns through unchanged
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_features),
    ('num', 'passthrough', numeric_features),
])


In [None]:
# Chain the preprocessing step and a linear regression into one estimator
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])


In [None]:
# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Train the pipeline on the training split
model.fit(X_train, y=y_train)


In [None]:
# Build a one-row frame describing the flat to price
sample = pd.DataFrame([
    {'floor_area_sqm': 90, 'flat_type': '4 ROOM', 'town': 'ANG MO KIO'},
])
# Run the fitted pipeline on that single row
predicted_price = model.predict(sample)
# Report the estimate as a dollar amount with thousands separators
print(f"Predicted resale price: ${predicted_price[0]:,.2f}")


Predicted resale price: $312,375.27
