In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
import pickle

# --- Configuration -----------------------------------------------------------
# Everything a reader might want to tune lives here rather than inline below.
DATA_PATH = "ResaleData.csv"
MODEL_PATH = "model_pipeline.pkl"
DATE_FORMAT = "%Y-%m"   # layout of the 'month' column, e.g. "2023-07"
TEST_SIZE = 0.2
RANDOM_STATE = 42       # fixed seed so the train/test split is reproducible

# --- Load --------------------------------------------------------------------
data = pd.read_csv(DATA_PATH)
print("Columns in CSV:", data.columns.tolist())

# --- Feature engineering -----------------------------------------------------
# Parse the 'month' column and derive numeric year / month-of-year features.
# errors='coerce' turns values that do not match DATE_FORMAT into NaT instead
# of raising, so 'year' and 'month_num' become NaN for those rows; they are
# filtered out explicitly further down.
data['month'] = pd.to_datetime(data['month'], format=DATE_FORMAT, errors='coerce')
data['year'] = data['month'].dt.year
data['month_num'] = data['month'].dt.month

# Normalise text columns (trim whitespace, upper-case) so that e.g. " Bedok"
# and "BEDOK" end up as the same one-hot category.
data['town'] = data['town'].str.strip().str.upper()
data['flat_type'] = data['flat_type'].str.strip().str.upper()

# Define feature and target columns.
features = ['town', 'flat_type', 'year', 'month_num']
target = 'resale_price'

# --- Validate ----------------------------------------------------------------
# LinearRegression refuses NaN input, and StandardScaler passes NaN through
# unchanged, so a single unparseable date or missing price would otherwise
# surface as an opaque error inside pipeline.fit(). Drop such rows up front and
# say how many were removed. `data` itself is left untouched.
data_clean = data.dropna(subset=features + [target])
n_dropped = len(data) - len(data_clean)
if n_dropped:
    print(f"Dropped {n_dropped} of {len(data)} rows with missing or unparseable values")
if data_clean.empty:
    raise ValueError(
        "No usable rows left after removing missing values; "
        "check that the 'month' column matches the format " + repr(DATE_FORMAT)
    )

X = data_clean[features]
y = data_clean[target]

# --- Preprocessing + model ---------------------------------------------------
categorical_features = ['town', 'flat_type']
numeric_features = ['year', 'month_num']

# Categorical columns are one-hot encoded; handle_unknown='ignore' makes a
# category unseen during fit encode as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Numeric columns are standardised (zero mean, unit variance).
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Route each group of columns to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Full pipeline: preprocessing followed by an ordinary least-squares regressor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# --- Train / evaluate --------------------------------------------------------
# Hold out a test set so the score below is measured on rows unseen during fit.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

pipeline.fit(X_train, y_train)

# Pipeline.score delegates to the regressor, which reports R^2.
score = pipeline.score(X_test, y_test)
print("Test R^2 score:", score)

# --- Persist -----------------------------------------------------------------
# Save the entire pipeline (preprocessing + model) so that inference code can
# feed it raw feature columns directly.
# NOTE: only unpickle this file from a trusted location; pickle can execute
# arbitrary code on load.
with open(MODEL_PATH, "wb") as f:
    pickle.dump(pipeline, f)

print("Model pipeline saved to model_pipeline.pkl")



Columns in CSV: ['month', 'town', 'flat_type', 'resale_price']
Test R^2 score: 0.6853270309966333
Model pipeline saved to model_pipeline.pkl


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
import pickle

def _add_model_columns(frame, date_format="%Y-%m"):
    """Return a new frame with the columns the model needs.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'month', 'town' and 'flat_type' columns.
    date_format : str
        strftime-style layout of the 'month' column (e.g. "2023-07").

    Returns
    -------
    pandas.DataFrame
        A copy of ``frame`` in which 'month' is parsed to datetime, 'town' and
        'flat_type' are stripped and upper-cased, and 'year' / 'month_num' are
        appended. Values of 'month' that do not match ``date_format`` become
        NaT (and therefore NaN in 'year' / 'month_num') because parsing uses
        errors='coerce'.
    """
    parsed_month = pd.to_datetime(frame['month'], format=date_format, errors='coerce')
    return frame.assign(
        month=parsed_month,
        year=parsed_month.dt.year,
        month_num=parsed_month.dt.month,
        town=frame['town'].str.strip().str.upper(),
        flat_type=frame['flat_type'].str.strip().str.upper(),
    )


# Load the dataset and show which columns the CSV actually provides.
data = pd.read_csv("ResaleData.csv")
print("Columns in CSV:", data.columns.tolist())

# Derive date features and normalise the text columns in one step.
data = _add_model_columns(data)

# Define feature and target columns.
features = ['town', 'flat_type', 'year', 'month_num']
target = 'resale_price'

# Rows whose date failed to parse (or that lack a town, flat type or price)
# carry NaN. LinearRegression rejects NaN and StandardScaler does not remove
# it, so without this filter a single bad row makes pipeline.fit() fail with
# an error that does not mention the date parsing. Keep `data` as is and train
# on the filtered view.
data_clean = data.dropna(subset=features + [target])
rows_removed = len(data) - len(data_clean)
if rows_removed:
    print(f"Dropped {rows_removed} of {len(data)} rows with missing or unparseable values")
if data_clean.empty:
    raise ValueError(
        "No usable rows left after removing missing values; "
        "check that the 'month' column matches the format '%Y-%m'"
    )

X = data_clean[features]
y = data_clean[target]

# Column groups and the transformer applied to each.
categorical_features = ['town', 'flat_type']
numeric_features = ['year', 'month_num']

# One-hot encode categoricals; categories unseen at fit time encode as zeros.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Standardise numeric features.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine the transformers into a ColumnTransformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Preprocessing followed by a linear regression, fitted as a single estimator.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# 80/20 split with a fixed seed so the reported score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train on the training portion only.
pipeline.fit(X_train, y_train)

# R^2 on the held-out rows.
score = pipeline.score(X_test, y_test)
print("Test R^2 score:", score)

# Save the entire pipeline (preprocessing + model) to a pickle file.
# NOTE: only load this file back from a trusted location; unpickling can
# execute arbitrary code.
with open("model_pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)

print("Model pipeline saved to model_pipeline.pkl")


Columns in CSV: ['month', 'town', 'flat_type', 'resale_price']
Test R^2 score: 0.6853270309966333
Model pipeline saved to model_pipeline.pkl
