# **Step 1: Data cleaning**

In [None]:
# import libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

import xgboost as xgb

In [None]:
# import dataset

# Machine-specific absolute Windows path to the input CSV; edit it when running elsewhere.
file_path = r"C:\Users\sandy\Desktop\cleaned_output.csv"
# Load the CSV into a DataFrame with pandas' default parsing options.
df = pd.read_csv(file_path)

## 1a: Remove rows with a missing target value (missing feature values are imputed later in the pipeline; numeric columns use the median because their values are not normally distributed)

In [None]:
# Name of the column the models learn to predict.
target = "price"

# Rows without a target value cannot be used for fitting or scoring, so drop them.
df = df.dropna(subset=[target])

# Separate the label from the predictor columns.
y = df.loc[:, target]
X = df.drop(columns=target)



## 1b: Limit the number of rooms to 15

In [None]:
# Fail loudly if the expected column is missing rather than silently skipping the cap.
if "number_rooms" not in df.columns:
    raise KeyError("Column 'number_rooms' was not found in the dataset.")

# Cap room counts at 15; values at or below the cap are left unchanged.
df["number_rooms"] = df["number_rooms"].clip(upper=15)

# X was split off from df before this step, and DataFrame.drop returns a new
# object, so capping df alone never reaches the feature matrix. Rebuild X from
# the capped frame so the models actually train on the limited values.
# (y only holds the target column and is unaffected by the cap.)
X = df.drop(columns=[target])

## 1c: Identify categorical and numerical columns (categorical columns are one-hot encoded into numeric indicator columns later in the pipeline)

In [None]:
# Partition the predictor columns by dtype. Only the dtypes listed here are
# picked up; a column of any other dtype ends up in neither list.
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object", "category"]).columns

# Echo both lists so the split can be checked by eye.
for _label, _columns in (
    ("Numeric features:", numeric_features),
    ("Categorical features:", categorical_features),
):
    print(_label, list(_columns))

# **Step 2: Data splitting (train and test split)**

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = _split

# Report the resulting partition sizes.
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# **Step 3: Pipeline preprocessing (imputation, one-hot encoding, standardization)**

In [None]:
# Numeric columns: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fitting are ignored at transform time
# instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own transformer. No `remainder` is set,
# so ColumnTransformer's default applies to columns in neither group.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Model definitions

# Candidate regressors keyed by display name. The keys are reused to look up
# each model's hyperparameter grid and to label printed results, and the
# insertion order here is the order in which models are tuned and reported.
# Fixed random_state values keep the stochastic models reproducible.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=300, random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Support Vector Regressor": SVR(),
    "XGBoost": xgb.XGBRegressor(n_estimators=300, random_state=42, verbosity=0)
}
 

# **Step 4: Hyperparameter tuning and cross-validation**

In [None]:
# Hyperparameter candidates per model, keyed by the same names as `models`.
# The "regressor__" prefix addresses the pipeline step named "regressor".
param_grids = {
    # Linear regression has nothing to tune here, so its grid is empty.
    "Linear Regression": {},

    "Decision Tree": {
        "regressor__max_depth": [5, 10, 20, None],
        "regressor__min_samples_split": [2, 5, 10]
    },

    # Large model → its grid is sampled with RandomizedSearchCV, not searched exhaustively
    "Random Forest": {
        "regressor__n_estimators": [100, 200, 300, 500],
        "regressor__max_depth": [10, 20, 30, None],
        "regressor__min_samples_split": [2, 5, 10]
    },

    "Support Vector Regressor": {
        "regressor__C": [0.1, 1, 10],
        "regressor__epsilon": [0.01, 0.1, 0.2],
        "regressor__kernel": ["rbf", "linear"]
    },

    # Large model → also sampled with RandomizedSearchCV
    "XGBoost": {
        "regressor__n_estimators": [200, 300, 400],
        "regressor__learning_rate": [0.05, 0.1, 0.2],
        "regressor__max_depth": [3, 5, 7]
    }
}



# Shuffled 5-fold cross-validation with a fixed seed, shared by every search
# so all models are scored on the same folds.
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

pipelines = {}    # model name -> best pipeline found by its search
best_params = {}  # model name -> winning hyperparameter combination

# Models whose grids are sampled at random rather than searched exhaustively.
randomized_models = ("Random Forest", "XGBoost")

# Settings common to both search flavours, kept in one place so they cannot drift apart.
search_kwargs = {
    "cv": cv_strategy,
    "scoring": "neg_root_mean_squared_error",
    "n_jobs": -1,
    "verbose": 1,
}

print("\n===== Running Hyperparameter Tuning =====")

for name, model in models.items():
    print(f"\n🔍 Tuning {name}...")

    # Preprocessing lives inside the pipeline, so it is fitted together with
    # the model on whatever data the search passes to it.
    pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("regressor", model)
    ])

    param_grid = param_grids[name]

    if name in randomized_models:
        # Try 10 sampled combinations instead of the full grid to save time.
        search = RandomizedSearchCV(
            estimator=pipe,
            param_distributions=param_grid,
            n_iter=10,
            random_state=42,
            **search_kwargs
        )
    else:
        # Small grids are searched exhaustively.
        search = GridSearchCV(
            estimator=pipe,
            param_grid=param_grid,
            **search_kwargs
        )

    search.fit(X_train, y_train)

    pipelines[name] = search.best_estimator_
    best_params[name] = search.best_params_

    print(f"Best Parameters for {name}:\n{search.best_params_}\n")


# **Step 5: Evaluate models (linear regression, decision tree, random forest, support vector regressor, XGBoost)**

In [None]:
def evaluate(model, X_test, y_test, name):
    """Print RMSE, MAE and R² for *model* on the given data and return its predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: True target values corresponding to ``X_test``.
        name: Label printed in the report header.

    Returns:
        The predictions produced by ``model.predict(X_test)``.
    """
    preds = model.predict(X_test)

    # RMSE is the square root of the mean squared error, so it is in the
    # same units as the target.
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print(f"\n===== {name} =====")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAE:  {mae:.3f}")
    print(f"R²:   {r2:.3f}")

    return preds



# Score every tuned pipeline on the held-out test set, keeping each model's
# predictions under its display name (metrics are printed by `evaluate`).
results = {
    name: evaluate(pipe, X_test, y_test, name)
    for name, pipe in pipelines.items()
}


# **Step 6: Use the pipeline for predictions based on the first 5 rows**

In [None]:
# Take the first five rows of the test set as a small demonstration sample.
sample_X = X_test.iloc[:5]

# Raw rows go straight in: each pipeline applies its own preprocessing before predicting.
for name, pipe in pipelines.items():
    preds = pipe.predict(sample_X)
    print(f"\nPredictions by {name}:\n{preds}")