In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the housing table from the CSV file sitting next to the notebook
data = pd.read_csv(filepath_or_buffer='Housing_new.csv')

In [3]:
# Column groups consumed by the preprocessing step:
# numeric columns are standardized, the remaining flag columns are one-hot encoded.
numerical_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
categorical_features = [
    'mainroad_yes',
    'guestroom_yes',
    'basement_yes',
    'hotwaterheating_yes',
    'airconditioning_yes',
    'prefarea_yes',
    'furnishingstatus_semi-furnished',
    'furnishingstatus_unfurnished',
]

In [7]:
# Per-column-group transformers used by the ColumnTransformer below.
# Numeric columns: standardize with StandardScaler.
numerical_transformer = StandardScaler()
# Categorical columns: one-hot encode, dropping the first category of each
# feature to avoid multicollinearity.
categorical_transformer = OneHotEncoder(drop='first')

In [8]:
# Combine transformers into a ColumnTransformer:
#   'num' -> StandardScaler on the numerical columns
#   'cat' -> OneHotEncoder on the categorical columns
# sparse_threshold=0 forces a dense array as output. OneHotEncoder produces a
# sparse matrix by default, and with the default threshold the combined result
# can come back sparse, which cannot be wrapped directly in a DataFrame with
# named columns later on.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

In [9]:
# Fit the preprocessor on `data` and transform it in one step.
# NOTE(review): the fit uses every row of `data`, including rows that the
# later train/test splits place in the test set; consider fitting on the
# training rows only so no test-set statistics reach the preprocessing step.
data_prepared = preprocessor.fit_transform(data)

In [10]:
# fit_transform returned a plain array, so the column labels have to be rebuilt.
# Ask the fitted one-hot encoder for the names of the columns it generated ...
fitted_encoder = preprocessor.named_transformers_['cat']
one_hot_features = fitted_encoder.get_feature_names_out(categorical_features)
# ... and append them to the numeric names, matching the transformer order
# ('num' first, then 'cat').
all_features = [*numerical_features, *one_hot_features]

In [11]:
# Convert the numpy array back into a DataFrame with readable column names.
# Reuse the row labels of `data` so the prepared features and the target taken
# from `data` share the same index instead of relying on positional order only.
data_prepared_df = pd.DataFrame(data_prepared, columns=all_features, index=data.index)

In [12]:
# Preview the top five rows of the prepared feature table
print(data_prepared_df.head(n=5))

       area  bedrooms  bathrooms   stories   parking  mainroad_yes_1  \
0 -9.487719  0.709911   0.348534  1.643973  0.013324             1.0   
1 -7.853121  0.709911   2.938145  2.939569  0.720786             1.0   
2 -6.791694  0.001372   0.348534  0.348377  0.013324             1.0   
3 -9.402805  0.709911   0.348534  0.348377  0.720786             1.0   
4 -9.487719  0.709911  -0.946271  0.348377  0.013324             1.0   

   guestroom_yes_1  basement_yes_1  hotwaterheating_yes_1  \
0              0.0             0.0                    0.0   
1              0.0             0.0                    0.0   
2              0.0             1.0                    0.0   
3              0.0             1.0                    0.0   
4              1.0             1.0                    0.0   

   airconditioning_yes_1  prefarea_yes_1  furnishingstatus_semi-furnished_1  \
0                    1.0             1.0                                0.0   
1                    1.0             0.0  

Need to:

Separate Features and Target: We'll split the processed data into features (X) and the target variable (y), where y will be the `price` column from the original dataset, and X will be the processed features from data_prepared_df.

Model Training: We'll train a Linear Regression model using the separated features and target.

Model Evaluation: We'll evaluate the model's performance using the R-squared metric to assess how well our model explains the variance in housing prices.

In [13]:
# 1. Separate Features and Target:

# Import necessary libraries for this section
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

In [14]:
# Features come from the preprocessed frame; the target is the 'price'
# column of the original dataset.
X, y = data_prepared_df, data['price']

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Initialize the Linear Regression model with scikit-learn's default settings;
# it is fitted in a later cell.
lr_model = LinearRegression()

In [17]:
# Fit the linear model on the training portion only
lr_model.fit(X=X_train, y=y_train)

In [18]:
# Generate price predictions for the held-out test rows
y_pred = lr_model.predict(X=X_test)

In [19]:
# Score the predictions against the true test prices:
# mean squared error and the coefficient of determination (R²).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [20]:
# Report both metrics, one per line
print(
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R²): {r2}",
    sep="\n",
)

Mean Squared Error (MSE): 19618864548.135345
R-squared (R²): 0.8161095099370088


Since the R-squared didn't meet requirements, we will explore using the Gradient Boosting Regressor and apply hyperparameter tuning to it.

In [21]:
# Assuming necessary libraries are already imported and data is preprocessed

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [22]:
# Same inputs as for the linear model: preprocessed features and the
# 'price' column of the original dataset as target.
X, y = data_prepared_df, data['price']

In [23]:
# Hold out 20% of the rows for testing.
# NOTE(review): this split uses random_state=1, while the split used for the
# linear regression model used random_state=42, so the two models are scored
# on different test rows and their metrics are not directly comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [24]:
# Initialize the Gradient Boosting Regressor that serves as the base estimator
# for the grid search; random_state is fixed so repeated runs give the same model.
gb_model = GradientBoostingRegressor(random_state=1)

In [25]:
# Exhaustive grid search over the gradient-boosting hyperparameters.
# 3 * 3 * 3 * 2 * 2 = 108 combinations, each scored by 5-fold cross-validated R².
param_grid = dict(
    n_estimators=[100, 200, 300],      # number of boosting stages
    learning_rate=[0.01, 0.1, 0.2],    # shrinkage applied to each tree's contribution
    max_depth=[3, 4, 5],               # maximum depth of each regression tree
    min_samples_split=[2, 4],          # minimum samples needed to split an internal node
    min_samples_leaf=[1, 2],           # minimum samples required at a leaf node
)

# n_jobs=-1 runs the candidate fits in parallel on all available cores
grid_search = GridSearchCV(
    gb_model,
    param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [26]:
# Take the estimator that the grid search exposes as `best_estimator_`,
# i.e. the model for the best-scoring parameter combination.
best_model = grid_search.best_estimator_

In [27]:
# Generate price predictions for the held-out test rows with the tuned model
y_pred = best_model.predict(X=X_test)

In [28]:
# Score the tuned model's predictions against the true test prices:
# mean squared error and the coefficient of determination (R²).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [29]:
# Report both metrics, one per line
print(
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R²): {r2}",
    sep="\n",
)

Mean Squared Error (MSE): 6159217103.016054
R-squared (R²): 0.9605929958471559
