In [4]:
import pandas as pd

# Google Drive location of the dataset, recorded for provenance only.
# Nothing below downloads from this link: the notebook reads a copy of the
# file that must already exist at /data.csv.
file_id = '1Sx7KUA5jnG2o9VLoYAhc_LQvSgZsTrPR'
# Build the share URL from file_id so the id is written in exactly one place
# (the resulting string is identical to the previously hard-coded URL).
file_url = f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"

# Load the dataset from the local CSV file
data = pd.read_csv("/data.csv")


In [5]:
# Report the dataset dimensions as (rows, columns)
dataset_shape = data.shape
print("Shape of the dataset:", dataset_shape)

Shape of the dataset: (4600, 18)


In [6]:
# Show every column name next to the dtype pandas assigned to it
column_dtypes = data.dtypes
print("\nColumn Names and Data Types:", column_dtypes, sep="\n")


Column Names and Data Types:
date              object
price            float64
bedrooms         float64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
street            object
city              object
statezip          object
country           object
dtype: object


In [7]:
# Preview the top of the table (head() with no argument returns five rows)
preview_rows = data.head()
print("\nFirst few rows of the dataset:", preview_rows, sep="\n")








First few rows of the dataset:
                  date      price  bedrooms  bathrooms  sqft_living  sqft_lot  \
0  2014-05-02 00:00:00   313000.0       3.0       1.50         1340      7912   
1  2014-05-02 00:00:00  2384000.0       5.0       2.50         3650      9050   
2  2014-05-02 00:00:00   342000.0       3.0       2.00         1930     11947   
3  2014-05-02 00:00:00   420000.0       3.0       2.25         2000      8030   
4  2014-05-02 00:00:00   550000.0       4.0       2.50         1940     10500   

   floors  waterfront  view  condition  sqft_above  sqft_basement  yr_built  \
0     1.5           0     0          3        1340              0      1955   
1     2.0           0     4          5        3370            280      1921   
2     1.0           0     0          4        1930              0      1966   
3     1.0           0     0          4        1000           1000      1963   
4     1.0           0     0          4        1140            800      1976   

   yr_

In [8]:
# Count the nulls in each column, then express each count as a percentage
# of the total number of rows
missing_values = data.isna().sum()
row_count = len(data)
missing_percentage = missing_values / row_count * 100

In [9]:
# Put the null counts and percentages side by side, keep only the columns
# that actually contain nulls, and list the worst offenders first
missing_summary = pd.concat(
    {'Missing Values': missing_values, 'Percentage': missing_percentage},
    axis=1,
)
has_missing = missing_summary['Missing Values'] > 0
missing_data = missing_summary.loc[has_missing].sort_values(by='Missing Values', ascending=False)

In [10]:
# Show the per-column missing-value summary (empty when no column has nulls)
print("\nMissing Values:", missing_data, sep="\n")



Missing Values:
Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


In [13]:
# List every column label in the dataset
column_labels = data.columns
print(column_labels)


Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city',
       'statezip', 'country'],
      dtype='object')


In [18]:
from sklearn.model_selection import train_test_split

# Separate the house-price target from the remaining columns
target = data['price']
features = data.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition
for split_label, split_frame in (
    ("Training set shape:", X_train),
    ("Testing set shape:", X_test),
):
    print(split_label, split_frame.shape)


Training set shape: (3680, 17)
Testing set shape: (920, 17)


In [20]:
import numpy as np

# Offset added before taking the log so that a price of exactly zero does not
# produce -inf. It does not help with negative prices, which still yield NaN.
PRICE_LOG_OFFSET = 1e-8

# Log-transform the price WITHOUT modifying the 'price' column. The earlier
# in-place `data['price'] += 1e-8` shifted the target a little further every
# time this cell was re-run; applying the offset on the fly keeps the cell
# idempotent and gives the same log_price as a single run of the old code.
data['log_price'] = np.log(data['price'] + PRICE_LOG_OFFSET)

# Engineered features
data['total_area'] = data['sqft_above'] + data['sqft_basement']
# NOTE: despite its name, this column is the year parsed from the first four
# characters of `date`; it is not the construction year stored in `yr_built`.
# The column name is kept unchanged so existing references keep working.
data['year_built'] = data['date'].str[:4].astype('int64')
# pd.cut intervals are right-closed by default, so a value of exactly 0
# falls outside the first bin (0, 1] and is assigned NaN.
data['bathrooms_bin'] = pd.cut(data['bathrooms'], bins=[0, 1, 2, 3, np.inf], labels=['1', '2', '3', '4+'])

# Correlate numeric columns only: text columns (date, street, city, ...) and
# the categorical bin cannot be correlated, and calling corr() without
# numeric_only=True warns on pandas 1.5 and raises on pandas 2.x.
corr_matrix = data.corr(numeric_only=True)

# Keep the ten columns most correlated (in absolute value) with price.
# NOTE: this ranking includes 'price' itself and 'log_price', which is derived
# from the target and therefore must not be used as a predictor.
relevant_features = corr_matrix['price'].abs().sort_values(ascending=False).index[:10]
selected_data = data[relevant_features]

# Print the updated dataset
print(selected_data.head())


       price  sqft_living  total_area  sqft_above  bathrooms  view  log_price  \
0   313000.0         1340        1340        1340       1.50     0  12.653958   
1  2384000.0         3650        3650        3370       2.50     4  14.684290   
2   342000.0         1930        1930        1930       2.00     0  12.742566   
3   420000.0         2000        2000        1000       2.25     0  12.948010   
4   550000.0         1940        1940        1140       2.50     0  13.217674   

   sqft_basement  bedrooms  floors  
0              0       3.0     1.5  
1            280       5.0     2.0  
2              0       3.0     1.0  
3           1000       3.0     1.0  
4            800       4.0     1.0  


  corr_matrix = data.corr()


In [21]:
# Pearson correlations between the numeric columns; non-numeric columns are skipped
corr_matrix = data.corr(method="pearson", numeric_only=True)


In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Columns that get standardised. Any other column present in X (for example
# log_price) is left out of the model matrix, because ColumnTransformer drops
# columns that are not listed in a transformer by default.
numerical_features = ['sqft_living', 'total_area', 'sqft_above', 'bathrooms', 'view', 'sqft_basement', 'bedrooms', 'floors']

# Create transformer for numerical features
numerical_transformer = StandardScaler()

# Create a column transformer to apply the transformation to the numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features)
    ])

# Separate predictors and target
X = selected_data.drop('price', axis=1)
y = selected_data['price']

# Split BEFORE fitting the scaler. Fitting StandardScaler on the full dataset
# lets the test rows influence the mean/variance used for scaling (data
# leakage). test_size and random_state are unchanged and the number of rows is
# the same, so the same rows should land in each partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply them
# unchanged to the test rows
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Whole dataset transformed with the train-fitted scaler; retained under its
# original name in case another cell still refers to it
X_preprocessed = preprocessor.transform(X)

# Display the shapes of the training and testing sets
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training set shape: (3680, 8)
Testing set shape: (920, 8)


In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Baseline model: a random forest with default settings and a fixed seed,
# fitted on the training split
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train, y_train)

# Predict prices for the held-out rows
y_pred = rf_regressor.predict(X_test)

# Error metrics on the test split (RMSE is the square root of MSE)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report the metrics in the order MSE, RMSE, MAE
for metric_label, metric_value in (
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("Mean Absolute Error (MAE):", mae),
):
    print(metric_label, metric_value)


Mean Squared Error (MSE): 1006143131767.9264
Root Mean Squared Error (RMSE): 1003066.8630594505
Mean Absolute Error (MAE): 231010.5426611742


In [25]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to try: 3 x 3 x 3 = 27 combinations
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Base estimator that the grid search clones for every candidate
rf_regressor = RandomForestRegressor(random_state=42)

# 5-fold cross-validation; the score is the NEGATED mean squared error because
# GridSearchCV always maximises its scoring function
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best parameter values
best_params = grid_search.best_params_

# GridSearchCV uses refit=True by default, so it has already retrained a model
# with the best parameters on the whole training split. Reuse that model
# instead of constructing and fitting an identical forest a second time.
best_rf_regressor = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_rf_regressor.predict(X_test)

# Calculate evaluation metrics for the best model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Print the best parameter values and evaluation metrics
print("Best Parameters:", best_params)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)


Best Parameters: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 300}
Mean Squared Error (MSE): 996458274133.6969
Root Mean Squared Error (RMSE): 998227.5663062491
Mean Absolute Error (MAE): 218343.0702180188
