# Essential libraries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler


# Load / Read dataset

In [2]:
# Data reading / loading
df = pd.read_csv("data.csv")

# Data Exploration
print(f"Number of rows {df.shape[0]}, Number of columns {df.shape[1]}")
print(df.shape)

print("-- Attributes in Data --")
for cols in df.columns:
    print(cols)


Number of rows 4600, Number of columns 18
(4600, 18)
-- Attributes in Data --
date
price
bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
condition
sqft_above
sqft_basement
yr_built
yr_renovated
street
city
statezip
country


In [3]:
df

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,5/2/2014 0:00,3.130000e+05,3.0,1.50,1340.0,7912.0,1.5,0.0,0.0,3,1340.0,0.0,1955.0,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,5/2/2014 0:00,2.384000e+06,5.0,2.50,3650.0,9050.0,2.0,0.0,4.0,5,3370.0,280.0,1921.0,0,709 W Blaine St,Seattle,WA 98119,USA
2,5/2/2014 0:00,3.420000e+05,3.0,2.00,1930.0,11947.0,1.0,0.0,0.0,4,1930.0,0.0,1966.0,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,5/2/2014 0:00,4.200000e+05,3.0,2.25,2000.0,8030.0,1.0,0.0,0.0,4,1000.0,1000.0,1963.0,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,5/2/2014 0:00,5.500000e+05,4.0,2.50,1940.0,10500.0,1.0,0.0,0.0,4,1140.0,800.0,1976.0,1992,9105 170th Ave NE,Redmond,WA 98052,USA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,7/9/2014 0:00,3.081667e+05,3.0,1.75,1510.0,6360.0,1.0,0.0,0.0,4,1510.0,0.0,1954.0,1979,501 N 143rd St,Seattle,WA 98133,USA
4596,7/9/2014 0:00,5.343333e+05,3.0,,1460.0,7573.0,2.0,0.0,0.0,3,1460.0,0.0,1983.0,2009,14855 SE 10th Pl,Bellevue,WA 98007,USA
4597,7/9/2014 0:00,4.169042e+05,3.0,,3010.0,7014.0,2.0,0.0,0.0,3,3010.0,0.0,2009.0,0,759 Ilwaco Pl NE,Renton,WA 98059,USA
4598,7/10/2014 0:00,2.034000e+05,4.0,2.00,2090.0,6630.0,1.0,0.0,0.0,3,1070.0,1020.0,1974.0,0,5148 S Creston St,Seattle,WA 98178,USA


# Handling Missing Values

In [4]:
# Handle missing values

def fill_na_with_mode(df, columns):
    for col in columns:
        df[col] = df[col].fillna(df[col].mode()[0])

def fill_na_with_mean(df, columns):
    for col in columns:
        df[col] = df[col].fillna(df[col].mean())

# Fill missing values in categorical and numerical columns
categorical_columns = ["street", "city", "statezip", "country"]
numerical_columns = ["price", "sqft_living", "sqft_lot", "sqft_above", "sqft_basement"]

fill_na_with_mode(df, categorical_columns)
fill_na_with_mean(df, numerical_columns)

In [5]:
# Drop columns that are unecessary cols
df.drop(columns=["date", "street"], inplace=True)


In [6]:
# Step 1: Replace non-finite values with appropriate values
df["price"] = df["price"].fillna(0)  # Replace NaN with 0 (example for price)
df["bedrooms"] = df["bedrooms"].fillna(df["bedrooms"].median())
df["bathrooms"] = df["bathrooms"].fillna(df["bathrooms"].median())
df["sqft_living"] = df["sqft_living"].fillna(df["sqft_living"].median())
df["sqft_lot"] = df["sqft_lot"].fillna(df["sqft_lot"].median())
df["sqft_above"] = df["sqft_above"].fillna(df["sqft_above"].median())
df["sqft_basement"] = df["sqft_basement"].fillna(df["sqft_basement"].median())
df["yr_built"] = df["yr_built"].fillna(df["yr_built"].median())

# Replace inf and -inf with a finite value (e.g., the median or 0)
df.replace([np.inf, -np.inf], 0, inplace=True)

# Step 2: Convert to int64
df["price"] = df["price"].astype("int64")
df["bedrooms"] = df["bedrooms"].astype("int64")
df["bathrooms"] = df["bathrooms"].astype("int64")
df["sqft_living"] = df["sqft_living"].astype("int64")
df["sqft_lot"] = df["sqft_lot"].astype("int64")
df["sqft_above"] = df["sqft_above"].astype("int64")
df["sqft_basement"] = df["sqft_basement"].astype("int64")
df["yr_built"] = df["yr_built"].astype("int64")


# Encoding

In [7]:
# Encode categorical features using Label Encoding
def encode_categorical_columns(df, columns):
    for col in columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

encode_categorical_columns(df, ["city", "statezip", "country"])


# Data Splitting

In [8]:
# Data Splitting
X = df.drop(columns=["price"])
y = df["price"]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)


# Feature Scaling

In [9]:

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Model Selection: Random Forest Regressor

In [10]:

print("-- Training Random Forest Regressor on Training Data --")
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)


-- Training Random Forest Regressor on Training Data --


# Hyperparameter Tuning using GridSearchCV

In [11]:

param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

grid_search = GridSearchCV(estimator=model_rf, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model after hyperparameter tuning
best_model = grid_search.best_estimator_
print(f"Best Model Parameters: {grid_search.best_params_}")


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best Model Parameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}


In [12]:
# Predictions
y_pred = best_model.predict(X_test)


# Accuracy

In [13]:
mae = mean_absolute_error(y_test, y_pred)
mean_actual = y_test.mean()

# Calculate accuracy as a percentage
accuracy = 1 - (mae / mean_actual)
accuracy_percentage = accuracy * 100

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean of Actual Values: {mean_actual}")
print(f"Accuracy: {accuracy_percentage:.2f}%")

Mean Absolute Error (MAE): 163431.44683694976
Mean of Actual Values: 580421.8565217392
Accuracy: 71.84%


In [14]:
pickle.dump(best_model, open("random_forest_model.pkl", "wb"))