In [1]:
# Importing the required packages
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Function to import the dataset using url
def load_housing_data():
    url = ("https://raw.githubusercontent.com/abdelaziztestas/spark_book/main/housing.csv")
    return pd.read_csv(url)

# Calling the loading function
pandas_df = load_housing_data()

In [3]:
print(f"Shape of the Datatset: {pandas_df.shape}")

Shape of the Datatset: (545, 13)


In [4]:
print(f"Name of the columns: {pandas_df.columns}")

Name of the columns: Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')


In [5]:
print(f"Data types of each column: \n{pandas_df.dtypes}")

Data types of each column: 
price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories              int64
mainroad            object
guestroom           object
basement            object
hotwaterheating     object
airconditioning     object
parking              int64
prefarea            object
furnishingstatus    object
dtype: object


In [6]:
# unique values in the columns with object data type
for col in ["mainroad", "guestroom", "basement", "hotwaterheating",
            "airconditioning", "prefarea", "furnishingstatus"]:
    unique_values = pandas_df[col].unique()
    print(f"Unique Values in {col} is: {unique_values}")

Unique Values in mainroad is: ['yes' 'no']
Unique Values in guestroom is: ['no' 'yes']
Unique Values in basement is: ['no' 'yes']
Unique Values in hotwaterheating is: ['no' 'yes']
Unique Values in airconditioning is: ['yes' 'no']
Unique Values in prefarea is: ['yes' 'no']
Unique Values in furnishingstatus is: ['furnished' 'semi-furnished' 'unfurnished']


In [7]:
# Splitting the data into X and y datasets
X = pandas_df.drop("price", axis = 1)
y = pandas_df["price"]

In [8]:
# Seperating the categorical columns
cat_col = ["mainroad", "guestroom", "basement", "hotwaterheating", 
           "airconditioning", "prefarea", "furnishingstatus"]

# Initialing the OneHotEncoder
onehot_encoder = OneHotEncoder(sparse_output = False)

# Applying the OneHotEncoder on the categorical columns
X_encoded = onehot_encoder.fit_transform(X[cat_col])

# Turning encoded columns into a DataFrame 
X_encoded_df = pd.DataFrame(X_encoded, columns = onehot_encoder.get_feature_names_out(cat_col))

In [9]:
# Dropping the Categorical columns from the X dataset
X = X.drop(cat_col, axis = 1)

# Concatenating the X and X_encoded dataframe
X = pd.concat([X, X_encoded_df], axis = 1)

In [10]:
print(f"New shape of the dataset: {X.shape}")

New shape of the dataset: (545, 20)


In [11]:
print(f"New columns of the dataset: {X.columns}")

New columns of the dataset: Index(['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'mainroad_no',
       'mainroad_yes', 'guestroom_no', 'guestroom_yes', 'basement_no',
       'basement_yes', 'hotwaterheating_no', 'hotwaterheating_yes',
       'airconditioning_no', 'airconditioning_yes', 'prefarea_no',
       'prefarea_yes', 'furnishingstatus_furnished',
       'furnishingstatus_semi-furnished', 'furnishingstatus_unfurnished'],
      dtype='object')


In [12]:
# Splitting the dataset into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

In [13]:
# initialising the random forest model
random_forest_model = RandomForestRegressor()

# Training the model on the X_train and y_train dataset
random_forest_model.fit(X_train, y_train)

In [14]:
# Predicting the values for X_test 
y_pred = random_forest_model.predict(X_test)

In [15]:
# Calculating the r2_score
r2 = r2_score(y_test, y_pred)

# Calculating the root mean squared error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [16]:
print(f"r2_score: {r2}")
print(f"Root Mean Squared Error: {rmse}")

r2_score: 0.6389285407157679
Root Mean Squared Error: 1007580.0039849392


In [17]:
# Number of the trees in the random forest
print(f"Number of trees that are made by the Random Forest Model: {random_forest_model.n_estimators}")

Number of trees that are made by the Random Forest Model: 100


In [18]:
importance = random_forest_model.feature_importances_
feature_names = X.columns
indices = np.argsort(importance)[::-1]
print("Feature Importance:")
for i in indices:
    print(f"{feature_names[i]}: {importance[i]}")

Feature Importance:
area: 0.4674872218428287
bathrooms: 0.1388421974304244
stories: 0.06341673105718494
parking: 0.05709638483099567
bedrooms: 0.03713834157911944
furnishingstatus_unfurnished: 0.035646475147997216
airconditioning_no: 0.03313536914669941
airconditioning_yes: 0.023262673778508082
prefarea_no: 0.018302679417039433
basement_no: 0.01708171884781921
prefarea_yes: 0.016062719468278787
furnishingstatus_furnished: 0.015310019238969373
basement_yes: 0.015123885233059586
furnishingstatus_semi-furnished: 0.011403490691154346
guestroom_no: 0.010908611882047246
hotwaterheating_no: 0.010746816357674063
guestroom_yes: 0.010315867023947677
hotwaterheating_yes: 0.009651239893326339
mainroad_no: 0.004567124124668921
mainroad_yes: 0.0045004330082572585


In [19]:
tree_depths = [estimator.tree_.max_depth for estimator in random_forest_model.estimators_]

print("Depth of each tree: ")
for i in range(len(tree_depths)):
    if i < len(tree_depths) - 1:
        print(tree_depths[i], end = ", ")
        if i % 25 == 0 and i != 0:
            print("\n")
    else:
        print(tree_depths[i])

Depth of each tree: 
16, 17, 16, 18, 20, 18, 17, 19, 16, 18, 16, 16, 18, 20, 17, 17, 17, 18, 18, 15, 19, 18, 17, 17, 20, 16, 

18, 18, 16, 23, 17, 15, 20, 17, 14, 24, 18, 18, 21, 18, 18, 16, 16, 18, 18, 19, 17, 22, 18, 16, 18, 

20, 17, 18, 19, 17, 16, 19, 20, 19, 18, 14, 21, 17, 15, 17, 19, 20, 16, 16, 17, 19, 19, 18, 17, 16, 

15, 16, 16, 15, 17, 18, 16, 16, 17, 16, 15, 16, 20, 16, 15, 17, 18, 18, 15, 18, 16, 20, 18, 17
