In [1]:
# Dependencies
import pandas as pd
import numpy as np
import os
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from math import sqrt
from tqdm import tqdm
import time
from threading import Thread
from multiprocessing import Process, Queue

In [2]:
# Loading Data
raw_data = pd.read_csv('train.csv', index_col="Id")

In [3]:
# Cleaning Data

def remove_outliers(df: pd.DataFrame) -> pd.DataFrame:

    numerical_columns = cleaned_data.select_dtypes(include=[np.number]).columns
    
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    cleaned_df = df[(df[numerical_columns] >= lower_bound) & (df[numerical_columns] <= upper_bound)]
    cleaned_df = cleaned_df.dropna()
    return cleaned_df

def remove_skew(data: pd.DataFrame) -> pd.DataFrame:
    df = data.copy()
    for col in df.columns:
        skew_val = df[col].skew()
        if skew_val > 0:
            if df[col].min() <= 0:
                shift = abs(df[col].min()) + 1
                df[col] = np.log(df[col] + shift)
            else:
                df[col] = np.log(df[col])
        elif skew_val < 0:
            if df[col].min() < 0:
                shift = abs(df[col].min())
                df[col] = np.sqrt(df[col] + shift)
            else:
                df[col] = np.sqrt(df[col])
    return df

columns_to_delete = ['MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', "PoolArea", "BsmtFinType2", "2ndFlrSF", "BsmtFullBath", "BedroomAbvGr", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch"]
cleaned_data = raw_data.drop(columns=columns_to_delete, axis=1)

numerical_columns = cleaned_data.select_dtypes(include=[np.number]).columns
categorical_columns = cleaned_data.select_dtypes(include=["object"]).columns

cleaned_data[numerical_columns] = remove_outliers(cleaned_data[numerical_columns])
cleaned_data[numerical_columns] = remove_skew(cleaned_data[numerical_columns])


mode_values = cleaned_data[categorical_columns].mode().iloc[0]
cleaned_data.loc[:, categorical_columns] = cleaned_data.loc[:, categorical_columns].fillna(mode_values)

mean_values = cleaned_data[numerical_columns].mean()
cleaned_data.loc[:, numerical_columns] = cleaned_data.loc[:, numerical_columns].fillna(mean_values)

cleaned_data.drop_duplicates(inplace=True)

In [4]:
# Splitting Data
X = cleaned_data.drop('SalePrice', axis=1)
y = cleaned_data['SalePrice']

label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column in categorical_columns:
    X[column] = label_encoders[column].fit_transform(X[column])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30, random_state=42)

X_train_filled = X_train.fillna(X_train.mean())
X_val_filled = X_val.fillna(X_val.mean())

(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

((1013, 61), (435, 61), (1013,), (435,))

In [5]:
# Sequential Processing
n_estimators_range = [10, 25, 50, 100, 200, 300, 400]
max_features_range = ['sqrt', 'log2', None]  # None means using all features
max_depth_range = [1, 2, 5, 10, 20, None]  # None means no limit

start_time = time.time()

best_rmse = float('inf')
best_mape = float('inf')
best_model = None
best_parameters = {}

for n_estimators in tqdm(n_estimators_range):
    for max_features in max_features_range:
        for max_depth in max_depth_range:
            rf_model = RandomForestRegressor(
                n_estimators=n_estimators,
                max_features=max_features,
                max_depth=max_depth,
                random_state=42
            )
            rf_model.fit(X_train_filled, y_train)
            
            # Make predictions and compute RMSE
            y_val_pred = rf_model.predict(X_val_filled)
            rmse = sqrt(mean_squared_error(y_val, y_val_pred))
            # Compute MAPE
            mape = mean_absolute_percentage_error(y_val, y_val_pred) * 100
            print(f"The parameters: {n_estimators}, {max_features}, {max_depth}. RMSE: {rmse}, MAPE: {mape}%")

            if rmse < best_rmse:
                best_rmse = rmse
                best_mape = mape
                best_model = rf_model
                best_parameters = {
                    'n_estimators': n_estimators,
                    'max_features': max_features,
                    'max_depth': max_depth
                }
print(f"The best parameters {best_params}\nRMSE = {best_rmse}\nMAPE = {best_mape}%")
end_time = time.time()
sequential_time = end_time - start_time
print(f"The sequential execution time is {sequential_time}")

  0%|          | 0/7 [00:00<?, ?it/s]

The parameters: 10, sqrt, 1. RMSE: 0.16887545882328203, MAPE: 0.7048899662283336%
The parameters: 10, sqrt, 2. RMSE: 0.1344126757225613, MAPE: 0.540630519898148%
The parameters: 10, sqrt, 5. RMSE: 0.09407476521178056, MAPE: 0.34119417270018954%
The parameters: 10, sqrt, 10. RMSE: 0.08476446594160225, MAPE: 0.2989022685949839%
The parameters: 10, sqrt, 20. RMSE: 0.08451107555404473, MAPE: 0.29247375061733444%
The parameters: 10, sqrt, None. RMSE: 0.08451107555404473, MAPE: 0.29247375061733444%
The parameters: 10, log2, 1. RMSE: 0.1712972985739615, MAPE: 0.707562666495215%
The parameters: 10, log2, 2. RMSE: 0.13551161070916118, MAPE: 0.5533931335404082%
The parameters: 10, log2, 5. RMSE: 0.09649029606846798, MAPE: 0.3487186521387578%
The parameters: 10, log2, 10. RMSE: 0.0920833256661376, MAPE: 0.3129570943196805%
The parameters: 10, log2, 20. RMSE: 0.08971370975882881, MAPE: 0.31772550927157206%
The parameters: 10, log2, None. RMSE: 0.08940843805661373, MAPE: 0.31510066341702125%
The pa

 14%|█▍        | 1/7 [00:00<00:01,  3.76it/s]

The parameters: 10, None, 20. RMSE: 0.08333113654876813, MAPE: 0.3120727052790386%
The parameters: 10, None, None. RMSE: 0.08333113654876813, MAPE: 0.3120727052790386%
The parameters: 25, sqrt, 1. RMSE: 0.16676302488516653, MAPE: 0.6895365879113001%
The parameters: 25, sqrt, 2. RMSE: 0.12960729520527625, MAPE: 0.5012942877996519%
The parameters: 25, sqrt, 5. RMSE: 0.09207704529547207, MAPE: 0.32351260258309283%
The parameters: 25, sqrt, 10. RMSE: 0.08021566521506387, MAPE: 0.27849874414935577%
The parameters: 25, sqrt, 20. RMSE: 0.07928464427348063, MAPE: 0.28167416653848404%
The parameters: 25, sqrt, None. RMSE: 0.07914750772070804, MAPE: 0.2813219223680187%
The parameters: 25, log2, 1. RMSE: 0.17117453673758604, MAPE: 0.7052304555636381%
The parameters: 25, log2, 2. RMSE: 0.13319268423290404, MAPE: 0.5478883995444657%
The parameters: 25, log2, 5. RMSE: 0.09594935008508815, MAPE: 0.3446207280206574%
The parameters: 25, log2, 10. RMSE: 0.087403684627813, MAPE: 0.29216741724310147%
The 

 29%|██▊       | 2/7 [00:00<00:02,  2.15it/s]

The parameters: 25, None, None. RMSE: 0.08007635132081034, MAPE: 0.28606852946776984%
The parameters: 50, sqrt, 1. RMSE: 0.16788643350042706, MAPE: 0.6978818304213996%
The parameters: 50, sqrt, 2. RMSE: 0.12799268610620812, MAPE: 0.4994106630612974%
The parameters: 50, sqrt, 5. RMSE: 0.0915346659675952, MAPE: 0.3200505332660564%
The parameters: 50, sqrt, 10. RMSE: 0.08352400791726335, MAPE: 0.2799140215969263%
The parameters: 50, sqrt, 20. RMSE: 0.08167702625785542, MAPE: 0.2783969564823481%
The parameters: 50, sqrt, None. RMSE: 0.0809629783187925, MAPE: 0.2759878665287946%
The parameters: 50, log2, 1. RMSE: 0.1753998391611537, MAPE: 0.7472315105881182%
The parameters: 50, log2, 2. RMSE: 0.13443104051840474, MAPE: 0.5430075464802198%
The parameters: 50, log2, 5. RMSE: 0.09491985499936398, MAPE: 0.34283983040619564%
The parameters: 50, log2, 10. RMSE: 0.0844097046862017, MAPE: 0.2859904543615162%
The parameters: 50, log2, 20. RMSE: 0.08499008891970065, MAPE: 0.2879287040051233%
The para

 43%|████▎     | 3/7 [00:02<00:03,  1.29it/s]

The parameters: 50, None, None. RMSE: 0.07893303708500507, MAPE: 0.27561899394557354%
The parameters: 100, sqrt, 1. RMSE: 0.1674507745870075, MAPE: 0.7101112982854306%
The parameters: 100, sqrt, 2. RMSE: 0.12559135397038754, MAPE: 0.4788747592356197%
The parameters: 100, sqrt, 5. RMSE: 0.08945419409108653, MAPE: 0.3122439329108258%
The parameters: 100, sqrt, 10. RMSE: 0.08125393083311382, MAPE: 0.27547768262820116%
The parameters: 100, sqrt, 20. RMSE: 0.08021575236494055, MAPE: 0.27389465620573317%
The parameters: 100, sqrt, None. RMSE: 0.07989257961957874, MAPE: 0.27227486130679385%
The parameters: 100, log2, 1. RMSE: 0.1730383055811066, MAPE: 0.7446603989722339%
The parameters: 100, log2, 2. RMSE: 0.13244316052490698, MAPE: 0.5291368002370073%
The parameters: 100, log2, 5. RMSE: 0.0928463479776819, MAPE: 0.32752832333295695%
The parameters: 100, log2, 10. RMSE: 0.08324437903914232, MAPE: 0.2825717805975063%
The parameters: 100, log2, 20. RMSE: 0.08230175150410236, MAPE: 0.28142595003

 57%|█████▋    | 4/7 [00:04<00:04,  1.38s/it]

The parameters: 100, None, None. RMSE: 0.07676117420016221, MAPE: 0.27228053026303684%
The parameters: 200, sqrt, 1. RMSE: 0.16461451784716483, MAPE: 0.6822200315549887%
The parameters: 200, sqrt, 2. RMSE: 0.12346671062657838, MAPE: 0.4664024721303316%
The parameters: 200, sqrt, 5. RMSE: 0.0889409317882488, MAPE: 0.3088609184042812%
The parameters: 200, sqrt, 10. RMSE: 0.07963122038144103, MAPE: 0.27156583367356457%
The parameters: 200, sqrt, 20. RMSE: 0.08043700178175968, MAPE: 0.2725902940829648%
The parameters: 200, sqrt, None. RMSE: 0.08034147828017824, MAPE: 0.27182853518545436%
The parameters: 200, log2, 1. RMSE: 0.16929912573755523, MAPE: 0.7068619080241239%
The parameters: 200, log2, 2. RMSE: 0.12937771920110006, MAPE: 0.5042626619686496%
The parameters: 200, log2, 5. RMSE: 0.09201952777856183, MAPE: 0.31975444313461054%
The parameters: 200, log2, 10. RMSE: 0.08299590498942215, MAPE: 0.279507939652177%
The parameters: 200, log2, 20. RMSE: 0.082142832128671, MAPE: 0.279200191744

 71%|███████▏  | 5/7 [00:08<00:05,  2.52s/it]

The parameters: 200, None, None. RMSE: 0.07657510363285479, MAPE: 0.27222165542874815%
The parameters: 300, sqrt, 1. RMSE: 0.1648455581852619, MAPE: 0.6795434312614116%
The parameters: 300, sqrt, 2. RMSE: 0.12349680994458354, MAPE: 0.46509326741768003%
The parameters: 300, sqrt, 5. RMSE: 0.08881837358774558, MAPE: 0.3099411111810699%
The parameters: 300, sqrt, 10. RMSE: 0.07925131858731489, MAPE: 0.2710507068278926%
The parameters: 300, sqrt, 20. RMSE: 0.07958759117811016, MAPE: 0.26961063730752366%
The parameters: 300, sqrt, None. RMSE: 0.07944221336935055, MAPE: 0.2686581515293154%
The parameters: 300, log2, 1. RMSE: 0.16942791963684156, MAPE: 0.7071357543344426%
The parameters: 300, log2, 2. RMSE: 0.13046242125169508, MAPE: 0.5151006598212483%
The parameters: 300, log2, 5. RMSE: 0.09246771586479145, MAPE: 0.32188099973810014%
The parameters: 300, log2, 10. RMSE: 0.08375577622492537, MAPE: 0.2810680232149154%
The parameters: 300, log2, 20. RMSE: 0.08205000695740979, MAPE: 0.277855295

 86%|████████▌ | 6/7 [00:15<00:03,  3.95s/it]

The parameters: 300, None, None. RMSE: 0.07634520408622658, MAPE: 0.27233851195175895%
The parameters: 400, sqrt, 1. RMSE: 0.16460797564546525, MAPE: 0.6775145771324852%
The parameters: 400, sqrt, 2. RMSE: 0.12350082688647207, MAPE: 0.4650610985789084%
The parameters: 400, sqrt, 5. RMSE: 0.08891943252079391, MAPE: 0.31016818750882197%
The parameters: 400, sqrt, 10. RMSE: 0.07947659797247365, MAPE: 0.2714653608530758%
The parameters: 400, sqrt, 20. RMSE: 0.07966400432959692, MAPE: 0.270478950081561%
The parameters: 400, sqrt, None. RMSE: 0.07949506858830116, MAPE: 0.26960576976123507%
The parameters: 400, log2, 1. RMSE: 0.1691165436061864, MAPE: 0.7067862063612407%
The parameters: 400, log2, 2. RMSE: 0.1301704827510147, MAPE: 0.515090570408518%
The parameters: 400, log2, 5. RMSE: 0.09283583715793904, MAPE: 0.32336678679463643%
The parameters: 400, log2, 10. RMSE: 0.08334011606127427, MAPE: 0.27972406184155446%
The parameters: 400, log2, 20. RMSE: 0.08199398245711215, MAPE: 0.27700485492

100%|██████████| 7/7 [00:24<00:00,  3.51s/it]

The parameters: 400, None, None. RMSE: 0.0763511122106873, MAPE: 0.2716301562861062%
The best parameters {'n_estimators': 300, 'max_features': None, 'max_depth': None} for RMSE = 0.07634520408622658, MAPE: 0.2716301562861062%
The sequential execution time is 24.57148504257202





In [6]:
# Threading Processing

def test_model(n_estimators= 100, 
                   max_features='auto', 
                   max_depth= None, 
                   results=[], 
                   index= 0) -> None:
    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_features=max_features,
        max_depth=max_depth,
        random_state=42
    )
    rf_model.fit(X_train_filled, y_train)
    
    y_val_pred = rf_model.predict(X_val_filled)
    rmse = sqrt(mean_squared_error(y_val, y_val_pred))
    mape = mean_absolute_percentage_error(y_val, y_val_pred) * 100
    
    results[index] = (n_estimators, max_features, max_depth, rmse, mape)

start_time = time.time()

results = [None] * (len(n_estimators_range) * len(max_features_range) * len(max_depth_range))

threads = []
index = 0
for n_estimators in n_estimators_range:
    for max_features in max_features_range:
        for max_depth in max_depth_range:
            thread = Thread(target=test_model, args=(n_estimators,
                                                         max_features,
                                                         max_depth,
                                                         results,
                                                         index))
            threads.append(thread)
            thread.start()
            index += 1

for thread in tqdm(threads, desc="Waiting for threads to finish: "):
    thread.join()

best_params = None
best_rmse = float('inf')
best_mape = float('inf')
for n_estimators, max_features, max_depth, rmse, mape in tqdm(results, desc="Anlyzing results: "):
    if rmse < best_rmse:
        best_rmse = rmse
        best_mape = mape
        best_params = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth
        }

print(f"The best parameters {best_params}\nRMSE = {best_rmse}\nMAPE = {best_mape}%")
end_time = time.time()
threading_time = end_time - start_time
print(f"Thread Parallel execution time is {threading_time}")

Waiting for threads to finish: 100%|██████████| 126/126 [00:08<00:00, 14.76it/s]
Anlyzing results: 100%|██████████| 126/126 [00:00<00:00, 2032624.25it/s]

The best parameters {'n_estimators': 300, 'max_features': None, 'max_depth': None}
RMSE = 0.07634520408622658
MAPE = 0.27233851195175895%
Thread Parallel execution time is 10.38661503791809





In [18]:
# MultiProcess Processing

def proc_model(n_estimators= 100, 
                   max_features= 'auto', 
                   max_depth= 1, 
                   result_queue=None) -> None:

    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_features=max_features,
        max_depth=max_depth,
        random_state=42
    )
    rf_model.fit(X_train_filled, y_train)
    

    y_val_pred = rf_model.predict(X_val_filled)
    rmse = sqrt(mean_squared_error(y_val, y_val_pred))
    mape = mean_absolute_percentage_error(y_val, y_val_pred) * 100
    

    result_dictionary = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'rmse': rmse,
        'mape': mape
    }
    
    if result_queue is not None:
        result_queue.put(result_dictionary)

start_time = time.time()  

processes = []
queue = Queue()

for n_estimators in n_estimators_range:
    for max_features in max_features_range:
        for max_depth in max_depth_range:
            file_path = f'results_{n_estimators}_{max_features}_{max_depth}.json'
            process = Process(target=proc_model,
                              args=(n_estimators,
                                    max_features,
                                    max_depth,
                                    queue))
            processes.append(process)
            process.start()


for process in tqdm(processes, desc="Waiting for processes to finish: "):
    process.join()


results = []
while not queue.empty():
    results.append(queue.get())

best_params = None
best_rmse = float('inf')
best_mape = float('inf')

for res in results:
    if res['rmse'] < best_rmse:
        best_rmse = res['rmse']
        best_mape = res['mape']
        best_params = {
            'n_estimators': res['n_estimators'],
            'max_features': res['max_features'],
            'max_depth': res['max_depth']
        }

print(f"The best parameters {best_params}\nRMSE = {best_rmse}\nMAPE = {best_mape}%")

end_time = time.time()
processes_time = end_time - start_time
print(f"The processes parallel execution time is {processes_time}")

non_parallel_end = time.time()

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/multiprocessing/spawn.py", line 132, in _main
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'proc_model' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
spawn.py", line 122, in spawn_main.12/multiprocessing/

The best parameters None
RMSE = inf
MAPE = inf%
The processes parallel execution time is 1.5244958400726318





In [None]:
# Metric Analysis