In [1]:
# Dependencies
import pandas as pd
import numpy as np
import os
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from math import sqrt
from tqdm import tqdm
import time
from threading import Thread
from multiprocessing import Process, Queue
import json

In [2]:
# Loading Data
raw_data = pd.read_csv('train.csv', index_col="Id")

In [3]:
# Cleaning Data

def remove_outliers(df: pd.DataFrame) -> pd.DataFrame:

    numerical_columns = cleaned_data.select_dtypes(include=[np.number]).columns
    
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    cleaned_df = df[(df[numerical_columns] >= lower_bound) & (df[numerical_columns] <= upper_bound)]
    cleaned_df = cleaned_df.dropna()
    return cleaned_df

def remove_skew(data: pd.DataFrame) -> pd.DataFrame:
    df = data.copy()
    for col in df.columns:
        skew_val = df[col].skew()
        if skew_val > 0:
            if df[col].min() <= 0:
                shift = abs(df[col].min()) + 1
                df[col] = np.log(df[col] + shift)
            else:
                df[col] = np.log(df[col])
        elif skew_val < 0:
            if df[col].min() < 0:
                shift = abs(df[col].min())
                df[col] = np.sqrt(df[col] + shift)
            else:
                df[col] = np.sqrt(df[col])
    return df

columns_to_delete = ['MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', "PoolArea", "BsmtFinType2", "2ndFlrSF", "BsmtFinSF1", "1stFlrSF", "HalfBath", "WoodDeckSF","GrLivArea", "BsmtFullBath", "BedroomAbvGr", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch"]
cleaned_data = raw_data.drop(columns=columns_to_delete, axis=1)

numerical_columns = cleaned_data.select_dtypes(include=[np.number]).columns
categorical_columns = cleaned_data.select_dtypes(include=["object"]).columns

cleaned_data[numerical_columns] = remove_outliers(cleaned_data[numerical_columns])
cleaned_data[numerical_columns] = remove_skew(cleaned_data[numerical_columns])


mode_values = cleaned_data[categorical_columns].mode().iloc[0]
cleaned_data.loc[:, categorical_columns] = cleaned_data.loc[:, categorical_columns].fillna(mode_values)

mean_values = cleaned_data[numerical_columns].mean()
cleaned_data.loc[:, numerical_columns] = cleaned_data.loc[:, numerical_columns].fillna(mean_values)

cleaned_data.drop_duplicates(inplace=True)

In [4]:
# Splitting Data
X = cleaned_data.drop('SalePrice', axis=1)
y = cleaned_data['SalePrice']

label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column in categorical_columns:
    X[column] = label_encoders[column].fit_transform(X[column])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30, random_state=42)

X_train_filled = X_train.fillna(X_train.mean())
X_val_filled = X_val.fillna(X_val.mean())

(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

((1013, 56), (435, 56), (1013,), (435,))

In [5]:
# Sequential Processing
n_estimators_range = [10, 25, 50, 100, 200, 300, 400]
max_features_range = ['sqrt', 'log2', None]
max_depth_range = [1, 2, 5, 10, 20, None]

start_time = time.time()

best_rmse = float('inf')
best_mape = float('inf')
best_model = None
best_params = {}

for n_estimators in tqdm(n_estimators_range):
    for max_features in max_features_range:
        for max_depth in max_depth_range:
            rf_model = RandomForestRegressor(
                n_estimators=n_estimators,
                max_features=max_features,
                max_depth=max_depth,
                random_state=42
            )
            rf_model.fit(X_train_filled, y_train)
            
            # Make predictions and compute RMSE
            y_val_pred = rf_model.predict(X_val_filled)
            rmse = sqrt(mean_squared_error(y_val, y_val_pred))
            # Compute MAPE
            mape = mean_absolute_percentage_error(y_val, y_val_pred) * 100

            if rmse < best_rmse:
                best_rmse = rmse
                best_mape = mape
                best_model = rf_model
                best_params = {
                    'n_estimators': n_estimators,
                    'max_features': max_features,
                    'max_depth': max_depth
                }
print(f"The best parameters {best_params}\nRMSE = {best_rmse}\nMAPE = {best_mape}%")
end_time = time.time()
sequential_time = end_time - start_time
print(f"The sequential execution time is {sequential_time}")

100%|██████████████████████████████████████████████████████████████████████████████████| 7/7 [00:33<00:00,  4.84s/it]

The best parameters {'n_estimators': 50, 'max_features': None, 'max_depth': 10}
RMSE = 0.08475464314752809
MAPE = 0.315009643239149%
The sequential execution time is 33.902761936187744





In [6]:
# Threading Processing
non_par_thread_start = time.time()
def test_model(n_estimators= 100, 
                   max_features='auto', 
                   max_depth= None, 
                   results=[], 
                   index= 0) -> None:
    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_features=max_features,
        max_depth=max_depth,
        random_state=42
    )
    rf_model.fit(X_train_filled, y_train)
    
    y_val_pred = rf_model.predict(X_val_filled)
    rmse = sqrt(mean_squared_error(y_val, y_val_pred))
    mape = mean_absolute_percentage_error(y_val, y_val_pred) * 100
    
    results[index] = (n_estimators, max_features, max_depth, rmse, mape)

start_time = time.time()

results = [None] * (len(n_estimators_range) * len(max_features_range) * len(max_depth_range))

threads = []
index = 0
for n_estimators in n_estimators_range:
    for max_features in max_features_range:
        for max_depth in max_depth_range:
            thread = Thread(target=test_model, args=(n_estimators,
                                                         max_features,
                                                         max_depth,
                                                         results,
                                                         index))
            threads.append(thread)
            thread.start()
            index += 1

for thread in tqdm(threads, desc="Waiting for threads to finish: "):
    thread.join()

best_params = None
best_rmse = float('inf')
best_mape = float('inf')
for n_estimators, max_features, max_depth, rmse, mape in tqdm(results, desc="Anlyzing results: "):
    if rmse < best_rmse:
        best_rmse = rmse
        best_mape = mape
        best_params = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth
        }

print(f"The best parameters {best_params}\nRMSE = {best_rmse}\nMAPE = {best_mape}%")
end_time = time.time()
threading_time = end_time - start_time
print(f"Thread Parallel execution time is {threading_time}")
non_par_thread_end = time.time()
non_par_thread = non_par_thread_end - non_par_thread_start

Waiting for threads to finish: 100%|███████████████████████████████████████████████| 126/126 [00:18<00:00,  6.96it/s]
Anlyzing results: 100%|███████████████████████████████████████████████████████| 126/126 [00:00<00:00, 1311370.48it/s]

The best parameters {'n_estimators': 50, 'max_features': None, 'max_depth': 10}
RMSE = 0.08475464314752809
MAPE = 0.315009643239149%
Thread Parallel execution time is 22.072564363479614





In [7]:
non_par_proc_start = time.time()
def proc_model(n_estimators= 100, 
                   max_features= 'auto', 
                   max_depth= 1, 
                   result_queue=None) -> None:

    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_features=max_features,
        max_depth=max_depth,
        random_state=42
    )
    rf_model.fit(X_train_filled, y_train)
    

    y_val_pred = rf_model.predict(X_val_filled)
    rmse = sqrt(mean_squared_error(y_val, y_val_pred))
    mape = mean_absolute_percentage_error(y_val, y_val_pred) * 100
    

    result_dictionary = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'rmse': rmse,
        'mape': mape
    }
    
    if result_queue is not None:
        result_queue.put(result_dictionary)

In [8]:
# MultiProcess Processing

start_time = time.time()  

processes = []
queue = Queue()

for n_estimators in n_estimators_range:
    for max_features in max_features_range:
        for max_depth in max_depth_range:
            file_path = f'results_{n_estimators}_{max_features}_{max_depth}.json'
            process = Process(target=proc_model,
                              args=(n_estimators,
                                    max_features,
                                    max_depth,
                                    queue))
            processes.append(process)
            process.start()


for process in tqdm(processes, desc="Waiting for processes to finish: "):
    process.join()


results = []
while not queue.empty():
    results.append(queue.get())

best_params = None
best_rmse = float('inf')
best_mape = float('inf')

for res in results:
    if res['rmse'] < best_rmse:
        best_rmse = res['rmse']
        best_mape = res['mape']
        best_params = {
            'n_estimators': res['n_estimators'],
            'max_features': res['max_features'],
            'max_depth': res['max_depth']
        }

print(f"The best parameters {best_params}\nRMSE = {best_rmse}\nMAPE = {best_mape}%")

end_time = time.time()
processes_time = end_time - start_time
print(f"The processes parallel execution time is {processes_time}")

non_par_proc_end = time.time()
non_par_proc = non_par_proc_end - non_par_proc_start

Waiting for processes to finish: 100%|█████████████████████████████████████████████| 126/126 [00:03<00:00, 41.17it/s]

The best parameters {'n_estimators': 50, 'max_features': None, 'max_depth': 10}
RMSE = 0.08475464314752809
MAPE = 0.315009643239149%
The processes parallel execution time is 7.320642471313477





In [9]:
# Metric Analysis

In [10]:
number_of_cpus = os.cpu_count()

speedup_threading = sequential_time / threading_time
print(f"Threading speedup: {speedup_threading}.")

Threading speedup: 1.5359684256842356.


In [11]:
speedup_processes = sequential_time / processes_time
print(f"Multiprocessing Speedup:  {speedup_processes}.")

Multiprocessing Speedup:  4.631118384627911.


In [12]:
efficiency_threading = speedup_threading / number_of_cpus
print(f"Threading efficiency: {efficiency_threading}.")

Threading efficiency: 0.2559947376140393.


In [13]:
efficiency_processes = speedup_processes / number_of_cpus
print(f"Processing Efficiency: {efficiency_processes}.")

Processing Efficiency: 0.7718530641046518.


In [14]:
non_parallel_time = non_par_thread
parallel_portion = 1 - (non_parallel_time / sequential_time)
S_amdhal = 1 / (1 - parallel_portion)
print(f"Amdhal Threading speedup: {S_amdhal}.")

Amdhal Threading speedup: 1.535955484901813.


In [15]:
S_amdhal = 1 / ((1 - parallel_portion) + (parallel_portion / number_of_cpus))
print(f"New Amdhal Threading speedup: {S_amdhal}.")

New Amdhal Threading speedup: 1.4100054583755053.


In [16]:
S_gustafson = (1 - parallel_portion) + (parallel_portion * number_of_cpus)
print(f"Gustafson Threading speedup: {S_gustafson}.")

Gustafson Threading speedup: 2.7446973241417685.


In [17]:
non_parallel_time = non_par_proc
parallel_portion = 1 - (non_parallel_time / sequential_time)
S_amdhal = 1 / (1 - parallel_portion)
print(f"Amdhal Process speedup: {S_amdhal}.")

Amdhal Process speedup: 4.628030016771092.


In [18]:
S_amdhal = 1 / ((1 - parallel_portion) + (parallel_portion / number_of_cpus))
print(f"New Amdhal Process speedup: {S_amdhal}.")

New Amdhal Process speedup: 2.8840977907481675.


In [19]:
S_gustafson = (1 - parallel_portion) + (parallel_portion * number_of_cpus)
print(f"Gustafson Process speedup: {S_gustafson}.")

Gustafson Process speedup: 4.919626713335704.
