In [6]:
!pip install torch
!pip install seaborn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting seaborn
  Obtaining dependency information for seaborn from https://files.pythonhosted.org/packages/7b/e5/83fcd7e9db036c179e0352bfcd20f81d728197a16f883e7b90307a88e65e/seaborn-0.13.0-py3-none-any.whl.metadata
  Using cached seaborn-0.13.0-py3-none-any.whl.metadata (5.3 kB)
Using cached seaborn-0.13.0-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [23]:
# Standard library
import math
import time
from types import SimpleNamespace

# Load helpers and custom dataset class
from __init__ import PricingWizardDataset, base_regression_pipeline, regression_accuracy, threshold_accuracy

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error

In [11]:
# Load Data
# Build the project dataset wrapper with its default arguments. The call prints
# a summary of what was loaded (dataset name, row/column counts, outlier
# removal flag, train/test sizes and random state), as shown in the cell output.
data = PricingWizardDataset()

Dataset Loaded: post_preprocessing_without_dummies
	Number of Rows: 283055
	Number of Columns: 22
	Outlier Removal: True
	Train Size: 0.8
	Test Size: 0.2
	Random State: 42


In [12]:
# Apply data preparation
# Pass the shared regression preprocessing pipeline to the dataset wrapper,
# printing a marker before and after so the step is visible in the output.
# NOTE(review): the return value is discarded, so apply_function presumably
# updates `data` in place — confirm against the helper's implementation.
print('Applying data preparation...')
data.apply_function(base_regression_pipeline)
print('Done.')

Applying data preparation...
Done.


In [13]:
# Stratified train/test split keyed on the log-price target, with no validation
# subset requested. return_splits=False means nothing is handed back here; later
# cells read the splits from attributes on `data` (X_train, X_test, y_train, y_test).
data.stratify_train_test_split(
    y_column='log_listing_price',
    val_size=0,
    return_splits=False,
)

Dependent variable distribution is equal across all subsets


# Linear SVR

In [19]:
# Candidate values explored during the hyperparameter tuning process:
# C is the regularization strength, epsilon the width of the insensitive tube.
linear_param_dist = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 0.5, 1.0],
}

# dual="auto" lets scikit-learn pick the primal or dual formulation itself.
# random_state pins the solver's data shuffling so repeated runs (and every
# clone made by the search objects) produce the same fit.
linear_svr = LinearSVR(dual="auto", random_state=42)
linear_svr

In [24]:
# Standardize the features. The scaler learns mean/variance from the training
# split only; the test split is transformed with those same statistics so both
# live in one feature space and no test information leaks into the fit.
scaler = StandardScaler()

X_val = data.X  # NOTE(review): not read by any cell shown in this notebook — confirm it is still needed
X_train = scaler.fit_transform(data.X_train.values)
X_test = scaler.transform(data.X_test.values)
y_train = data.y_train
y_test = data.y_test

# Randomized search over linear_param_dist: 10 sampled candidates, each scored
# with 5-fold cross-validation on negative MSE (higher is better). n_iter and cv
# are spelled out to match the "5 folds for each of 10 candidates" run log.
random_search = RandomizedSearchCV(
    linear_svr,
    param_distributions=linear_param_dist,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1
)
random_search.fit(X_train, y_train)

# Get the best hyperparameters from Random Search (a {param_name: value} dict)
best_params_random: dict = random_search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits




KeyboardInterrupt: 

In [None]:
# Use the best hyperparameters from Random Search as initial values for Grid Search.
# Each parameter is wrapped in a one-element list, so the grid holds exactly one
# candidate: this step cross-validates and refits that single configuration.
grid_search_params = {
    key: [value] for key, value in best_params_random.items()
}

# Search over the LinearSVR estimator configured earlier in the notebook
grid_search = GridSearchCV(
    linear_svr,
    param_grid=grid_search_params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from Grid Search (a {param_name: value} dict)
best_params_grid: dict = grid_search.best_params_

# Train the final model with the best hyperparameters from Grid Search
# (best_estimator_ is the winning configuration refitted on all of X_train)
final_model = grid_search.best_estimator_

In [None]:
# Evaluate the model using cross-validation and calculate the mean.
# The scorer returns negative MSE, so the sign is flipped to report a plain MSE.
cv_scores: np.ndarray = cross_val_score(final_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
mse_mean_cv: float = -np.mean(cv_scores)

# Train the final model on the entire training set, measuring the training time
# in seconds. perf_counter is a monotonic high-resolution clock, so the interval
# cannot be distorted by system clock adjustments.
start_time = time.perf_counter()
final_model.fit(X_train, y_train)
end_time = time.perf_counter()

# Calculate training time
training_time = end_time - start_time

# Evaluate the final model on the test set
y_pred_test = final_model.predict(X_test)
mse_test: float = mean_squared_error(y_test, y_pred_test)

# Collect the outcome in the structure the visualization cells read:
# a mapping whose values are lists of records exposing label, y_pred,
# mse_mean_cv, mse_test and training_time.
results = {
    'linear_svr': [
        SimpleNamespace(
            label='Linear SVR',
            y_pred=y_pred_test,
            mse_mean_cv=mse_mean_cv,
            mse_test=mse_test,
            training_time=training_time,
        )
    ]
}

In [None]:
# Calculate permutation importances for the regressor on the held-out split
permutation_result = permutation_importance(final_model, X_test, y_test, n_repeats=10, random_state=42)

# Store the mean importance over the 10 repeats in a frame indexed by feature,
# under an 'average' column: the feature-importance plot reads
# feature_importances.index and feature_importances['average'].
# NOTE(review): assumes data.X_test is a DataFrame whose column order matches
# the scaled X_test array (X_test was built from data.X_test.values) — confirm.
feature_importances = pd.DataFrame(
    {'average': permutation_result.importances_mean},
    index=data.X_test.columns,
)

# Visualizations

In [None]:
# actual vs predicted
# `results` is expected to map group names to lists of model records, each
# exposing `label` and `y_pred`; flatten it into a single list of records.
models = [entry for group in results.values() for entry in group]

num_models = len(models)
num_cols = 2
# Always keep at least one row so plt.subplots never receives a zero-row grid
num_rows = max(1, math.ceil(num_models / num_cols))

fig, axs = plt.subplots(num_rows, num_cols, figsize=(18, 6*num_rows), tight_layout=True)
axs = axs.flatten()

# Identity line endpoints: points on this diagonal are perfect predictions
diagonal = [y_test.min(), y_test.max()]

for ax, model in zip(axs, models):
    ax.scatter(y_test, model.y_pred, alpha=0.25)
    ax.plot(diagonal, diagonal, 'k', lw=1)
    ax.set_title(model.label)
    ax.set_xlabel('Actual Values')
    ax.set_ylabel('Predicted Values')

# Hide grid cells left empty when the models do not fill the last row
for ax in axs[num_models:]:
    ax.set_visible(False)

fig.suptitle("Actual vs. Predicted values by model")

# plt.show takes no figure argument; it renders every open figure
plt.show()

In [None]:
# residuals
# `results` is expected to map group names to lists of model records, each
# exposing `label` and `y_pred`; flatten it into a single list of records.
models = [entry for group in results.values() for entry in group]

num_models = len(models)
num_cols = 2
# Always keep at least one row so plt.subplots never receives a zero-row grid
num_rows = max(1, math.ceil(num_models / num_cols))

# squeeze=False always yields a 2-D grid of axes, so [row, col] indexing also
# works when there is only a single row of subplots.
fig, axs = plt.subplots(num_rows, num_cols, figsize=(18, 6*num_rows), tight_layout=True, squeeze=False)

# Plain array of observed targets: the subtraction below is then positional
# instead of being aligned on a pandas index.
observed = np.asarray(y_test)

for i, model in enumerate(models):
    # Residuals over the whole test set: observed minus predicted
    prediction_error = observed - np.asarray(model.y_pred)

    # Locate the subplot for the current model
    row, col = divmod(i, num_cols)
    ax = axs[row, col]

    # Scatter the residuals and draw the zero-error reference on this subplot
    ax.scatter(model.y_pred, prediction_error, alpha=0.5)
    ax.axhline(y=0, color='r', linestyle='--')
    ax.set_title(model.label)
    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Prediction Errors')

# Hide grid cells left empty when the models do not fill the last row
for ax in axs.ravel()[num_models:]:
    ax.set_visible(False)

fig.suptitle("Residual values by model")

# plt.show takes no figure argument; it renders every open figure
plt.show()

In [None]:
# model evaluation
# `results` is expected to map group names to lists of model records exposing
# `label`, `mse_mean_cv` and `mse_test`. Flatten it once so all three series
# are read from the same ordered list, without rebinding the name `results`.
evaluated = [entry for group in results.values() for entry in group]

regressor_names = [entry.label for entry in evaluated]
mse_mean_cv_values = [entry.mse_mean_cv for entry in evaluated]
mse_test_values = [entry.mse_test for entry in evaluated]

bar_width = 0.35
index = range(len(regressor_names))

fig, ax = plt.subplots(tight_layout=True)
# Two bars per regressor: CV error at the slot position, test error shifted by one bar width
ax.bar(index, mse_mean_cv_values, bar_width, label='MSE Mean CV')
ax.bar([i + bar_width for i in index], mse_test_values, bar_width, label='MSE Test')

ax.set_xlabel('Regressor')
ax.set_ylabel('Mean Squared Error (MSE)')
ax.set_title('MSE Mean CV and MSE Test for Each Regressor')
# Center each tick between the pair of bars it labels
ax.set_xticks([i + bar_width / 2 for i in index])
ax.set_xticklabels(regressor_names)
ax.legend()

# plt.show takes no figure argument; it renders every open figure
plt.show()

In [None]:
# feature importances
# Expects `feature_importances` to be a DataFrame indexed by feature name with
# an 'average' column holding the importance to plot.
fig, ax = plt.subplots(tight_layout=True)

# Bar plot for average feature importances, drawn explicitly on `ax`.
# hue mirrors x so every bar gets its own color; the legend is therefore redundant.
sns.barplot(
    x=feature_importances.index,
    y=feature_importances['average'],
    hue=feature_importances.index,
    legend=False,
    ax=ax,
)
# Zero reference line: bars below it are features whose permutation improved the score
ax.axhline(y=0, color='k', linestyle='--', linewidth=1)
ax.set_title('Average Feature Importances')
ax.set_xlabel('Features')
ax.set_ylabel('Average Importance')
ax.tick_params(axis='x', labelsize=8)

# plt.show takes no figure argument; it renders every open figure
plt.show()

In [None]:
# training time
# `results` is expected to map group names to lists of model records exposing
# `label` and `training_time`. Flatten it once so names and timings stay
# aligned, without rebinding the name `results`.
timed = [entry for group in results.values() for entry in group]

regressor_names = [entry.label for entry in timed]
training_times = [entry.training_time for entry in timed]

fig, ax = plt.subplots(tight_layout=True)
ax.bar(regressor_names, training_times, color='blue')
ax.set_xlabel('Regressor')
ax.set_ylabel('Training Time (s)')
ax.set_title('Training Time Comparison for Different Regressors')

# plt.show takes no figure argument; it renders every open figure
plt.show()