```markdown
This is a notebook version of the `train.py` script. Use it to verify that training works as intended.
```

In [2]:
# Library Import
import pandas as pd
from utils.Dataloader import PricingWizardDataset
from utils.DataTransformation import base_regression_pipeline, ridge_regression_pipeline
from utils.helpers import save_model, drop_helpers
from models import base_linear_regression, regularized_regression, regression_neural_network 
import argparse
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load data
# Instantiate the project dataset wrapper with its default arguments. On load it
# prints a summary (dataset name, row/column counts, split sizes, random state).
data = PricingWizardDataset()

Dataset Loaded: post_preprocessing_without_dummies
	Number of Rows: 283055
	Number of Columns: 22
	Outlier Removal: True
	Train Size: 0.8
	Test Size: 0.2
	Random State: 42


## Base Linear Regression

In [3]:
# 1. Data transformations
# Run the base-regression preparation pipeline over the dataset via its
# apply_function hook; the two prints only bracket the step in the cell output.
# NOTE(review): the return value is discarded, so this presumably transforms the
# dataset held by `data` in place — confirm in PricingWizardDataset.
print('Applying data preparation...')    
data.apply_function(base_regression_pipeline)
print('Done.')

Applying data preparation...
Done.


In [4]:
# 2. Stratified split on the log-price target.
# val_size=0 requests no validation share for this model, and return_splits=False
# means nothing is taken from the call's return value here.
data.stratify_train_test_split(
    y_column='log_listing_price',
    val_size=0,
    return_splits=False,
)

Dependent variable distribution is equal across all subsets


In [5]:
# 3. Train model
# The import cell only brings the `base_linear_regression` module into scope, not
# a bare `linear_regression` name, so calling it unqualified raises NameError on
# a fresh kernel. Call it through the module instead.
# NOTE(review): assumes `linear_regression` is defined in
# models/base_linear_regression.py — confirm.
results = base_linear_regression.linear_regression(data)

# 4. Print results
# `results` is indexed by metric key; each metric is printed on its own line.
print('Test Results:')
print('R2 Score:', results['r2'])
print('MSE:', results['mse'])
print('MAE', results['mae'])
print('RMSE', results['rmse'])

Test Results:
R2 Score: 0.5452212770709474
MSE: 0.4381693860784058
MAE 0.5134875030165035
RMSE 0.6619436426754213


## Regularized Regression

In [6]:
# Reset Dataset
# Called before preparing the data again for a different model.
# NOTE(review): presumably restores the originally loaded frame so the pipeline
# below does not run on top of the base-regression transformations — confirm.
data.reset_dataset()

# 1. Data transformations
# Run the ridge-regression preparation pipeline via the dataset's apply_function hook.
print('Applying data preparation...')    
data.apply_function(ridge_regression_pipeline)
print('Done.')

Applying data preparation...
Done.


In [7]:
# 2. Stratified split on the log-price target.
# val_size=0 requests no validation share for this model, and return_splits=False
# means nothing is taken from the call's return value here.
data.stratify_train_test_split(
    y_column='log_listing_price',
    val_size=0,
    return_splits=False,
)

Dependent variable distribution is equal across all subsets


In [8]:
# 3. Train model
# Fit the regularized regression and keep its evaluation metrics in `results`.
# Per the recorded output this runs a GridSearchCV over alpha (5 folds for each
# of 13 candidates), so the cell is slow.
# NOTE(review): the second argument (-1) is presumably n_jobs, i.e. "use all
# cores" — confirm against the function signature.
# NOTE(review): `regularized_regression` is imported from `models` alongside
# `regression_neural_network`, which is used as a module further down. If this
# name is a module too, the call must go through it
# (`regularized_regression.<function>(...)`) — confirm.
results = regularized_regression(data, -1)

Training model using GridSearchCV: regularized_regression
Fitting 5 folds for each of 13 candidates, totalling 65 fits
[CV] END ........................alpha=0.0031622776601683794; total time=  30.5s
[CV] END ........................alpha=0.0031622776601683794; total time=  30.4s
[CV] END ........................................alpha=0.001; total time=  30.6s
[CV] END .........................................alpha=0.01; total time=  30.9s
[CV] END .........................................alpha=0.01; total time=  31.3s
[CV] END ........................alpha=0.0031622776601683794; total time=  31.3s
[CV] END ........................................alpha=0.001; total time=  31.3s
[CV] END ........................................alpha=0.001; total time=  31.5s
[CV] END ........................................alpha=0.001; total time=  31.7s
[CV] END ........................alpha=0.0031622776601683794; total time=  31.7s
[CV] END ........................alpha=0.0031622776601683794; total tim

In [16]:
# 4. Print results
# Walk the (label, key) pairs in display order and print each metric from `results`.
print('Test Results:')
for label, key in (('R2 Score:', 'r2'), ('MSE:', 'mse'), ('MAE', 'mae'), ('RMSE', 'rmse')):
    print(label, results[key])

Test Results:
R2 Score: 0.5695965129992877
MSE: 0.4146843776034087
MAE 0.4972702697816776
RMSE 0.6439599192522845


## Neural Net

In [4]:
# Reset dataset
data.reset_dataset()

# Apply ridge regression data preparation
print('Applying data preparation...')
data.apply_function(ridge_regression_pipeline)

# Min-max scaling (not standard scaling) of every column kept by drop_helpers.
# NOTE(review): the scaler is fitted on the full dataset *before* the
# train/val/test split in the next cell, so validation and test statistics leak
# into the training features. Fitting on the training split only would avoid
# this, but requires the split to happen first.
scaler = MinMaxScaler()
feature_frame = drop_helpers(data.df)  # computed once; reused for values and column labels
X = scaler.fit_transform(feature_frame)

# Write the scaled values back into data.df under the same column labels
data.df[feature_frame.columns] = X

Applying data preparation...


In [5]:
# Split the prepared data, this time holding out a validation share of 0.2.
data.stratify_train_test_split(
    y_column='log_listing_price',
    val_size=.2,
    return_splits=False,
)


def _as_float_tensor(values):
    """Wrap `values` in a float32 torch tensor."""
    return torch.tensor(values, dtype=torch.float32)


# Feature tensors: strip helper columns from each split, then convert (train, test, val).
X_train_tensor, X_test_tensor, X_val_tensor = (
    _as_float_tensor(drop_helpers(split).to_numpy())
    for split in (data.X_train, data.X_test, data.X_val)
)

# Target tensors, in the same train/test/val order.
y_train_tensor, y_test_tensor, y_val_tensor = (
    _as_float_tensor(target)
    for target in (data.y_train, data.y_test, data.y_val)
)

# Pair features with targets for the training and validation sets.
trainset = TensorDataset(X_train_tensor, y_train_tensor)
valset = TensorDataset(X_val_tensor, y_val_tensor)

Dependent variable distribution is equal across all subsets


In [6]:
# Mini-batch loaders: the training loader reshuffles, the validation loader
# keeps a fixed order.
batch_size = 32
train_loader, val_loader = (
    DataLoader(dataset=subset, batch_size=batch_size, shuffle=reshuffle)
    for subset, reshuffle in ((trainset, True), (valset, False))
)

In [7]:
# Train Model
# Seed torch's global RNG first so the run is repeatable under Restart & Run All.
# The shuffled batch order of train_loader is drawn from this RNG, as is any
# weight initialisation (assuming the network is built inside regression_network).
# 42 mirrors the random state reported when the dataset was loaded.
SEED = 42
torch.manual_seed(SEED)

# The call receives the train/validation loaders plus the test features and
# targets. The recorded output shows per-epoch train/val losses, and the returned
# `results` is read by metric key ('r2', 'mse', 'mae', 'rmse') in the next cell.
results = regression_neural_network.regression_network(train_loader, val_loader, X_test_tensor, data.y_test)

Epoch 0, train loss: 0.5360277266478125, val loss: 0.3769057646393776
Epoch 1, train loss: 0.37201510082309913, val loss: 0.36070685381185535
Epoch 2, train loss: 0.350926898995354, val loss: 0.3772550086180369
Epoch 3, train loss: 0.3415549395054287, val loss: 0.34388937130570413
Epoch 4, train loss: 0.3344634524065813, val loss: 0.34510406237621766
Epoch 5, train loss: 0.32887414834281736, val loss: 0.36449053823148125
Epoch 6, train loss: 0.32519303964199703, val loss: 0.34444683176481117
Epoch 7, train loss: 0.32177047881339177, val loss: 0.34081980778840976
Epoch 8, train loss: 0.31904713651017985, val loss: 0.34078088883457885
Epoch 9, train loss: 0.3166050994888049, val loss: 0.3472947635682626
Epoch 10, train loss: 0.31448905114238884, val loss: 0.34158121569681976
Epoch 11, train loss: 0.31252377495006817, val loss: 0.35191721259958325
Epoch 12, train loss: 0.3111695528238136, val loss: 0.3403679687087819
Epoch 13, train loss: 0.3090308446397818, val loss: 0.34582832857629675


In [8]:
# Print results
# Walk the (label, key) pairs in display order and print each metric from `results`.
for label, key in (('R2 Score:', 'r2'), ('MSE:', 'mse'), ('MAE', 'mae'), ('RMSE', 'rmse')):
    print(label, results[key])

R2 Score: 0.6408822819519124
MSE: 0.3460020931356468
MAE 0.4392821771925936
RMSE 0.588219426010096


In [9]:
# Save model
# Hand `results` (the value returned by the training call) to the project's
# save_model helper, flagged as a PyTorch artefact.
# NOTE(review): the path is relative, so it resolves against the notebook's
# working directory. save_model presumably extracts the network from `results`
# — confirm.
path = 'models/pickled_models/regression_neural_net.pt'
save_model(results, path, model_type='pytorch')

Model saved successfully at models/pickled_models/regression_neural_net.pt
