In [None]:
import torch
from tqdm import tqdm
from torch import nn, optim
import pandas as pd
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the processed dataset, then discard the `id`, `source` and
# `coordinates` columns before any further processing.
data = pd.read_csv("../processed_data/data.csv")
data = data.drop(columns=["id", "source", "coordinates"])

In [None]:
# IQR-based outlier removal on one column: rows whose value lies more than
# 1.5 * IQR below the first quartile or above the third quartile are dropped.
col = 'price'

# Left empty; nothing in this cell writes to it.
outlier_info = {}

# Quartiles and the derived acceptance interval [lower_bound, upper_bound].
Q1, Q3 = (data[col].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows strictly outside the interval are the outliers; they are removed by
# index label and the remaining rows replace `data`.
is_outlier = data[col].lt(lower_bound) | data[col].gt(upper_bound)
outliers = data.loc[is_outlier]
data = data.drop(index=outliers.index)

# Print a summary of the filtered frame and persist it without the index column.
data.info()
data.to_csv("../processed_data/data_no_outliers.csv", index=False)

In [None]:
# Base hidden width shared by the network classes defined below: their layers
# are sized as this value, twice this value, or half of it.
base_neuron_count = 32

class ResidualBlock(nn.Module):
    """Fully connected block with a skip connection.

    The input goes through one square ``nn.Linear`` layer, the untouched
    input is added back to that projection, and ELU is applied to the sum.

    Args:
        n_features: Width of both the input and the output of the block.
    """

    def __init__(self, n_features):
        super().__init__()
        self.fc = nn.Linear(n_features, n_features)
        self.activation = nn.ELU()

    def forward(self, x):
        """Return ``ELU(fc(x) + x)``."""
        projected = self.fc(x)
        with_skip = projected + x
        return self.activation(with_skip)

class TabularFFNN(nn.Module):
    """Feed-forward network over flat feature vectors.

    The stack widens from ``base_neuron_count`` to twice that width, narrows
    to half of it, passes through a ``ResidualBlock`` and ends in a linear
    head. ELU activations, two ``LayerNorm`` layers and two ``Dropout``
    layers (0.2 and 0.4) are interleaved as listed in ``self.ffnn``.

    Args:
        input_size: Number of features per sample after flattening.
        output_size: Number of values produced per sample.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        half_width = base_neuron_count // 2
        self.ffnn = nn.Sequential(
            nn.Linear(input_size, base_neuron_count),
            nn.ELU(),
            nn.LayerNorm(base_neuron_count),
            nn.Linear(base_neuron_count, base_neuron_count * 2),
            nn.ELU(),
            nn.Dropout(0.2),
            nn.Linear(base_neuron_count * 2, half_width),
            nn.ELU(),
            nn.LayerNorm(half_width),
            nn.Linear(half_width, half_width),
            ResidualBlock(half_width),
            nn.Dropout(0.4),
            nn.Linear(half_width, output_size)
        )

    def forward(self, x):
        """Cast ``x`` to float32, flatten each sample and run the stack.

        Args:
            x: Tensor whose first dimension is the batch; all remaining
                dimensions are flattened into one feature axis.

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        x = x.float()
        # reshape() instead of view(): view() raises on non-contiguous
        # inputs (e.g. a transposed batch), reshape() copies when it has to.
        x = x.reshape(x.size(0), -1)
        return self.ffnn(x)
    
class TabularFFNNOLD(nn.Module):
    """Earlier, deeper constant-width variant of the tabular network.

    An input projection with batch normalisation is followed by seven
    ``base_neuron_count``-wide ReLU layers (two of them with dropout) and a
    linear head.

    Args:
        input_size: Number of features per sample after flattening.
        output_size: Number of values produced per sample.
        dropout_prob: Accepted but not read; the dropout rates used below
            are the fixed literals 0.5 and 0.1.
    """

    def __init__(self, input_size, output_size, dropout_prob=0.4):
        super().__init__()
        self.ffnn = nn.Sequential(
            nn.Linear(input_size, base_neuron_count),
            # Normalises the base_neuron_count features produced by the Linear above.
            nn.BatchNorm1d(base_neuron_count),
            nn.ReLU(),
            nn.Linear(base_neuron_count, base_neuron_count),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(base_neuron_count, base_neuron_count),
            nn.ReLU(),
            nn.Linear(base_neuron_count, base_neuron_count),
            nn.ReLU(),
            nn.Linear(base_neuron_count, base_neuron_count),
            nn.ReLU(),
            nn.Linear(base_neuron_count, base_neuron_count),
            nn.ReLU(),
            nn.Linear(base_neuron_count, base_neuron_count),
            nn.ReLU(),
            nn.Linear(base_neuron_count, base_neuron_count),
            nn.Dropout(0.1),
            nn.ReLU(),
            nn.Linear(base_neuron_count, output_size)
        )

    def forward(self, x):
        """Cast ``x`` to float32, flatten each sample and run the stack.

        Args:
            x: Tensor whose first dimension is the batch; all remaining
                dimensions are flattened into one feature axis.

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        x = x.float()
        # reshape() instead of view(): view() raises on non-contiguous
        # inputs (e.g. a transposed batch), reshape() copies when it has to.
        x = x.reshape(x.size(0), -1)
        return self.ffnn(x)
    
class TabularFFNNSimple(nn.Module):
    """Small baseline: two ``base_neuron_count``-wide ReLU layers and a linear head.

    Args:
        input_size: Number of features per sample after flattening.
        output_size: Number of values produced per sample.
        dropout_prob: Accepted but not read; the single dropout layer below
            uses the fixed rate 0.3.
    """

    def __init__(self, input_size, output_size, dropout_prob=0.4):
        super().__init__()
        self.ffnn = nn.Sequential(
            nn.Linear(input_size, base_neuron_count),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(base_neuron_count, base_neuron_count),
            nn.ReLU(),
            nn.Linear(base_neuron_count, output_size)
        )

    def forward(self, x):
        """Cast ``x`` to float32, flatten each sample and run the stack.

        Args:
            x: Tensor whose first dimension is the batch; all remaining
                dimensions are flattened into one feature axis.

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        x = x.float()
        # reshape() instead of view(): view() raises on non-contiguous
        # inputs (e.g. a transposed batch), reshape() copies when it has to.
        x = x.reshape(x.size(0), -1)
        return self.ffnn(x)
    
# Split the data into features and target
X = data.drop('price', axis=1)
y = data['price']

# NOTE: the features are fed to the network unscaled. StandardScaler is
# imported at the top of the notebook but is not applied in this cell.
device = torch.device("cpu")

# Seed torch's global RNG so weight initialisation, dropout masks and
# DataLoader shuffling repeat across runs (same value as the splits' random_state).
torch.manual_seed(42)

# Convert to PyTorch tensors
X_tensor = torch.tensor(X.to_numpy(), dtype=torch.float32, device=device)
y_tensor = torch.tensor(y.values, dtype=torch.float32, device=device)

# 60 % train; the remaining 40 % is split evenly into validation and test.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X_tensor, y_tensor, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=42
)

# Create DataLoader for training, validation, and testing.
# Only the training loader is shuffled.
train_data = TensorDataset(X_train, y_train)
val_data = TensorDataset(X_val, y_val)
test_data = TensorDataset(X_test, y_test)
batch_size = 256
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# The model's input width is the number of feature columns.
input_size = X_train.shape[1]

model = TabularFFNN(
    input_size=input_size,
    output_size=1
)
model.to(device)

num_epochs = 3000
train_losses = []  # mean training MSE per epoch
val_losses = []    # mean validation MSE per epoch
epochs_suc = []    # indices of completed epochs, read by the plotting cell

optimizer = optim.Adam(
    model.parameters(),
    lr=8e-4,
    weight_decay=5e-4
)
criterion = torch.nn.MSELoss()    # optimised loss
criterion_abs = torch.nn.L1Loss()  # reported only, never back-propagated

# `verbose` is deprecated in recent PyTorch releases, so it is not passed.
# NOTE(review): with factor=0.999999 a reduction lowers the LR by roughly
# 8e-10, which is below the scheduler's default eps (1e-8); such updates are
# ignored, so the learning rate effectively stays at 8e-4. Confirm this is intended.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.999999,
    patience=10
)

for epoch in range(num_epochs):
    # Training
    model.train()  # enables dropout
    running_loss = 0.0
    for features, targets in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs} - Training'):
        optimizer.zero_grad()
        outputs = model(features)
        # Targets become (batch, 1) to match the model output; otherwise the
        # loss would broadcast (batch, 1) against (batch,).
        targets = targets.view(-1, 1).float()
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    train_losses.append(running_loss / len(train_loader))

    # Validation
    model.eval()  # disables dropout
    val_loss = 0.0
    l1_losses = []
    with torch.no_grad():  # no gradients needed for evaluation
        for features, targets in tqdm(val_loader, desc=f'Epoch {epoch+1}/{num_epochs} - Validation'):
            outputs = model(features)
            targets = targets.view(-1, 1).float()
            val_loss += criterion(outputs, targets).item()
            # Store plain floats rather than tensors.
            l1_losses.append(criterion_abs(outputs, targets).item())

    val_losses.append(val_loss / len(val_loader))
    l1_mean_loss = sum(l1_losses) / len(l1_losses)

    epochs_suc.append(epoch)
    scheduler.step(val_losses[-1])

    # Epoch summary (losses truncated to integers for display)
    print(f'Epoch {epoch+1}, Training Loss: {int(train_losses[-1])}, Validation Loss: {int(val_losses[-1])}, L1: {int(l1_mean_loss)}')

In [None]:
import matplotlib.pyplot as plt

# Learning curves: training vs. validation MSE per epoch on a log-scaled y axis.
fig, ax = plt.subplots()
ax.set_title("Model evaluation")
ax.plot(train_losses, label='Training')
ax.plot(val_losses, label='Validation')
ax.set_ylabel("MSE")
ax.set_xlabel("Epoch")
ax.set_yscale('log')

# Roughly ten evenly spaced x ticks. The step is clamped to at least 1 because
# range() rejects a zero step, which last_epoch // 10 yields when fewer than
# ten epochs were recorded (e.g. after an interrupted run). An empty
# `epochs_suc` is tolerated as well; the default ticks are then kept.
last_epoch = epochs_suc[-1] if epochs_suc else 0
tick_step = max(1, last_epoch // 10)
tick_positions = list(range(1, last_epoch, tick_step))
if tick_positions:
    ax.set_xticks(tick_positions)

ax.legend()
plt.show()

In [None]:
# Continue training the same `model` for another `num_epochs` epochs. The
# optimizer and scheduler are created afresh, so Adam's running statistics
# start from zero again while the model weights are kept.
optimizer = optim.Adam(
    model.parameters(),
    lr=8e-4,
    weight_decay=5e-4
)
criterion = torch.nn.MSELoss()    # optimised loss
criterion_abs = torch.nn.L1Loss()  # reported only, never back-propagated

# `verbose` is deprecated in recent PyTorch releases, so it is not passed.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.999999,
    patience=10
)

# Number of epochs already recorded by earlier runs. Offsetting by it keeps
# the entries of `epochs_suc` and the printed epoch numbers aligned with the
# positions in `train_losses` / `val_losses`, which keep growing across runs,
# instead of restarting at 0.
epoch_offset = len(epochs_suc)

for epoch in range(num_epochs):
    global_epoch = epoch_offset + epoch

    # Training
    model.train()  # enables dropout
    running_loss = 0.0
    for features, targets in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(features)
        # Targets become (batch, 1) to match the model output; otherwise the
        # loss would broadcast (batch, 1) against (batch,).
        targets = targets.view(-1, 1).float()
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    train_losses.append(running_loss / len(train_loader))

    # Validation
    model.eval()  # disables dropout
    val_loss = 0.0
    l1_losses = []
    with torch.no_grad():  # no gradients needed for evaluation
        for features, targets in tqdm(val_loader):
            outputs = model(features)
            targets = targets.view(-1, 1).float()
            val_loss += criterion(outputs, targets).item()
            # Store plain floats rather than tensors.
            l1_losses.append(criterion_abs(outputs, targets).item())

    val_losses.append(val_loss / len(val_loader))
    l1_mean_loss = sum(l1_losses) / len(l1_losses)

    epochs_suc.append(global_epoch)
    scheduler.step(val_losses[-1])

    # Epoch summary (losses truncated to integers for display)
    print(f'Epoch {global_epoch+1}, Training Loss: {int(train_losses[-1])}, Validation Loss: {int(val_losses[-1])}, L1: {int(l1_mean_loss)}')