In [193]:
from IPython import get_ipython
from IPython.display import display

In [194]:
%matplotlib inline
import pandas as pd
import torch
from torch import nn

In [195]:
class KaggleHouse(torch.utils.data.Dataset):
    def __init__(self, batch_size, train=None, val=None):
        super().__init__()
        self.batch_size = batch_size
        if train is None:
            self.raw_train = pd.read_csv('train.csv')
            self.raw_val = pd.read_csv('test.csv')

    def preprocess(self):
        label = 'SalePrice'
        features = pd.concat(
            (self.raw_train.drop(columns=['Id', label]),
             self.raw_val.drop(columns=['Id'])))

        features = features.apply(pd.to_numeric, errors='coerce')

        numeric_features = features.dtypes[features.dtypes != 'object'].index
        features[numeric_features] = features[numeric_features].apply(
            lambda x: (x - x.mean()) / x.std())

        features = features.fillna(0)
        features = pd.get_dummies(features, dummy_na=True)

        self.train = torch.tensor(features[:self.raw_train.shape[0]].values, dtype=torch.float32)
        self.val = torch.tensor(features[self.raw_train.shape[0]:].values, dtype=torch.float32)
        self.y_train = torch.tensor(self.raw_train[label].values.reshape(-1, 1), dtype=torch.float32)

    def get_tensorloader(self, tensors, train):
          # Create a DataLoader from the tensors (customize as needed)
          from torch.utils.data import DataLoader, TensorDataset

          dataset = TensorDataset(*tensors)  # Create a dataset from tensors
          return DataLoader(dataset, batch_size=self.batch_size, shuffle=train)  # Create DataLoader

    def get_dataloader(self, train):
        return self.get_tensorloader([self.train, self.y_train] if train else [self.val], train)

# Run the class definition
data = KaggleHouse(batch_size=64)
data.preprocess()
print(data.train.shape)  # Check the shape of the training data


torch.Size([1460, 79])


In [196]:
print(data.raw_train.iloc[:4, [0, 1, 2, 3, -3, -2, -1]])

   Id  MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
0   1          60       RL         65.0       WD        Normal     208500
1   2          20       RL         80.0       WD        Normal     181500
2   3          60       RL         68.0       WD        Normal     223500
3   4          70       RL         60.0       WD       Abnorml     140000


In [198]:
data.preprocess()
data.train.shape

torch.Size([1460, 79])

In [200]:
train_dataloader = data.get_dataloader(train=True)
test_dataloader = data.get_dataloader(train=False)

In [201]:
# Example of iterating over the training DataLoader
for X, y in train_dataloader:
    print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")
    break  # Just to show the first batch


Shape of X: torch.Size([64, 79]), Shape of y: torch.Size([64, 1])


In [229]:
# Setting the Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Defining the MLP Class
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(79, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    # Forward Pass Definition
    def forward(self, x):
        return self.layers(x)

# Instantiating the Model
model = MLP().to(device)
print(model)


MLP(
  (layers): Sequential(
    (0): Linear(in_features=79, out_features=1024, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1024, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=128, bias=True)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=1, bias=True)
  )
)


In [230]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

In [233]:
def train(data_loader, model, criterion, optimizer):
    model.train()
    total_loss = 0

    for data, target in data_loader:
        # Move data and target to device
        data = data.to(device)
        target = target.to(device)

        # Reshape input data
        data = data.view(data.size(0), -1)

        # Forward pass
        output = model(data).view(-1)  # Ensure output is 1D

        # Calculate the loss
        loss = criterion(output, target)
        total_loss += loss.item()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(data_loader)
    print(f"Average loss: {avg_loss:.7f}")


In [234]:
epochs = 7
for epoch in range(epochs):
    print(f"Training epoch: {epoch + 1}")
    train(train_dataloader, model, criterion, optimizer)

Training epoch: 1
Average loss: 38725168706.7826080
Training epoch: 2
Average loss: 36813782416.6956558
Training epoch: 3
Average loss: 29954585243.8260880
Training epoch: 4
Average loss: 18006215368.3478279
Training epoch: 5
Average loss: 12278920815.3043480
Training epoch: 6
Average loss: 10692622157.9130440
Training epoch: 7
Average loss: 9700371589.5652180
