In [1]:
import numpy as np
import torch
from torch import nn
import pandas as pd
from sklearn.model_selection import train_test_split



In [2]:
# Load the raw training table, then separate the regression target
# (SalePrice) from the remaining feature columns.
data = pd.read_csv("train.csv")
y = data.loc[:, 'SalePrice']
X = data.drop(columns=['SalePrice'])
X.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


In [3]:
# NOTE(review): DataFrame.drop returns a new frame and the result here is not
# assigned, so X (and therefore X_num) still contains the 'Id' column.
# Assigning it would change the number of numeric features, which ModelV1
# hard-codes as in_features=37 -- update both together if 'Id' should go.
X.drop(columns='Id')

# Keep only the columns whose dtype is numeric.
numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
X_num = X.loc[:, numeric_cols]
X_num.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,548,0,61,0,0,0,0,0,2,2008
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,460,298,0,0,0,0,0,0,5,2007
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,608,0,42,0,0,0,0,0,9,2008
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,642,0,35,272,0,0,0,0,2,2006
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,836,192,84,0,0,0,0,0,12,2008


## The first model uses only the numeric columns; missing values are imputed with SimpleImputer

In [4]:
from sklearn.impute import SimpleImputer

# Fixed seed so the train/test partition (and every loss value derived from
# it) is identical on each Restart & Run All.
SPLIT_SEED = 42

my_imputer = SimpleImputer()  # default strategy: replace NaN with the column mean

X_train, X_test, y_train, y_test = train_test_split(
    X_num, y, test_size=0.2, random_state=SPLIT_SEED
)

# Fit the imputer on the training split only and reuse those statistics for
# the test split, so nothing from the test rows influences the fill values.
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train), columns=X_train.columns)
imputed_X_test = pd.DataFrame(my_imputer.transform(X_test), columns=X_test.columns)

# From here on X_train / X_test / y_train / y_test are float32 tensors, which
# is the dtype nn.Linear parameters use by default.
X_train = torch.tensor(imputed_X_train.values, dtype=torch.float32)
X_test = torch.tensor(imputed_X_test.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32)

X_train.dtype, y_train.dtype

(torch.float32, torch.float32)

In [5]:
class ModelV1(nn.Module):
    """Linear-regression baseline: one fully connected layer producing one value.

    Args:
        in_features: Number of input features per sample. Defaults to 37, the
            width that was previously hard-coded, so ``ModelV1()`` behaves
            exactly as before.
    """

    def __init__(self, in_features: int = 37):
        super().__init__()
        # Kept inside nn.Sequential so parameter names in the state dict
        # ("layers.0.weight", "layers.0.bias") stay the same.
        self.layers = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=1),
        )

    def forward(self, x):
        """Return predictions of shape ``(..., 1)`` for input of shape ``(..., in_features)``."""
        return self.layers(x)

In [17]:
# Seed PyTorch's RNG before building the model so the random weight
# initialisation -- and therefore the loss curve -- is repeatable.
torch.manual_seed(42)

model1 = ModelV1()

# L1Loss is the mean absolute error between predictions and targets.
loss_fn = nn.L1Loss()
optimizer = torch.optim.SGD(params=model1.parameters(), lr=0.0001)

In [18]:
# Sanity check: push the training features through the model and look at the
# raw predictions. nn.Module.eval() returns the module itself, so switching
# to evaluation mode and calling the model can be chained.
y_pred = torch.squeeze(model1.eval()(X_train))
y_pred

tensor([-1602.8331, -1323.0964, -1936.3274,  ..., -1355.3306, -1275.1062,
        -1084.7643], grad_fn=<SqueezeBackward0>)

In [11]:
def train_test(model: torch.nn.Module, loss_fn: torch.nn.Module, optimizer: torch.optim.Optimizer, epochs: int,
               train_data=None, test_data=None, log_every: int = 50):
    """Train ``model`` with full-batch updates and periodically report train/test loss.

    Each epoch runs one forward/backward pass over the whole training set and
    one optimizer step. Every ``log_every`` epochs (starting with epoch 0) the
    model is evaluated on the test set and both losses are printed.

    Args:
        model: Network to optimise; updated in place.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of full-batch training steps to run.
        train_data: Optional ``(features, targets)`` pair. When omitted, the
            notebook-level ``X_train`` / ``y_train`` tensors are used.
        test_data: Optional ``(features, targets)`` pair. When omitted, the
            notebook-level ``X_test`` / ``y_test`` tensors are used.
        log_every: Evaluate and print every this many epochs.

    Returns:
        None. When at least one epoch ran, the model is left in eval mode.

    Raises:
        ValueError: If ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError(f"log_every must be a positive integer, got {log_every}")

    # Fall back to the notebook globals only when no explicit data is passed,
    # so existing calls keep working while the function stays usable on its own.
    features_train, targets_train = (X_train, y_train) if train_data is None else train_data
    features_test, targets_test = (X_test, y_test) if test_data is None else test_data

    for epoch in range(epochs):
        model.train()
        # Squeeze only the trailing output dimension: a bare squeeze() would
        # also collapse a batch of one row into a 0-d tensor that no longer
        # matches the 1-D target.
        train_pred = model(features_train).squeeze(dim=-1)
        loss = loss_fn(train_pred, targets_train)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # The test loss is only reported on logging epochs, so only compute it then.
        if epoch % log_every == 0:
            model.eval()
            with torch.inference_mode():
                test_pred = model(features_test).squeeze(dim=-1)
                test_loss = loss_fn(test_pred, targets_test)
            print("----------------------------")
            print(f"Epoch {epoch}:")
            print(f"Train loss: {loss.item()} | Test loss: {test_loss.item()}")

    # Leave the model in evaluation mode after training, as before.
    if epochs > 0:
        model.eval()

In [19]:
# Train the linear baseline for 2000 full-batch epochs; train_test prints the
# train and test loss as it goes.
train_test(
    model=model1,
    loss_fn=loss_fn,
    optimizer=optimizer,
    epochs=2000,
)

----------------------------
Epoch 0:
Train loss: 181502.5 | Test loss: 174249.453125
----------------------------
Epoch 50:
Train loss: 60208.890625 | Test loss: 56522.33984375
----------------------------
Epoch 100:
Train loss: 55605.828125 | Test loss: 53347.1171875
----------------------------
Epoch 150:
Train loss: 51890.9765625 | Test loss: 50788.34375
----------------------------
Epoch 200:
Train loss: 49162.53515625 | Test loss: 48948.1953125
----------------------------
Epoch 250:
Train loss: 47169.39453125 | Test loss: 47298.30859375
----------------------------
Epoch 300:
Train loss: 45746.97265625 | Test loss: 45964.78125
----------------------------
Epoch 350:
Train loss: 44615.63671875 | Test loss: 45115.62890625
----------------------------
Epoch 400:
Train loss: 43618.88671875 | Test loss: 44350.34765625
----------------------------
Epoch 450:
Train loss: 42787.53125 | Test loss: 43582.796875
----------------------------
Epoch 500:
Train loss: 42029.6328125 | Test loss:

## These results are not great. Let's find the 5 columns that correlate most strongly with SalePrice and use only those