In [407]:
import numpy as np
import torch
from torch import nn
import pandas as pd
from sklearn.model_selection import train_test_split

In [408]:
data = pd.read_csv("train.csv")
y = data['SalePrice']
X = data.drop('SalePrice', axis=1)
X.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


In [409]:
X.drop('Id', axis=1)
numeric_cols = list(X.select_dtypes(include=np.number).columns)
X_num = X[numeric_cols]
X_num.columns = X[numeric_cols].columns
X_num.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,548,0,61,0,0,0,0,0,2,2008
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,460,298,0,0,0,0,0,0,5,2007
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,608,0,42,0,0,0,0,0,9,2008
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,642,0,35,272,0,0,0,0,2,2006
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,836,192,84,0,0,0,0,0,12,2008


## First model will use only numeric values and missing values will be imputed with SimpleImputer

In [410]:
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()

def get_train_test_data(X, y, test_size):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    
    imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
    imputed_X_test = pd.DataFrame(my_imputer.transform(X_test))
    
    imputed_X_train.columns = X_train.columns
    imputed_X_test.columns = X_test.columns
    
    X_train, X_test = imputed_X_train, imputed_X_test
    
    X_train = torch.tensor(X_train.values, dtype=torch.float32)
    X_test = torch.tensor(X_test.values, dtype=torch.float32)
    y_train = torch.tensor(y_train.values, dtype=torch.float32).squeeze()
    y_test = torch.tensor(y_test.values, dtype=torch.float32).squeeze()
    return X_train, X_test, y_train, y_test

In [411]:
X_train, X_test, y_train, y_test = get_train_test_data(X_num, y, test_size=0.2)
X_num = pd.DataFrame(my_imputer.transform(X_num))

In [412]:
class ModelV1(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=1),
            # nn.Linear(in_features=10, out_features=10),
            # nn.Linear(in_features=10, out_features=1)
        )
        
    def forward(self, x):
        return self.layers(x)

In [413]:
model1 = ModelV1(in_features=37)

loss_fn = nn.L1Loss()
optimizer = torch.optim.SGD(params=model1.parameters(), lr=0.0001)

In [414]:
model1.eval()
y_pred = model1(X_train).squeeze()
y_pred

tensor([-268.0148,  -98.6031, -303.8015,  ..., -330.1853, -338.9747,
        -250.6477], grad_fn=<SqueezeBackward0>)

In [415]:
def train_test(model:torch.nn.Module, loss_fn:torch.nn.Module, optimizer:torch.optim.Optimizer, epochs:int, X_train, X_test, y_train, y_test):
    
    for epoch in range(epochs):
        model.train()
        y_preds = model(X_train).squeeze()
        loss = loss_fn(y_preds, y_train)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        model.eval()
        with (torch.inference_mode()):
            test_pred = model(X_test).squeeze()
            test_loss = loss_fn(test_pred, y_test)
        if epoch % 50 == 0:
            print("----------------------------")
            print(f"Epoch {epoch}:")
            print(f"Train loss: {loss} | Test loss: {test_loss}")

In [416]:
train_test(model=model1, loss_fn=loss_fn, optimizer=optimizer, epochs=2000, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

----------------------------
Epoch 0:
Train loss: 180182.03125 | Test loss: 171856.265625
----------------------------
Epoch 50:
Train loss: 58346.25390625 | Test loss: 65808.96875
----------------------------
Epoch 100:
Train loss: 54455.6875 | Test loss: 61339.3359375
----------------------------
Epoch 150:
Train loss: 51197.4375 | Test loss: 57555.8203125
----------------------------
Epoch 200:
Train loss: 48678.0625 | Test loss: 54606.90625
----------------------------
Epoch 250:
Train loss: 46793.92578125 | Test loss: 52240.30859375
----------------------------
Epoch 300:
Train loss: 45321.69921875 | Test loss: 50298.80859375
----------------------------
Epoch 350:
Train loss: 44132.43359375 | Test loss: 48973.6484375
----------------------------
Epoch 400:
Train loss: 43195.69921875 | Test loss: 47893.1171875
----------------------------
Epoch 450:
Train loss: 42339.078125 | Test loss: 46968.4921875
----------------------------
Epoch 500:
Train loss: 41575.4921875 | Test loss: 46

## This results are not great, let's figure out which 5 data columns have the most corelation with SalePrice and use only them

In [417]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

In [418]:
discrete_features = X_num.dtypes == int

mi_scores = make_mi_scores(X_num, y, discrete_features=discrete_features)
mi_scores[:7]

4     0.562154
16    0.483745
12    0.367210
27    0.366140
6     0.365785
26    0.356768
13    0.312129
Name: MI Scores, dtype: float64

## Let's now use only this 7 features in a Model and see if there is any difference in performance

In [419]:
model2 = ModelV1(in_features=7)
lista = list(mi_scores[:7].index)
for i in lista:
    i -= 1
X_num_2 = X_num[lista]
X_train, X_test, y_train, y_test = get_train_test_data(X_num_2, y, test_size=0.2)
optimizer_2 = torch.optim.SGD(params=model2.parameters(), lr=0.001)

In [420]:
train_test(model=model2, loss_fn=loss_fn, optimizer=optimizer_2, epochs=2000, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

----------------------------
Epoch 0:
Train loss: 181817.484375 | Test loss: 170244.234375
----------------------------
Epoch 50:
Train loss: 38349.5 | Test loss: 38911.47265625
----------------------------
Epoch 100:
Train loss: 36654.87890625 | Test loss: 37045.5234375
----------------------------
Epoch 150:
Train loss: 35221.98828125 | Test loss: 35635.0546875
----------------------------
Epoch 200:
Train loss: 34072.97265625 | Test loss: 34561.73828125
----------------------------
Epoch 250:
Train loss: 33144.19921875 | Test loss: 33714.734375
----------------------------
Epoch 300:
Train loss: 32369.732421875 | Test loss: 33037.2734375
----------------------------
Epoch 350:
Train loss: 31785.99609375 | Test loss: 32536.1171875
----------------------------
Epoch 400:
Train loss: 31370.119140625 | Test loss: 32165.671875
----------------------------
Epoch 450:
Train loss: 31050.96484375 | Test loss: 31867.2734375
----------------------------
Epoch 500:
Train loss: 30797.85546875 | 

## Now let's try to normalize data and see whether it improves our result or not

In [424]:
X_num_3 = X_num_2
y_normalized = pd.DataFrame(torch.nn.functional.normalize(torch.tensor(y, dtype=torch.float32).unsqueeze(dim=1)))
X_train, X_test, y_train, y_test = get_train_test_data(X_num_3, y_normalized, test_size=0.2)
X_train, X_test = torch.nn.functional.normalize(X_train), torch.nn.functional.normalize(X_test)

In [425]:
model3 = ModelV1(in_features=7)
optimizer_3 = torch.optim.SGD(params=model3.parameters(), lr=0.001)

In [426]:
train_test(model=model3, loss_fn=loss_fn, optimizer=optimizer_3, epochs=2001, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

----------------------------
Epoch 0:
Train loss: 1.496457815170288 | Test loss: 1.4966332912445068
----------------------------
Epoch 50:
Train loss: 1.3984014987945557 | Test loss: 1.398505687713623
----------------------------
Epoch 100:
Train loss: 1.300345540046692 | Test loss: 1.3003778457641602
----------------------------
Epoch 150:
Train loss: 1.2022892236709595 | Test loss: 1.2022500038146973
----------------------------
Epoch 200:
Train loss: 1.1042330265045166 | Test loss: 1.1041224002838135
----------------------------
Epoch 250:
Train loss: 1.006177306175232 | Test loss: 1.0059951543807983
----------------------------
Epoch 300:
Train loss: 0.9081215858459473 | Test loss: 0.9078678488731384
----------------------------
Epoch 350:
Train loss: 0.8100658059120178 | Test loss: 0.8097404837608337
----------------------------
Epoch 400:
Train loss: 0.7120100259780884 | Test loss: 0.711613118648529
----------------------------
Epoch 450:
Train loss: 0.6139541268348694 | Test los

## Let's try now with more complicated model

In [432]:
X_train, X_test, y_train, y_test = get_train_test_data(X_num, y, test_size=0.2)

In [460]:
class ModelV2(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=10),
            nn.Linear(in_features=10, out_features=10),
            nn.Linear(in_features=10, out_features=1)
        )
        
    def forward(self, x):
        return self.layer(x)

In [465]:
model4 = ModelV2(37)
loss_fn = nn.L1Loss()
optimizer_4 = torch.optim.Adam(model4.parameters(), lr=0.01, weight_decay=1e-5)

In [466]:
train_test(model4, loss_fn=loss_fn, optimizer=optimizer_4, epochs=2000, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

----------------------------
Epoch 0:
Train loss: 181763.859375 | Test loss: 177604.28125
----------------------------
Epoch 50:
Train loss: 49329.9140625 | Test loss: 42873.515625
----------------------------
Epoch 100:
Train loss: 41520.7578125 | Test loss: 34990.2734375
----------------------------
Epoch 150:
Train loss: 33197.30078125 | Test loss: 29182.814453125
----------------------------
Epoch 200:
Train loss: 28842.8125 | Test loss: 25398.689453125
----------------------------
Epoch 250:
Train loss: 28026.83984375 | Test loss: 25108.298828125
----------------------------
Epoch 300:
Train loss: 27781.712890625 | Test loss: 25098.125
----------------------------
Epoch 350:
Train loss: 27619.1171875 | Test loss: 25059.158203125
----------------------------
Epoch 400:
Train loss: 27517.65234375 | Test loss: 24901.06640625
----------------------------
Epoch 450:
Train loss: 27450.36328125 | Test loss: 24839.5
----------------------------
Epoch 500:
Train loss: 27414.349609375 | Tes