In [150]:
import numpy as np
import torch
from torch import nn
import pandas as pd
from sklearn.model_selection import train_test_split

In [151]:
data = pd.read_csv("train.csv")
y = data['SalePrice']
X = data.drop('SalePrice', axis=1)
X.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


In [152]:
X.drop('Id', axis=1)
numeric_cols = list(X.select_dtypes(include=np.number).columns)
X_num = X[numeric_cols]
X_num.columns = X[numeric_cols].columns
X_num.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,548,0,61,0,0,0,0,0,2,2008
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,460,298,0,0,0,0,0,0,5,2007
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,608,0,42,0,0,0,0,0,9,2008
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,642,0,35,272,0,0,0,0,2,2006
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,836,192,84,0,0,0,0,0,12,2008


## First model will use only numeric values and missing values will be imputed with SimpleImputer

In [153]:
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()

def get_train_test_data(X, y, test_size):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    
    imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
    imputed_X_test = pd.DataFrame(my_imputer.transform(X_test))
    
    imputed_X_train.columns = X_train.columns
    imputed_X_test.columns = X_test.columns
    
    X_train, X_test = imputed_X_train, imputed_X_test
    
    X_train = torch.tensor(X_train.values, dtype=torch.float32)
    X_test = torch.tensor(X_test.values, dtype=torch.float32)
    y_train = torch.tensor(y_train.values, dtype=torch.float32).squeeze()
    y_test = torch.tensor(y_test.values, dtype=torch.float32).squeeze()
    return X_train, X_test, y_train, y_test

In [154]:
X_train, X_test, y_train, y_test = get_train_test_data(X_num, y, test_size=0.2)
X_num = pd.DataFrame(my_imputer.transform(X_num))

In [155]:
class ModelV1(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=1),
            # nn.Linear(in_features=10, out_features=10),
            # nn.Linear(in_features=10, out_features=1)
        )
        
    def forward(self, x):
        return self.layers(x)

In [156]:
model1 = ModelV1(in_features=37)

loss_fn = nn.L1Loss()
optimizer = torch.optim.SGD(params=model1.parameters(), lr=0.0001)

In [157]:
model1.eval()
y_pred = model1(X_train).squeeze()
y_pred

tensor([ -192.6853,  -547.2485,  -160.6722,  ...,  -325.7292,  -633.8361,
        -2070.7332], grad_fn=<SqueezeBackward0>)

In [158]:
def train_test(model:torch.nn.Module, loss_fn:torch.nn.Module, optimizer:torch.optim.Optimizer, epochs:int, X_train, X_test, y_train, y_test):
    
    for epoch in range(epochs):
        model.train()
        y_preds = model(X_train).squeeze()
        loss = loss_fn(y_preds, y_train)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        model.eval()
        with (torch.inference_mode()):
            test_pred = model(X_test).squeeze()
            test_loss = loss_fn(test_pred, y_test)
        if epoch % 50 == 0:
            print("----------------------------")
            print(f"Epoch {epoch}:")
            print(f"Train loss: {loss} | Test loss: {test_loss}")

In [159]:
train_test(model=model1, loss_fn=loss_fn, optimizer=optimizer, epochs=2000, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

----------------------------
Epoch 0:
Train loss: 182054.453125 | Test loss: 164537.40625
----------------------------
Epoch 50:
Train loss: 58978.8828125 | Test loss: 62964.24609375
----------------------------
Epoch 100:
Train loss: 55311.67578125 | Test loss: 58188.85546875
----------------------------
Epoch 150:
Train loss: 52219.02734375 | Test loss: 54179.3515625
----------------------------
Epoch 200:
Train loss: 49892.57421875 | Test loss: 50916.95703125
----------------------------
Epoch 250:
Train loss: 48090.35546875 | Test loss: 48341.234375
----------------------------
Epoch 300:
Train loss: 46655.13671875 | Test loss: 46175.15625
----------------------------
Epoch 350:
Train loss: 45531.49609375 | Test loss: 44601.70703125
----------------------------
Epoch 400:
Train loss: 44623.20703125 | Test loss: 43383.546875
----------------------------
Epoch 450:
Train loss: 43795.13671875 | Test loss: 42378.08984375
----------------------------
Epoch 500:
Train loss: 43034.15625 |

## This results are not great, let's figure out which 5 data columns have the most corelation with SalePrice and use only them

In [160]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

In [161]:
discrete_features = X_num.dtypes == int

mi_scores = make_mi_scores(X_num, y, discrete_features=discrete_features)
mi_scores[:7]

4     0.553254
16    0.483946
26    0.374828
12    0.366776
27    0.362363
6     0.358356
13    0.308175
Name: MI Scores, dtype: float64

## Let's now use only this 7 features in a Model and see if there is any difference in performance

In [162]:
model2 = ModelV1(in_features=7)
lista = list(mi_scores[:7].index)
for i in lista:
    i -= 1
X_num_2 = X_num[lista]
X_train, X_test, y_train, y_test = get_train_test_data(X_num_2, y, test_size=0.2)
optimizer_2 = torch.optim.SGD(params=model2.parameters(), lr=0.001)

In [163]:
train_test(model=model2, loss_fn=loss_fn, optimizer=optimizer_2, epochs=2000, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

----------------------------
Epoch 0:
Train loss: 181554.265625 | Test loss: 171862.515625
----------------------------
Epoch 50:
Train loss: 39001.36328125 | Test loss: 36363.30859375
----------------------------
Epoch 100:
Train loss: 37194.98046875 | Test loss: 34653.47265625
----------------------------
Epoch 150:
Train loss: 35792.59375 | Test loss: 33223.70703125
----------------------------
Epoch 200:
Train loss: 34680.41015625 | Test loss: 32105.140625
----------------------------
Epoch 250:
Train loss: 33768.05859375 | Test loss: 31225.986328125
----------------------------
Epoch 300:
Train loss: 32989.5234375 | Test loss: 30568.240234375
----------------------------
Epoch 350:
Train loss: 32415.1171875 | Test loss: 30098.0625
----------------------------
Epoch 400:
Train loss: 32013.185546875 | Test loss: 29715.435546875
----------------------------
Epoch 450:
Train loss: 31716.068359375 | Test loss: 29400.36328125
----------------------------
Epoch 500:
Train loss: 31472.785

## Now let's try to normalize data and see whether it improves our result or not

In [164]:
X_num_3 = X_num_2
y_normalized = pd.DataFrame(torch.nn.functional.normalize(torch.tensor(y, dtype=torch.float32).unsqueeze(dim=1)))
X_train, X_test, y_train, y_test = get_train_test_data(X_num_3, y_normalized, test_size=0.2)
X_train, X_test = torch.nn.functional.normalize(X_train), torch.nn.functional.normalize(X_test)

In [165]:
model3 = ModelV1(in_features=7)
optimizer_3 = torch.optim.SGD(params=model3.parameters(), lr=0.001)

In [166]:
train_test(model=model3, loss_fn=loss_fn, optimizer=optimizer_3, epochs=2001, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

----------------------------
Epoch 0:
Train loss: 1.0369818210601807 | Test loss: 1.0436515808105469
----------------------------
Epoch 50:
Train loss: 0.9388831257820129 | Test loss: 0.9455909132957458
----------------------------
Epoch 100:
Train loss: 0.8407841920852661 | Test loss: 0.8475304245948792
----------------------------
Epoch 150:
Train loss: 0.7426854968070984 | Test loss: 0.7494698762893677
----------------------------
Epoch 200:
Train loss: 0.6445866823196411 | Test loss: 0.6514092683792114
----------------------------
Epoch 250:
Train loss: 0.5464879274368286 | Test loss: 0.5533487796783447
----------------------------
Epoch 300:
Train loss: 0.44839027523994446 | Test loss: 0.4552893340587616
----------------------------
Epoch 350:
Train loss: 0.3502930998802185 | Test loss: 0.35723039507865906
----------------------------
Epoch 400:
Train loss: 0.25219592452049255 | Test loss: 0.25917139649391174
----------------------------
Epoch 450:
Train loss: 0.15414118766784668 

## Let's try now with more complicated model

In [167]:
X_train, X_test, y_train, y_test = get_train_test_data(X_num, y, test_size=0.2)

In [168]:
class ModelV2(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=10),
            nn.Linear(in_features=10, out_features=10),
            nn.Linear(in_features=10, out_features=1)
        )
        
    def forward(self, x):
        return self.layer(x)

In [169]:
model4 = ModelV2(37)
loss_fn = nn.L1Loss()
optimizer_4 = torch.optim.Adam(model4.parameters(), lr=0.01, weight_decay=1e-5)

In [170]:
train_test(model4, loss_fn=loss_fn, optimizer=optimizer_4, epochs=2000, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

----------------------------
Epoch 0:
Train loss: 181277.515625 | Test loss: 178446.46875
----------------------------
Epoch 50:
Train loss: 49215.9453125 | Test loss: 46153.7265625
----------------------------
Epoch 100:
Train loss: 40684.32421875 | Test loss: 38366.8984375
----------------------------
Epoch 150:
Train loss: 31227.154296875 | Test loss: 31239.13671875
----------------------------
Epoch 200:
Train loss: 27144.28515625 | Test loss: 29595.455078125
----------------------------
Epoch 250:
Train loss: 26596.517578125 | Test loss: 29967.96484375
----------------------------
Epoch 300:
Train loss: 26343.33984375 | Test loss: 29995.9765625
----------------------------
Epoch 350:
Train loss: 26200.548828125 | Test loss: 30019.03125
----------------------------
Epoch 400:
Train loss: 26132.955078125 | Test loss: 29913.46875
----------------------------
Epoch 450:
Train loss: 26088.88671875 | Test loss: 29851.0
----------------------------
Epoch 500:
Train loss: 26050.5078125 | 

## Let's now try using categorical variables

In [171]:
from sklearn.preprocessing import OrdinalEncoder

label_X = X.copy()

s = (X.dtypes == 'object')
object_cols = list(s[s].index)

ordinal_encoder = OrdinalEncoder()
label_X[object_cols] = ordinal_encoder.fit_transform(X[object_cols])
label_X = label_X.drop('Id', axis=1)
label_X = pd.DataFrame(my_imputer.fit_transform(label_X))
label_X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,69,70,71,72,73,74,75,76,77,78
0,60.0,3.0,65.0,8450.0,1.0,0.450549,3.0,3.0,0.0,4.0,...,0.0,0.0,1.142857,1.427046,1.907407,0.0,2.0,2008.0,8.0,4.0
1,20.0,3.0,80.0,9600.0,1.0,0.450549,3.0,3.0,0.0,2.0,...,0.0,0.0,1.142857,1.427046,1.907407,0.0,5.0,2007.0,8.0,4.0
2,60.0,3.0,68.0,11250.0,1.0,0.450549,0.0,3.0,0.0,4.0,...,0.0,0.0,1.142857,1.427046,1.907407,0.0,9.0,2008.0,8.0,4.0
3,70.0,3.0,60.0,9550.0,1.0,0.450549,0.0,3.0,0.0,0.0,...,0.0,0.0,1.142857,1.427046,1.907407,0.0,2.0,2006.0,8.0,0.0
4,60.0,3.0,84.0,14260.0,1.0,0.450549,0.0,3.0,0.0,2.0,...,0.0,0.0,1.142857,1.427046,1.907407,0.0,12.0,2008.0,8.0,4.0


In [172]:
mi_scores_all = make_mi_scores(label_X, y, discrete_features=label_X.columns)
mi_scores_all[:10]

16    0.572001
11    0.527917
61    0.495389
45    0.433127
18    0.411872
37    0.399563
3     0.393677
60    0.369847
58    0.345411
29    0.334689
Name: MI Scores, dtype: float64

In [173]:
useful_data = list(mi_scores_all[mi_scores_all.gt(0.3)].index)
label_X = label_X[useful_data]

In [174]:
X_train, X_test, y_train, y_test = get_train_test_data(label_X, y, test_size=0.2)

In [175]:
model5 = ModelV2(len(useful_data))
optimizer_5 = torch.optim.Adam(params=model5.parameters(), lr=0.02, weight_decay=1e-4)

In [176]:
train_test(model5, loss_fn=loss_fn, optimizer=optimizer_5, epochs=2001, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

----------------------------
Epoch 0:
Train loss: 181647.609375 | Test loss: 177688.359375
----------------------------
Epoch 50:
Train loss: 53156.4296875 | Test loss: 51383.34375
----------------------------
Epoch 100:
Train loss: 36323.3828125 | Test loss: 34159.7109375
----------------------------
Epoch 150:
Train loss: 29573.373046875 | Test loss: 28330.23046875
----------------------------
Epoch 200:
Train loss: 28848.2421875 | Test loss: 28545.25
----------------------------
Epoch 250:
Train loss: 28807.2578125 | Test loss: 28413.53515625
----------------------------
Epoch 300:
Train loss: 28753.376953125 | Test loss: 28331.916015625
----------------------------
Epoch 350:
Train loss: 28706.853515625 | Test loss: 28393.091796875
----------------------------
Epoch 400:
Train loss: 28677.408203125 | Test loss: 28504.13671875
----------------------------
Epoch 450:
Train loss: 28609.24609375 | Test loss: 28316.041015625
----------------------------
Epoch 500:
Train loss: 28564.2109

## Now let's make some predictions and see what score this model gets in competition

In [213]:
X_eval = pd.read_csv("test.csv")
X_eval.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal
1458,2919,60,RL,74.0,9627,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,11,2006,WD,Normal


In [232]:
label_X_test = X_eval.copy()
label_X_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [233]:
label_X_test = label_X_test.drop("Id", axis=1)

label_X_test[object_cols] = ordinal_encoder.fit_transform(label_X_test[object_cols])
label_X_test = pd.DataFrame(my_imputer.fit_transform(label_X_test))

label_X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,69,70,71,72,73,74,75,76,77,78
0,20.0,2.0,80.0,11622.0,1.0,0.345794,3.0,3.0,0.0,4.0,...,120.0,0.0,0.333333,2.0,1.843137,0.0,6.0,2010.0,8.0,4.0
1,20.0,3.0,81.0,14267.0,1.0,0.345794,0.0,3.0,0.0,0.0,...,0.0,0.0,0.333333,1.396552,0.0,12500.0,6.0,2010.0,8.0,4.0
2,60.0,3.0,74.0,13830.0,1.0,0.345794,0.0,3.0,0.0,4.0,...,0.0,0.0,0.333333,2.0,1.843137,0.0,3.0,2010.0,8.0,4.0
3,60.0,3.0,78.0,9978.0,1.0,0.345794,0.0,3.0,0.0,4.0,...,0.0,0.0,0.333333,1.396552,1.843137,0.0,6.0,2010.0,8.0,4.0
4,120.0,3.0,43.0,5005.0,1.0,0.345794,0.0,1.0,0.0,4.0,...,144.0,0.0,0.333333,1.396552,1.843137,0.0,1.0,2010.0,8.0,4.0


In [234]:
label_X_test = torch.tensor(label_X_test[useful_data].values, dtype=torch.float32)

In [235]:
label_X_test.shape

torch.Size([1459, 12])

In [236]:
model5.eval()
y_res = model5(label_X_test)
predictions = y_res.detach().numpy()
predictions

array([[127283.76],
       [166482.7 ],
       [169461.05],
       ...,
       [159634.78],
       [110707.25],
       [212968.14]], dtype=float32)

In [237]:
import csv
def write_to_csv(predictions):
    with open("predicts.csv", mode="w") as f:
        fieldnames = [
            "Id",
            "SalePrice"
        ]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for i in range(0, len(predictions)):
            dct = {"Id":i+1461, "SalePrice":predictions[i][0]}
            writer.writerow(dct)

In [238]:
write_to_csv(predictions=predictions)

## This model has achieved score of 0.18712 