In [1]:
import torch,torchvision
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.nn import *
np.random.seed(42)

In [2]:
data = pd.read_csv('./train.csv')

In [3]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
import random

In [5]:
object_cols = []
int_cols = []
for nan in zip(data.isna().sum(),data.columns,data.dtypes):
    if nan[0] > 0:
        print(nan)
        if nan[2] == 'float64':
            data[nan[1]].fillna(data[nan[1]].median(),inplace=True)
        else:
            data[nan[1]].fillna(random.choice(data[nan[1]].dropna().tolist()),inplace=True)
    if nan[2] == 'float64' or nan[2] == 'int64':
        int_cols.append(nan[1])
    else:
        object_cols.append(nan[1])

(259, 'LotFrontage', dtype('float64'))
(1369, 'Alley', dtype('O'))
(8, 'MasVnrType', dtype('O'))
(8, 'MasVnrArea', dtype('float64'))
(37, 'BsmtQual', dtype('O'))
(37, 'BsmtCond', dtype('O'))
(38, 'BsmtExposure', dtype('O'))
(37, 'BsmtFinType1', dtype('O'))
(38, 'BsmtFinType2', dtype('O'))
(1, 'Electrical', dtype('O'))
(690, 'FireplaceQu', dtype('O'))
(81, 'GarageType', dtype('O'))
(81, 'GarageYrBlt', dtype('float64'))
(81, 'GarageFinish', dtype('O'))
(81, 'GarageQual', dtype('O'))
(81, 'GarageCond', dtype('O'))
(1453, 'PoolQC', dtype('O'))
(1179, 'Fence', dtype('O'))
(1406, 'MiscFeature', dtype('O'))


In [6]:
from sklearn.compose import make_column_transformer

In [7]:
from sklearn.model_selection import *
from sklearn.metrics import *

In [8]:
X = data.drop('SalePrice',axis=1)
y = data['SalePrice']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.125)

In [9]:
from sklearn.preprocessing import *
mct = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),object_cols),
    remainder='passthrough'
)
mct.fit(X_train)
X_train = torch.from_numpy(np.array(mct.transform(X_train).toarray()))
X_test = torch.from_numpy(np.array(mct.transform(X_test).toarray()))

In [10]:
y_train = torch.from_numpy(np.array(y_train))
y_test = torch.from_numpy(np.array(y_test))

In [11]:
X_train.shape

torch.Size([1277, 288])

In [12]:
class BaseLine_Model(Module):
    def __init__(self,starter=2048,activation=ReLU(),iter_of_linear6=1):
        super().__init__()
        self.iter_of_linear6 = iter_of_linear6
        self.activation = activation
        self.linear1 = Linear(288,starter)
        self.linear1batchnorm = BatchNorm1d(starter)
        self.linear2 = Linear(starter,starter*2)
        self.linear2batchnorm = BatchNorm1d(starter*2)
        self.linear3 = Linear(starter*2,starter*2*2)
        self.linear3batchnorm = BatchNorm1d(starter*2*2)
        self.linear4 = Linear(starter*2*2,starter*2*2*2)
        self.linear4batchnorm = BatchNorm1d(starter*2*2*2)
        self.linear5 = Linear(starter*2*2*2,starter*2*2)
        self.linear5batchnorm = BatchNorm1d(starter*2*2)
        self.linear6 = Linear(starter*2*2,starter*2*2)
        self.linear6batchnorm = BatchNorm1d(starter*2*2)
        self.output = Linear(starter*2*2,1)
    
    def forward(self,X):
        preds = self.activation(self.linear1batchnorm(self.linear1(X)))
        preds = self.activation(self.linear2batchnorm(self.linear2(preds)))
        preds = self.activation(self.linear3batchnorm(self.linear3(preds)))
        preds = self.activation(self.linear4batchnorm(self.linear4(preds)))
        preds = self.activation(self.linear5batchnorm(self.linear5(preds)))
        for _ in range(self.iter_of_linear6):
            preds = self.activation(self.linear6batchnorm(self.linear6(preds)))
        preds = self.output(preds)
        return preds

In [13]:
device = 'cuda'
PROJECT_NAME = 'House-Prices-Advanced-Regression-Techniques-V4'

In [14]:
model = BaseLine_Model().to(device)
criterion = MSELoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)

In [15]:
batch_size = 32

In [16]:
epochs = 100

In [17]:
from tqdm import tqdm

In [18]:
def get_loss(model,X,y,criterion):
    model.to('cpu')
    preds = model(X.float())
    loss = criterion(preds,y)
    return loss.item()
def make_submission(model,name):
    data = pd.read_csv('./test.csv')
    ids = data['Id']
    for nan in zip(data.isna().sum(),data.columns,data.dtypes):
        if nan[0] > 0:
            if nan[2] == 'float64' or nan[2] == 'int64':
                data[nan[1]].fillna(data[nan[1]].median(),inplace=True)
            else:
                data[nan[1]].fillna(random.choice(data[nan[1]].dropna().tolist()),inplace=True)
    data.dropna(inplace=True)
    data = torch.from_numpy(np.array(mct.transform(data).toarray()))
    data = data.float().to(device).view(-1,288)
    model.to(device)
    model.train()
    preds = model(data)
    preds = torch.squeeze(preds.to('cpu')).detach().numpy()
    df = pd.DataFrame({'Id':ids,'SalePrice':preds})
    df.to_csv(f'./submissions/submisssion-{name}.csv',index=False)
    return df

In [19]:
import wandb

In [20]:
X_train.shape

torch.Size([1277, 288])

In [21]:
model = BaseLine_Model().to(device)
criterion = MSELoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
batch_size = 32

In [22]:
from torch.optim import *

In [27]:
# 
# starters = [128,256,512,1024,2048,2048*2,2048*2*2]
#
# iter_of_linear6s = [1,2,5,7,10,12,25,50,75,100]
#
# activations = [ELU,LeakyReLU,PReLU,ReLU,ReLU6,RReLU,SELU,CELU,GELU,Tanh,SiLU]
#
# batch_sizes = [2,3,4,5]
#
# optimizers = [Adam,AdamW,Adamax,Adadelta,Adagrad,ASGD,LBFGS,RMSprop,Rprop,SGD]
#
# lrs = [0.1,0.01,0.001,0.0001,0.00001]
# 
criterions = [MSELoss(),L1Loss()]

In [None]:
for criterion in criterions:
    torch.cuda.empty_cache()
    model = BaseLine_Model(starter=1024,iter_of_linear6=1,activation=PReLU()).to(device)
    criterion = criterion
    optimizer = Adam(model.parameters(),lr=lr)
    batch_size = 16
    name = f'{criterion}-criterion'
    wandb.init(project=PROJECT_NAME,name=name)
    for _ in tqdm(range(epochs)):
        torch.cuda.empty_cache()
        for idx in range(0,len(X_train),batch_size):
            torch.cuda.empty_cache()
            X_batch = X_train[idx:idx+batch_size].float().view(-1,288).to(device)
            y_batch = y_train[idx:idx+batch_size].float().to(device)
            model.to(device)
            preds = model(X_batch)
            loss = criterion(preds,y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        wandb.log({'Loss':loss.item()})
        wandb.log({'Val Loss':get_loss(model,X_test,y_test,criterion)})
    wandb.finish()
    make_submission(model,name)

VBox(children=(Label(value=' 0.15MB of 0.15MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
 17%|█▋        | 17/100 [01:14<06:05,  4.41s/it]