In [1]:
## Load the data

In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('./data.csv')
data.sample(frac=1)

                     date     price  bedrooms  bathrooms  sqft_living  \
1888  2014-06-04 00:00:00  371000.0       3.0       1.50         1420   
909   2014-05-19 00:00:00  850000.0       4.0       3.50         2640   
775   2014-05-15 00:00:00  344000.0       3.0       2.50         1232   
4235  2014-07-09 00:00:00  379000.0       3.0       2.50         1530   
1324  2014-05-27 00:00:00  529000.0       3.0       1.00         1210   
...                   ...       ...       ...        ...          ...   
777   2014-05-15 00:00:00  386380.0       3.0       2.50         1720   
2258  2014-06-10 00:00:00  680000.0       3.0       1.75         1760   
844   2014-05-16 00:00:00  675000.0       5.0       2.50         3200   
1479  2014-05-28 00:00:00  565000.0       7.0       4.50         4140   
3424  2014-06-26 00:00:00  310000.0       4.0       2.50         2430   

      sqft_lot  floors  waterfront  view  condition  sqft_above  \
1888      4500     1.0           0     0          3     

In [4]:
## Clean the data

In [5]:
data.columns

Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city',
       'statezip', 'country'],
      dtype='object')

In [6]:
data.drop(['sqft_living','sqft_lot','waterfront','view','condition','sqft_above','sqft_basement','street','city','statezip','country'],axis=1,inplace=True)

In [7]:
data.drop('date',axis=1,inplace=True)

In [8]:
data.head()

       price  bedrooms  bathrooms  floors  yr_built  yr_renovated
0   313000.0       3.0       1.50     1.5      1955          2005
1  2384000.0       5.0       2.50     2.0      1921             0
2   342000.0       3.0       2.00     1.0      1966             0
3   420000.0       3.0       2.25     1.0      1963             0
4   550000.0       4.0       2.50     1.0      1976          1992

In [9]:
## Feature Enginnering

In [10]:
def fe(data,col):
    print(len(data))
    max_no = data[col].quantile(0.99)
    min_no = data[col].quantile(0.05)
    data = data[data[col] > min_no]
    data = data[data[col] < max_no]
    print(len(data))
    return data

In [11]:
for col in list(data.columns):
    print(col)
    data = fe(data,'price')

In [12]:
data.head()

      price  bedrooms  bathrooms  floors  yr_built  yr_renovated
2  342000.0       3.0       2.00     1.0      1966             0
3  420000.0       3.0       2.25     1.0      1963             0
4  550000.0       4.0       2.50     1.0      1976          1992
5  490000.0       2.0       1.00     1.0      1938          1994
6  335000.0       2.0       2.00     1.0      1976             0

In [13]:
X = data.drop('price',axis=1)
y = data['price']

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

In [16]:
len(X_train),len(X_test)

(2367, 789)

In [17]:
## Modelling

In [18]:
import torch
import torch.nn as nn
import torch.optim as optim

In [19]:
import torch.nn.functional as F

In [20]:
class BaseLine_Model(nn.Module):
    def __init__(self,input_shape,output_shape):
        super().__init__()
        self.fc1 = nn.Linear(input_shape,64)
        self.fc2 = nn.Linear(64,128)
        self.fc3 = nn.Linear(128,256)
        self.fc4 = nn.Linear(256,128)
        self.fc5 = nn.Linear(128,output_shape)
    
    def forward(self,X):
        preds = self.fc1(X)
        preds = F.relu(preds)
        preds = self.fc2(preds)
        preds = F.relu(preds)
        preds = self.fc3(preds)
        preds = F.relu(preds)
        preds = self.fc4(preds)
        preds = F.relu(preds)
        preds = self.fc5(preds)
        return preds

In [21]:
import wandb

In [22]:
BATCH_SIZE = 32

In [23]:
PROJECT_NAME = 'House-Price-Pred'

In [24]:
from tqdm import tqdm

In [25]:
device = torch.device('cuda')

In [26]:
def get_loss(criterion,X,y,model):
    model.eval()
    with torch.no_grad():
        preds = model(X.float().to(device))
        preds = preds.to(device)
        y = y.to(device)
        loss = criterion(preds,y)
    model.train()
    return loss.item()
def get_accuracy(X,y,model):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for i in range(len(X)):
            pred = model(X[i].float().to(device))
            pred.to(device)
            if round(int(pred[0])) == round(int(y[i])):
                correct += 1
            total += 1
        if correct == 0:
            correct += 1
    model.train()
    return round(correct/total,3)

In [27]:
import numpy as np

In [28]:
X_train = torch.from_numpy(np.array(X_train))
y_train = torch.from_numpy(np.array(y_train))
X_test = torch.from_numpy(np.array(X_test))
y_test = torch.from_numpy(np.array(y_test))

In [29]:
get_accuracy(X_test,y_test,model)

In [30]:
EPOCHS = 112

In [31]:
# get_accuracy(X_test,y_test,model)

In [32]:
EPOCHS = 112

In [33]:
# wandb.init(project=PROJECT_NAME,name='baseline')
# for _ in tqdm(range(EPOCHS)):
#     model.to(device)
#     preds = model(X_train.float().to(device))
#     preds = preds.view(len(preds),)
#     preds.to(device)
#     loss = criterion(preds.float(),y_train.float().to(device))
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()
#     wandb.log({'loss':loss.item(),'val_loss':get_loss(criterion,X_test,y_test,model),'accuracy':get_accuracy(X_train,y_train,model),'val_accuracy':get_accuracy(X_test,y_test,model)})

In [34]:
# preds

In [35]:
# y_train

In [36]:
# torch.round(preds)

In [37]:
# torch.round(y_train)

In [38]:
import matplotlib.pyplot as plt

In [39]:
# preds[0]

In [40]:
# y_train[0]

In [41]:
# model.eval()
# with torch.no_grad():
# #     preds = model(X_test.float().to(device))

In [42]:
# for index in range(12):
#     print(preds[index][0])
#     print(y_test[index])
#     print('\n')

In [43]:
class Test_Model(nn.Module):
    def __init__(self,input_shape=5,output_shape=1,num_of_layers=2,activation=F.relu,fc1_output=64,fc2_output=128,fc3_output=256):
        super().__init__()
        self.activation = activation
        self.num_of_layers = num_of_layers
        self.fc1 = nn.Linear(input_shape,fc1_output)
        self.fc2 = nn.Linear(fc1_output,fc2_output)
        self.fc3 = nn.Linear(fc2_output,fc3_output)
        self.fc4 = nn.Linear(fc3_output,fc2_output)
        self.fc5 = nn.Linear(fc2_output,fc2_output)
        self.fc6 = nn.Linear(fc2_output,output_shape)
    
    def forward(self,X):
        preds = self.fc1(X)
        preds = self.activation(preds)
        preds = self.fc2(preds)
        preds = self.activation(preds)
        preds = self.fc3(preds)
        preds = self.activation(preds)
        preds = self.fc4(preds)
        for _ in range(self.num_of_layers):
            preds = self.fc5(preds)
            preds = self.activation(preds)
        preds = self.fc6(preds)
        return preds

In [44]:
# num_of_layers = 
# activation = nn.LeakyReLU()
# lr = 
# criterion = torch.nn.HingeEmbeddingLoss()
# optimizer = torch.optim.Adamax()

In [45]:
lrs = [0.125,0.25,0.5,0.75,1.0,0.1,0.01,0.001,0.0001]
for lr in lrs:
    model = Test_Model(activation=nn.LeakyReLU(),num_of_layers=2).to(device)
    criterion = torch.nn.HingeEmbeddingLoss()
    optimizer = torch.optim.Adamax(model.parameters(),lr=lr)
    wandb.init(project=PROJECT_NAME,name=f'lr-{lr}')
    for _ in tqdm(range(EPOCHS)):
        model.to(device)
        preds = model(X_train.float().to(device))
        preds = preds.view(len(preds),)
        preds.to(device)
        loss = criterion(preds.float(),y_train.float().to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        wandb.log({'loss':loss.item(),'val_loss':get_loss(criterion,X_test,y_test,model),'accuracy':get_accuracy(X_train,y_train,model),'val_accuracy':get_accuracy(X_test,y_test,model)})

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
loss,1.0
val_loss,1.0
accuracy,0.0
val_accuracy,0.001
_runtime,112.0
_timestamp,1621443773.0
_step,111.0


0,1
loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


In [46]:
model = Test_Model(activation=nn.LeakyReLU(),num_of_layers=2).to(device)
criterion = torch.nn.HingeEmbeddingLoss()
optimizer = torch.optim.Adamax(model.parameters(),lr=0.25)
wandb.init(project=PROJECT_NAME,name=f'big-dog-model')
for _ in tqdm(range(250)):
    model.to(device)
    preds = model(X_train.float().to(device))
    preds = preds.view(len(preds),)
    preds.to(device)
    loss = criterion(preds.float(),y_train.float().to(device))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    wandb.log({'loss':loss.item(),'val_loss':get_loss(criterion,X_test,y_test,model),'accuracy':get_accuracy(X_train,y_train,model),'val_accuracy':get_accuracy(X_test,y_test,model)})