In [4]:
import torch
import numpy as np
import torchvision
import torch.nn as nn
import pandas as pd
import torch.optim as optim
import torch.functional as F
import torch.utils.data as data
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
df = pd.read_csv("train_data.csv")

In [6]:
df

Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_manager,N_elevators,SubwayStation,N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,141592,2006,814,3,terraced,individual_heating,management_in_trust,111.0,184.0,5min~10min,10min~15min,3.0,0.0,Kyungbuk_uni_hospital,5,6.0,9.0
1,51327,1985,587,8,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
2,48672,1985,587,6,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
3,380530,2006,2056,8,terraced,individual_heating,management_in_trust,249.0,536.0,0~5min,0-5min,5.0,11.0,Sin-nam,5,3.0,7.0
4,78318,1992,644,2,mixed,individual_heating,self_management,142.0,79.0,5min~10min,15min~20min,4.0,8.0,Myung-duk,3,9.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4119,570796,2007,1928,24,terraced,individual_heating,management_in_trust,0.0,1270.0,0~5min,0-5min,14.0,16.0,Kyungbuk_uni_hospital,10,9.0,10.0
4120,307079,2015,644,22,terraced,individual_heating,management_in_trust,102.0,400.0,0~5min,5min~10min,5.0,10.0,Daegu,7,7.0,11.0
4121,357522,2007,868,20,terraced,individual_heating,management_in_trust,0.0,1270.0,0~5min,0-5min,14.0,16.0,Kyungbuk_uni_hospital,10,9.0,10.0
4122,312389,1978,1327,1,corridor,individual_heating,self_management,87.0,0.0,0~5min,0-5min,1.0,4.0,Kyungbuk_uni_hospital,3,7.0,11.0


In [7]:
print(len(df.columns))

17


In [8]:
categorical_columns = ["HallwayType", "SubwayStation"]

print(df["TimeToBusStop"].unique())
print(df["TimeToSubway"].unique())

['5min~10min' '0~5min' '10min~15min']
['10min~15min' '5min~10min' '0-5min' '15min~20min' 'no_bus_stop_nearby']


In [9]:
dataType = "TimeToBusStop"
df.loc[df[dataType] == '0~5min', dataType] = np.float64(2.0)
df.loc[df[dataType] == '5min~10min', dataType] = np.float64(1.0)
df.loc[df[dataType] == '10min~15min', dataType] = np.float64(0.0)
print(df[dataType].unique())

dataType = "TimeToSubway"
df.loc[df[dataType] == 'no_bus_stop_nearby', dataType] = np.float64(0.0)
df.loc[df[dataType] == '0-5min', dataType] = np.float64(4.0)
df.loc[df[dataType] == '5min~10min', dataType] = np.float64(3.0)
df.loc[df[dataType] == '10min~15min', dataType] = np.float64(2.0)
df.loc[df[dataType] == '15min~20min', dataType] = np.float64(1.0)
print(df[dataType].unique())

[1.0 2.0 0.0]
[2.0 3.0 4.0 1.0 0.0]


In [10]:
df.HeatingType = (df.HeatingType==df["HeatingType"].unique()[0]).astype(int)
df.AptManageType = (df.AptManageType==df["AptManageType"].unique()[0]).astype(int)

In [11]:
categorical_values0 = pd.get_dummies(df[categorical_columns[0]])
categorical_values0.head()
categorical_values1 = pd.get_dummies(df[categorical_columns[1]])
categorical_values1.head()

Unnamed: 0,Bangoge,Banwoldang,Chil-sung-market,Daegu,Kyungbuk_uni_hospital,Myung-duk,Sin-nam,no_subway_nearby
0,0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0


In [12]:
df.drop(columns=categorical_columns,inplace=True)
df = df.astype(float)

In [13]:
train_indices = np.random.rand(len(df))>0.3

In [14]:
df

Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_manager,N_elevators,N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,141592.0,2006.0,814.0,3.0,1.0,1.0,111.0,184.0,1.0,2.0,3.0,0.0,5.0,6.0,9.0
1,51327.0,1985.0,587.0,8.0,1.0,0.0,80.0,76.0,2.0,3.0,2.0,2.0,3.0,12.0,4.0
2,48672.0,1985.0,587.0,6.0,1.0,0.0,80.0,76.0,2.0,3.0,2.0,2.0,3.0,12.0,4.0
3,380530.0,2006.0,2056.0,8.0,1.0,1.0,249.0,536.0,2.0,4.0,5.0,11.0,5.0,3.0,7.0
4,78318.0,1992.0,644.0,2.0,1.0,0.0,142.0,79.0,1.0,1.0,4.0,8.0,3.0,9.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4119,570796.0,2007.0,1928.0,24.0,1.0,1.0,0.0,1270.0,2.0,4.0,14.0,16.0,10.0,9.0,10.0
4120,307079.0,2015.0,644.0,22.0,1.0,1.0,102.0,400.0,2.0,3.0,5.0,10.0,7.0,7.0,11.0
4121,357522.0,2007.0,868.0,20.0,1.0,1.0,0.0,1270.0,2.0,4.0,14.0,16.0,10.0,9.0,10.0
4122,312389.0,1978.0,1327.0,1.0,1.0,0.0,87.0,0.0,2.0,4.0,1.0,4.0,3.0,7.0,11.0


In [15]:
for column in df.columns:
    print(type(df[column].unique()[0]))

<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>


In [16]:
numerical_data = torch.from_numpy(df.values[train_indices,1:]).float()
categorical_data0 = torch.from_numpy(categorical_values0.values[train_indices]).float()
categorical_data1 = torch.from_numpy(categorical_values1.values[train_indices]).float()
targets = torch.from_numpy(df.values[train_indices,0]).float()

test_numerical_data = torch.from_numpy(df.values[~train_indices,1:]).float()
test_categorical_data0 = torch.from_numpy(categorical_values0.values[~train_indices]).float()
test_categorical_data1 = torch.from_numpy(categorical_values1.values[~train_indices]).float()
test_targets = torch.from_numpy(df.values[~train_indices,0]).float()

In [17]:
# # Policzenie liczby wejść do sieci
# inputs_num = len(df[1:].columns)
# print(inputs_num)

In [18]:
train_dataset = data.TensorDataset(numerical_data,categorical_data0, categorical_data1,targets)
test_dataset = data.TensorDataset(test_numerical_data,test_categorical_data0,test_categorical_data1,test_targets)

In [19]:
num_inputs = df.shape[1] + len(categorical_columns) - 1
print(num_inputs)

16


In [44]:
class AffordabilityEstim(nn.Module): #Zmiana ReLu na Tanh nic nie daje
    def __init__(self, num_inputs, num_outputs):
        super(AffordabilityEstim, self).__init__()
        self.lin1 =nn.Linear(num_inputs, 48)  # 28 x 28 = 784
        self.bn1 = nn.BatchNorm1d(48)
        self.act1 =nn.Tanh()
        self.lin2 =nn.Linear(48, 24)
        self.bn2 = nn.BatchNorm1d(24)
        self.act2 =nn.ReLU()
        self.lin3 =nn.Linear(24, num_outputs)


    def forward(self, x):
        x = self.lin1(x)
        x = self.bn1(x)
        x = self.act1(x)
        x = self.lin2(x)
        x = self.bn2(x)
        x = self.act2(x)
        x = self.lin3(x)
        return x

In [21]:
model = AffordabilityEstim(16, 1)
print(model)

AffordabilityEstim(
  (lin1): Linear(in_features=16, out_features=48, bias=True)
  (bn1): BatchNorm1d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU()
  (lin2): Linear(in_features=48, out_features=24, bias=True)
  (bn2): BatchNorm1d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act2): ReLU()
  (lin3): Linear(in_features=24, out_features=1, bias=True)
)


In [22]:
def classify(x):
    # CHEAP
    if x <= 100000:
        return 0
    # AVERAGE
    if x > 100000 and x <= 350000:
        return 1
    # EXPENSIVE
    if x > 350000:
        return 2

In [23]:
loss_fn = nn.MSELoss()
# opt = optim.SGD(model.parameters(), lr=0.001, momentum=0.5)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [24]:
train_loader = data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = data.DataLoader(test_dataset, batch_size=64, shuffle=False)

In [25]:
def get_accuracy(model, data_loader):
    # ZMIENIĆ TO NA REGRESJĘ
    correct = 0
    total = 0
    model.eval() #*********#
    for x, cat_x0, cat_x1, labels in data_loader:
        output = model(x, cat_x0, cat_x1)
        pred = output>0
        correct += pred.eq(labels.view_as(pred)).sum().item()
        total += x.shape[0]
    return correct / total

In [26]:
# epochs = 100
# opt = optim.Adam(model.parameters(), lr=0.001)
# loss_module = nn.MSELoss()

# model = AffordabilityEstim(numerical_data.shape[1]+categorical_data0.shape[1]+categorical_data1.shape[1], 1)

# model.train()

# for epoch in range(epochs):
#     for numerical_data,categorical_data0, categorical_data1,labels in train_loader:

#         # inputs, labels = batch
#         inputs = torch.cat([numerical_data, categorical_data0, categorical_data1],dim=1)
#         # print(inputs)
#         preds = model(inputs)
#         preds = preds.squeeze(dim=1)

#         loss = loss_module(preds, labels)

#         # training steps for normal model
#         opt.zero_grad()
#         loss.backward()
#         opt.step()
# print(preds)


In [27]:
# #Próba bez categorical data - Jest tak samo źle

# epochs = 100
# opt = optim.Adam(model.parameters(), lr=0.001)
# loss_module = nn.MSELoss()

# model = AffordabilityEstim(14, 1)

# model.train()

# for epoch in range(epochs):
#     for numerical_data,categorical_data0, categorical_data1,labels in train_loader:

#         # inputs, labels = batch
#         # inputs = torch.cat([numerical_data, categorical_data0, categorical_data1],dim=1)
#         # print(inputs)
#         preds = model(numerical_data)
#         preds = preds.squeeze(dim=1)

#         loss = loss_module(preds, labels)

#         # training steps for normal model
#         opt.zero_grad()
#         loss.backward()
#         opt.step()
# print(preds)

In [45]:
#Próba z labels podzielonymi przez 100000 - wartości bliskie 0 - W wyniku wartości ujemne

epochs = 100
opt = optim.Adam(model.parameters(), lr=0.001)
loss_module = nn.MSELoss()

model = AffordabilityEstim(numerical_data.shape[1]+categorical_data0.shape[1]+categorical_data1.shape[1], 1)

model.train()

for epoch in range(epochs):
    for numerical_data,categorical_data0, categorical_data1,labels in train_loader:

        # inputs, labels = batch
        inputs = torch.cat([numerical_data, categorical_data0, categorical_data1],dim=1)
        # print(labels/100000)
        preds = model(inputs)
        preds = preds.squeeze(dim=1)

        loss = loss_module(preds, labels/100000)

        # training steps for normal model
        opt.zero_grad()
        loss.backward()
        opt.step()
print(preds)

tensor([-0.0896,  0.0465,  0.3100, -0.0843,  0.0104, -0.0313,  0.0165, -0.0974,
        -0.0992, -0.0422, -0.0658,  0.3026, -0.0149, -0.0563, -0.0910,  0.1377,
        -0.0179,  0.0422, -0.0158,  0.3475,  0.2177, -0.0548],
       grad_fn=<SqueezeBackward1>)


In [29]:
# #Próba z innym batch size - Bez większych zmian
# train_loader = data.DataLoader(train_dataset, batch_size=16, shuffle=True)

# epochs = 100
# opt = optim.Adam(model.parameters(), lr=0.001)
# loss_module = nn.MSELoss()

# model = AffordabilityEstim(numerical_data.shape[1]+categorical_data0.shape[1]+categorical_data1.shape[1], 1)

# model.train()

# for epoch in range(epochs):
#     for numerical_data,categorical_data0, categorical_data1,labels in train_loader:

#         # inputs, labels = batch
#         inputs = torch.cat([numerical_data, categorical_data0, categorical_data1],dim=1)
#         # print(inputs)
#         preds = model(inputs)
#         preds = preds.squeeze(dim=1)

#         loss = loss_module(preds, labels)

#         # training steps for normal model
#         opt.zero_grad()
#         loss.backward()
#         opt.step()
# print(preds)

In [30]:
# #Próba z większą architekturą - Też bez zmian

# class AffordabilityEstim(nn.Module):
#     def __init__(self, num_inputs, num_outputs):
#         super(AffordabilityEstim, self).__init__()
#         self.lin1 =nn.Linear(num_inputs, 48)  # 28 x 28 = 784
#         self.bn1 = nn.BatchNorm1d(48)
#         self.act1 =nn.Tanh()
#         self.lin2 =nn.Linear(48, 48)
#         self.bn2 = nn.BatchNorm1d(48)
#         self.act2 =nn.Tanh()
#         self.lin3 =nn.Linear(48, 24)
#         self.bn3 = nn.BatchNorm1d(24)
#         self.act3 =nn.ReLU()
#         self.lin4 =nn.Linear(24, num_outputs)



#     def forward(self, x):
#         x = self.lin1(x)
#         x = self.bn1(x)
#         x = self.act1(x)
#         x = self.lin2(x)
#         x = self.bn2(x)
#         x = self.act2(x)
#         x = self.lin3(x)
#         x = self.bn3(x)
#         x = self.act3(x)
#         x = self.lin4(x)
#         return x

# epochs = 100
# opt = optim.Adam(model.parameters(), lr=0.001)
# loss_module = nn.MSELoss()

# model = AffordabilityEstim(numerical_data.shape[1]+categorical_data0.shape[1]+categorical_data1.shape[1], 1)

# model.train()

# for epoch in range(epochs):
#     for numerical_data,categorical_data0, categorical_data1,labels in train_loader:

#         # inputs, labels = batch
#         inputs = torch.cat([numerical_data, categorical_data0, categorical_data1],dim=1)
#         # print(inputs)
#         preds = model(inputs)
#         preds = preds.squeeze(dim=1)

#         loss = loss_module(preds, labels)

#         # training steps for normal model
#         opt.zero_grad()
#         loss.backward()
#         opt.step()
# print(preds)

In [46]:
# #Próba z większą architekturą i podzieleniem labels - Też wychodzą ujemne wartości

# class AffordabilityEstim(nn.Module):
#     def __init__(self, num_inputs, num_outputs):
#         super(AffordabilityEstim, self).__init__()
#         self.lin1 =nn.Linear(num_inputs, 48)  # 28 x 28 = 784
#         self.bn1 = nn.BatchNorm1d(48)
#         self.act1 =nn.Tanh()
#         self.lin2 =nn.Linear(48, 48)
#         self.bn2 = nn.BatchNorm1d(48)
#         self.act2 =nn.Tanh()
#         self.lin3 =nn.Linear(48, 24)
#         self.bn3 = nn.BatchNorm1d(24)
#         self.act3 =nn.ReLU()
#         self.lin4 =nn.Linear(24, num_outputs)



#     def forward(self, x):
#         x = self.lin1(x)
#         x = self.bn1(x)
#         x = self.act1(x)
#         x = self.lin2(x)
#         x = self.bn2(x)
#         x = self.act2(x)
#         x = self.lin3(x)
#         x = self.bn3(x)
#         x = self.act3(x)
#         x = self.lin4(x)
#         return x

# epochs = 100
# opt = optim.Adam(model.parameters(), lr=0.001)
# loss_module = nn.MSELoss()

# model = AffordabilityEstim(numerical_data.shape[1]+categorical_data0.shape[1]+categorical_data1.shape[1], 1)

# model.train()

# for epoch in range(epochs):
#     for numerical_data,categorical_data0, categorical_data1,labels in train_loader:

#         # inputs, labels = batch
#         inputs = torch.cat([numerical_data, categorical_data0, categorical_data1],dim=1)
#         # print(inputs)
#         preds = model(inputs)
#         preds = preds.squeeze(dim=1)

#         loss = loss_module(preds, labels/100000)

#         # training steps for normal model
#         opt.zero_grad()
#         loss.backward()
#         opt.step()
# print(preds)

tensor([-0.4969, -0.0572, -0.0602, -0.0534, -0.0713, -0.7214, -0.3178, -0.5397,
        -0.1613, -0.1324,  0.0480, -0.6908, -0.6508, -0.1333, -0.1584, -0.1083,
        -0.2105, -0.0777, -0.1316, -0.1333, -0.5312, -0.1109],
       grad_fn=<SqueezeBackward1>)


In [36]:
def mae(y_true,y_pred):
    return np.absolute(np.subtract(y_true, y_pred)).mean()

In [49]:
model.eval() # Set model to eval mode

MaeList = []

with torch.no_grad(): # Deactivate gradients for the following code
    for numerical_data,categorical_data0, categorical_data1,labels in test_loader:

        # Determine prediction of model on dev set
        inputs = torch.cat([numerical_data, categorical_data0, categorical_data1],dim=1)
        preds = model(inputs)
        # print(preds)
        preds = preds.squeeze(dim=1)
        print(preds*100000)

        # # Keep records of predictions for the accuracy metric (true_preds=TP+TN, num_preds=TP+TN+FP+FN)
        # true_preds += (preds == data_labels).sum()
        # num_preds += data_labels.shape[0]

        MaeList.append(mae(labels, preds*100000))
print(MaeList)
MaeValue = sum(MaeList)/len(MaeList)
print(MaeValue)

tensor([ -2727.0825,  -2672.8625, -14196.0322, -75708.1016, -77086.0000,
        -40970.4297, -28830.5488,   1850.2294,   5592.7695, -14248.0801,
        -12920.7002,   2377.1870, -40910.3906, -65901.6719, -39130.8750,
        -51829.4883, -64154.2656, -40972.1211, -52435.5352, -51829.4883,
        -14340.9941, -76599.7188, -31970.5254, -41052.8633,   4301.6069,
          4332.8496, -14340.9941,   3923.4341,   4270.1870, -14629.0420,
        -32082.5781, -77079.6562, -33931.2578, -69552.2188,  -8477.6572,
          5680.5991, -14904.1797, -12965.4404, -17079.6777, -45074.3594,
        -20464.6230,  -9179.9346,   4595.8369, -12834.4912,   5416.8687,
        -40990.9375, -37461.8125,  -6624.2925, -37014.3359,   5416.8687,
        -36502.3477, -14629.0420,   2337.3835, -15089.3750, -12692.9414,
          1924.9459, -33931.2578, -77063.4688, -32100.3770, -17145.2012,
        -65041.8047,   3490.3198,   4238.6055, -14340.9941])
tensor([-14899.7100, -36683.5781, -33931.2578,  -6404.3813,    