In [1]:
import torch
import numpy as np
import torchvision
import torch.nn as nn
import pandas as pd
import torch.optim as optim
import torch.functional as F
import torch.utils.data as data
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("train_data.csv")

In [3]:
df

Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_manager,N_elevators,SubwayStation,N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,141592,2006,814,3,terraced,individual_heating,management_in_trust,111.0,184.0,5min~10min,10min~15min,3.0,0.0,Kyungbuk_uni_hospital,5,6.0,9.0
1,51327,1985,587,8,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
2,48672,1985,587,6,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
3,380530,2006,2056,8,terraced,individual_heating,management_in_trust,249.0,536.0,0~5min,0-5min,5.0,11.0,Sin-nam,5,3.0,7.0
4,78318,1992,644,2,mixed,individual_heating,self_management,142.0,79.0,5min~10min,15min~20min,4.0,8.0,Myung-duk,3,9.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4119,570796,2007,1928,24,terraced,individual_heating,management_in_trust,0.0,1270.0,0~5min,0-5min,14.0,16.0,Kyungbuk_uni_hospital,10,9.0,10.0
4120,307079,2015,644,22,terraced,individual_heating,management_in_trust,102.0,400.0,0~5min,5min~10min,5.0,10.0,Daegu,7,7.0,11.0
4121,357522,2007,868,20,terraced,individual_heating,management_in_trust,0.0,1270.0,0~5min,0-5min,14.0,16.0,Kyungbuk_uni_hospital,10,9.0,10.0
4122,312389,1978,1327,1,corridor,individual_heating,self_management,87.0,0.0,0~5min,0-5min,1.0,4.0,Kyungbuk_uni_hospital,3,7.0,11.0


In [4]:
print(len(df.columns))

17


In [5]:
categorical_columns = ["HallwayType", "SubwayStation"]

print(df["TimeToBusStop"].unique())
print(df["TimeToSubway"].unique())

['5min~10min' '0~5min' '10min~15min']
['10min~15min' '5min~10min' '0-5min' '15min~20min' 'no_bus_stop_nearby']


In [6]:
dataType = "TimeToBusStop"
df.loc[df[dataType] == '0~5min', dataType] = np.float64(2.0)
df.loc[df[dataType] == '5min~10min', dataType] = np.float64(1.0)
df.loc[df[dataType] == '10min~15min', dataType] = np.float64(0.0)
print(df[dataType].unique())

dataType = "TimeToSubway"
df.loc[df[dataType] == 'no_bus_stop_nearby', dataType] = np.float64(0.0)
df.loc[df[dataType] == '0-5min', dataType] = np.float64(4.0)
df.loc[df[dataType] == '5min~10min', dataType] = np.float64(3.0)
df.loc[df[dataType] == '10min~15min', dataType] = np.float64(2.0)
df.loc[df[dataType] == '15min~20min', dataType] = np.float64(1.0)
print(df[dataType].unique())

[1.0 2.0 0.0]
[2.0 3.0 4.0 1.0 0.0]


In [7]:
df.HeatingType = (df.HeatingType==df["HeatingType"].unique()[0]).astype(int)
df.AptManageType = (df.AptManageType==df["AptManageType"].unique()[0]).astype(int)

In [8]:
categorical_values0 = pd.get_dummies(df[categorical_columns[0]])
categorical_values0.head()
categorical_values1 = pd.get_dummies(df[categorical_columns[1]])
categorical_values1.head()

Unnamed: 0,Bangoge,Banwoldang,Chil-sung-market,Daegu,Kyungbuk_uni_hospital,Myung-duk,Sin-nam,no_subway_nearby
0,0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0


In [9]:
df.drop(columns=categorical_columns,inplace=True)
df = df.astype(float)

In [10]:
train_indices = np.random.rand(len(df))>0.3

In [11]:
df

Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_manager,N_elevators,N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,141592.0,2006.0,814.0,3.0,1.0,1.0,111.0,184.0,1.0,2.0,3.0,0.0,5.0,6.0,9.0
1,51327.0,1985.0,587.0,8.0,1.0,0.0,80.0,76.0,2.0,3.0,2.0,2.0,3.0,12.0,4.0
2,48672.0,1985.0,587.0,6.0,1.0,0.0,80.0,76.0,2.0,3.0,2.0,2.0,3.0,12.0,4.0
3,380530.0,2006.0,2056.0,8.0,1.0,1.0,249.0,536.0,2.0,4.0,5.0,11.0,5.0,3.0,7.0
4,78318.0,1992.0,644.0,2.0,1.0,0.0,142.0,79.0,1.0,1.0,4.0,8.0,3.0,9.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4119,570796.0,2007.0,1928.0,24.0,1.0,1.0,0.0,1270.0,2.0,4.0,14.0,16.0,10.0,9.0,10.0
4120,307079.0,2015.0,644.0,22.0,1.0,1.0,102.0,400.0,2.0,3.0,5.0,10.0,7.0,7.0,11.0
4121,357522.0,2007.0,868.0,20.0,1.0,1.0,0.0,1270.0,2.0,4.0,14.0,16.0,10.0,9.0,10.0
4122,312389.0,1978.0,1327.0,1.0,1.0,0.0,87.0,0.0,2.0,4.0,1.0,4.0,3.0,7.0,11.0


In [12]:
numerical_data = torch.from_numpy(df.values[train_indices,1:]).float()
print(numerical_data)
categorical_data0 = torch.from_numpy(categorical_values0.values[train_indices]).float()
categorical_data1 = torch.from_numpy(categorical_values1.values[train_indices]).float()
targets = torch.from_numpy(df.values[train_indices,0]).float()

test_numerical_data = torch.from_numpy(df.values[~train_indices,1:]).float()
test_categorical_data0 = torch.from_numpy(categorical_values0.values[~train_indices]).float()
test_categorical_data1 = torch.from_numpy(categorical_values1.values[~train_indices]).float()
test_targets = torch.from_numpy(df.values[~train_indices,0]).float()

tensor([[2006.,  814.,    3.,  ...,    5.,    6.,    9.],
        [1985.,  587.,    6.,  ...,    3.,   12.,    4.],
        [2006., 2056.,    8.,  ...,    5.,    3.,    7.],
        ...,
        [2007., 1928.,   24.,  ...,   10.,    9.,   10.],
        [2015.,  644.,   22.,  ...,    7.,    7.,   11.],
        [2007.,  868.,   13.,  ...,   10.,    9.,   10.]])


In [13]:
# # Policzenie liczby wejść do sieci
# inputs_num = len(df[1:].columns)
# print(inputs_num)

print(test_numerical_data.shape)
print(numerical_data.shape)

torch.Size([1290, 14])
torch.Size([2834, 14])


In [14]:
print(numerical_data.shape)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numerical_data = scaler.fit_transform(numerical_data)
test_numerical_data = scaler.transform(test_numerical_data)
print(numerical_data.shape)


torch.Size([2834, 14])


(2834, 14)


In [15]:
train_dataset = data.TensorDataset(torch.from_numpy(numerical_data).float(),categorical_data0, categorical_data1,targets)
test_dataset = data.TensorDataset(torch.from_numpy(test_numerical_data).float(),test_categorical_data0,test_categorical_data1,test_targets)

In [16]:
num_inputs = df.shape[1] + len(categorical_columns) - 1
print(num_inputs)

16


In [17]:
class AffordabilityEstim(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(AffordabilityEstim, self).__init__()
        self.lin1 =nn.Linear(num_inputs, 120) 
        self.bn1 = nn.BatchNorm1d(120)
        self.act1 =nn.ReLU()
        self.lin2 =nn.Linear(120, 240)
        self.bn2 = nn.BatchNorm1d(240)
        self.act2 =nn.ReLU()
        self.lin3 =nn.Linear(240, num_outputs)


    def forward(self, x):
        x = self.lin1(x)
        x = self.bn1(x)
        x = self.act1(x)
        x = self.lin2(x)
        # x = self.bn2(x)
        x = self.act2(x)
        x = self.lin3(x)
        # x = 1000000 * x
        # x = abs(x)
        return x

In [18]:
def classify(x):
    # CHEAP
    if x <= 100000:
        return 0
    # AVERAGE
    if x > 100000 and x <= 350000:
        return 1
    # EXPENSIVE 
    if x > 350000:
        return 2

In [19]:
train_loader = data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = data.DataLoader(test_dataset, batch_size=64, shuffle=False)

In [20]:
def get_accuracy(model, data_loader):
    # ZMIENIĆ TO NA REGRESJĘ
    correct = 0
    total = 0
    model.eval() #*********#
    for numerical_data,categorical_data0, categorical_data1,labels in data_loader:
        inputs = torch.cat([numerical_data,categorical_data0, categorical_data1],dim=1)
        pred = model(inputs)
        for i in range(len(pred)):
            print(classify(pred[i]), classify(labels[i]))
            if classify(pred[i]) == classify(labels[i]):
                correct = correct + 1
                total = total + 1
            else:
                total = total + 1
    return correct / total

In [28]:
model = AffordabilityEstim(numerical_data.shape[1]+categorical_data0.shape[1]+categorical_data1.shape[1], 1)

epochs = 1000
opt = optim.Adam(model.parameters(), lr=10)
loss_module = nn.MSELoss()

model.train()

for epoch in range(epochs):
    for numerical_data,categorical_data0, categorical_data1,labels in train_loader:

        # inputs, labels = batch
        inputs = torch.cat([numerical_data, categorical_data0, categorical_data1],dim=1)
        preds = model(inputs)
        preds = preds.squeeze(dim=1)
        loss = loss_module(preds, labels)

        # training steps for normal model
        opt.zero_grad()
        loss.backward()
        opt.step()


In [22]:
def mae(y_true,y_pred):
    return np.absolute(np.subtract(y_true, y_pred)).mean()

In [29]:
model.eval() # Set model to eval mode

MaeList = []

with torch.no_grad(): # Deactivate gradients for the following code
    for numerical_data,categorical_data0, categorical_data1,labels in test_loader:

        # Determine prediction of model on dev set
        inputs = torch.cat([numerical_data, categorical_data0, categorical_data1],dim=1)
        # print(inputs[0])
        preds = model(inputs)
        # print(preds)
        preds = preds.squeeze(dim=1)
        print(preds)
        # print(preds)

        # # Keep records of predictions for the accuracy metric (true_preds=TP+TN, num_preds=TP+TN+FP+FN)
        # true_preds += (preds == data_labels).sum()
        # num_preds += data_labels.shape[0]

        MaeList.append(mae(labels, preds))
MaeValue = sum(MaeList)/len(MaeList)
print(MaeValue)

tensor([ 89200.0781, 114471.7812, 116199.3750, 230585.4375, 101980.6562,
        230585.4375, 230585.4375,  88728.9062, 333542.4375,  61334.0547,
        101509.5000, 176002.1562, 115257.0469, 125279.5469, 326540.2188,
        301451.7812, 331165.1875, 310880.7188, 174745.7188,  59920.5664,
        126378.9219, 102922.9531, 236224.0000, 301328.1875, 177101.5156,
        275215.1875, 125750.7031, 230585.4375, 157460.8281, 105760.9375,
         74695.3906, 270086.4688, 126693.0312, 230585.4375, 152736.6875,
        242837.2188, 343031.2500, 217306.4688, 193410.1250, 179461.0625,
         89357.1250, 297312.2812, 323020.1562, 349178.2812, 342241.5000,
         66986.6641, 230585.4375, 168604.8750, 177886.7812, 230585.4375,
        362974.5625, 330024.7188, 324754.3438, 313482.0312, 328222.7500,
        295382.0938, 321285.9375,  67614.8750, 254478.6875, 276082.2812,
        126064.8125, 252550.2500, 123866.0781, 201498.7188])
tensor([ 61334.0547, 230585.4375, 106232.0938,  57906.6641,  57

In [30]:
print(get_accuracy(model, test_loader))

0 0
1 0
1 0
1 1
1 0
1 1
1 1
0 0
1 1
0 0
1 0
1 1
1 0
1 0
1 1
1 1
1 2
1 2
1 1
0 0
1 0
1 0
1 1
1 1
1 1
1 1
1 0
1 1
1 1
1 0
0 0
1 1
1 0
1 1
1 1
1 1
1 2
1 1
1 1
1 1
0 0
1 1
1 2
1 2
1 2
0 0
1 1
1 1
1 1
1 1
2 2
1 2
1 2
1 2
1 2
1 1
1 2
0 0
1 1
1 1
1 0
1 1
1 0
1 1
0 0
1 1
1 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 2
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 0
1 0
1 1
1 0
1 1
0 0
1 0
1 1
1 0
1 0
1 1
1 0
1 1
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
2 2
1 0
1 1
1 1
1 0
1 2
1 0
1 1
2 2
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 0
1 1
1 1
1 1
1 1
1 1
2 2
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 0
1 1
0 0
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 2
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 0
1 1
1 0
1 1
1 1
1 2
1 1
1 1
1 1
1 1
1 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 0
1 1
1 1
2 2
1 1
1 1
1 0
1 0
1 1
0 0
1 1
2 2
1 1
0 0
1 0
0 0
1 0
1 1
1 1


In [31]:
dft = pd.read_csv("test_data.csv")
print(len(dft))
categorical_columns = ["HallwayType", "SubwayStation"]
dataType = "TimeToBusStop"
dft.loc[dft[dataType] == '0~5min', dataType] = np.float64(2.0)
dft.loc[dft[dataType] == '5min~10min', dataType] = np.float64(1.0)
dft.loc[dft[dataType] == '10min~15min', dataType] = np.float64(0.0)
print(dft[dataType].unique())

dataType = "TimeToSubway"
dft.loc[dft[dataType] == 'no_bus_stop_nearby', dataType] = np.float64(0.0)
dft.loc[dft[dataType] == '0-5min', dataType] = np.float64(4.0)
dft.loc[dft[dataType] == '5min~10min', dataType] = np.float64(3.0)
dft.loc[dft[dataType] == '10min~15min', dataType] = np.float64(2.0)
dft.loc[dft[dataType] == '15min~20min', dataType] = np.float64(1.0)
print(dft[dataType].unique())

dft.HeatingType = (dft.HeatingType==dft["HeatingType"].unique()[0]).astype(int)
dft.AptManageType = (dft.AptManageType==dft["AptManageType"].unique()[0]).astype(int)

categorical_values0t = pd.get_dummies(dft[categorical_columns[0]])
categorical_values1t = pd.get_dummies(dft[categorical_columns[1]])

dft.drop(columns=categorical_columns,inplace=True)

dft = dft.astype(float)

numerical_datat = torch.from_numpy(dft.values).float()
scaler = StandardScaler()
numerical_datat = scaler.fit_transform(numerical_datat)
categorical_data0t = torch.from_numpy(categorical_values0t.values).float()
categorical_data1t = torch.from_numpy(categorical_values1t.values).float()
validate_dataset = data.TensorDataset(torch.from_numpy(numerical_datat).float(),categorical_data0t,categorical_data1t)
validate_loader = data.DataLoader(validate_dataset, batch_size=64, shuffle=False)


1767
[2.0 1.0 0.0]
[1.0 4.0 3.0 2.0 0.0]


In [32]:
all_preds = []

model.eval() # Set model to eval mode

with torch.no_grad(): # Deactivate gradients for the following code
    for numerical_data,categorical_data0, categorical_data1 in validate_loader:

        # Determine prediction of model on dev set
        inputs = torch.cat([numerical_data, categorical_data0, categorical_data1],dim=1)
        # print(inputs)
        # print(inputs[0])
        preds = model(inputs)
        # print(preds)
        preds = preds.squeeze(dim=1)
        print(preds)
        for pred in preds:
            # print(pred)
            all_preds.append(classify(pred))
        # print(preds)

        # # Keep records of predictions for the accuracy metric (true_preds=TP+TN, num_preds=TP+TN+FP+FN)
        # true_preds += (preds == data_labels).sum()
        # num_preds += data_labels.shape[0]

# all_preds = np.concatenate(all_preds, axis=0)
print(all_preds)
np.savetxt("predykcje.csv", all_preds, delimiter=",")

tensor([180438.0156, 302277.2812, 341071.1250, 173808.9688, 220664.8125,
        326530.6875, 225410.9844,  63139.1641, 204960.7656, 233004.6719,
        138048.3125, 182504.0625, 176500.0469, 181232.6406, 258750.0469,
        308125.8438, 211256.7500, 230585.4375, 256058.6875, 230585.4375,
        172691.3125, 139001.8594, 147588.6719, 173808.9688, 337060.0312,
        211771.7656, 116965.9531, 246149.1562, 178089.3281, 331799.0625,
        177338.6875, 202148.7344, 234998.0312, 160673.5781, 171425.0781,
        129828.9688, 255637.9844, 179484.4688, 104751.9219, 260934.4062,
        230585.4375, 109619.0781, 234998.0312, 230585.4375, 230585.4375,
        236729.6875, 230585.4375, 224457.4375, 211415.6719, 230585.4375,
        255583.2031, 357609.4062, 176182.2031, 233211.1562, 201830.8906,
        140591.1250, 230585.4375, 203689.3750, 230585.4375, 128557.5625,
        176182.2031, 139160.7812, 120334.0938, 161817.3750])
tensor([202784.4531, 337941.1250, 106182.2656,  92834.0469, 178