In [1]:
import torch
import numpy as np
import torchvision
import torch.nn as nn
import pandas as pd
import torch.optim as optim
import torch.functional as F
import torch.utils.data as data
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("train_data.csv")

In [3]:
df

Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_manager,N_elevators,SubwayStation,N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,141592,2006,814,3,terraced,individual_heating,management_in_trust,111.0,184.0,5min~10min,10min~15min,3.0,0.0,Kyungbuk_uni_hospital,5,6.0,9.0
1,51327,1985,587,8,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
2,48672,1985,587,6,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
3,380530,2006,2056,8,terraced,individual_heating,management_in_trust,249.0,536.0,0~5min,0-5min,5.0,11.0,Sin-nam,5,3.0,7.0
4,78318,1992,644,2,mixed,individual_heating,self_management,142.0,79.0,5min~10min,15min~20min,4.0,8.0,Myung-duk,3,9.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4119,570796,2007,1928,24,terraced,individual_heating,management_in_trust,0.0,1270.0,0~5min,0-5min,14.0,16.0,Kyungbuk_uni_hospital,10,9.0,10.0
4120,307079,2015,644,22,terraced,individual_heating,management_in_trust,102.0,400.0,0~5min,5min~10min,5.0,10.0,Daegu,7,7.0,11.0
4121,357522,2007,868,20,terraced,individual_heating,management_in_trust,0.0,1270.0,0~5min,0-5min,14.0,16.0,Kyungbuk_uni_hospital,10,9.0,10.0
4122,312389,1978,1327,1,corridor,individual_heating,self_management,87.0,0.0,0~5min,0-5min,1.0,4.0,Kyungbuk_uni_hospital,3,7.0,11.0


In [4]:
print(len(df.columns))

17


In [5]:
categorical_columns = ["HallwayType", "SubwayStation"]

print(df["TimeToBusStop"].unique())
print(df["TimeToSubway"].unique())

['5min~10min' '0~5min' '10min~15min']
['10min~15min' '5min~10min' '0-5min' '15min~20min' 'no_bus_stop_nearby']


In [6]:
dataType = "TimeToBusStop"
df.loc[df[dataType] == '0~5min', dataType] = np.float64(2.0)
df.loc[df[dataType] == '5min~10min', dataType] = np.float64(1.0)
df.loc[df[dataType] == '10min~15min', dataType] = np.float64(0.0)
print(df[dataType].unique())

dataType = "TimeToSubway"
df.loc[df[dataType] == 'no_bus_stop_nearby', dataType] = np.float64(0.0)
df.loc[df[dataType] == '0-5min', dataType] = np.float64(4.0)
df.loc[df[dataType] == '5min~10min', dataType] = np.float64(3.0)
df.loc[df[dataType] == '10min~15min', dataType] = np.float64(2.0)
df.loc[df[dataType] == '15min~20min', dataType] = np.float64(1.0)
print(df[dataType].unique())

[1.0 2.0 0.0]
[2.0 3.0 4.0 1.0 0.0]


In [7]:
df.HeatingType = (df.HeatingType==df["HeatingType"].unique()[0]).astype(int)
df.AptManageType = (df.AptManageType==df["AptManageType"].unique()[0]).astype(int)

In [8]:
categorical_values0 = pd.get_dummies(df[categorical_columns[0]])
categorical_values0.head()
categorical_values1 = pd.get_dummies(df[categorical_columns[1]])
categorical_values1.head()

Unnamed: 0,Bangoge,Banwoldang,Chil-sung-market,Daegu,Kyungbuk_uni_hospital,Myung-duk,Sin-nam,no_subway_nearby
0,0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0


In [9]:
df.drop(columns=categorical_columns,inplace=True)
df = df.astype(float)

In [10]:
train_indices = np.random.rand(len(df))>0.3

In [11]:
df

Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_manager,N_elevators,N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,141592.0,2006.0,814.0,3.0,1.0,1.0,111.0,184.0,1.0,2.0,3.0,0.0,5.0,6.0,9.0
1,51327.0,1985.0,587.0,8.0,1.0,0.0,80.0,76.0,2.0,3.0,2.0,2.0,3.0,12.0,4.0
2,48672.0,1985.0,587.0,6.0,1.0,0.0,80.0,76.0,2.0,3.0,2.0,2.0,3.0,12.0,4.0
3,380530.0,2006.0,2056.0,8.0,1.0,1.0,249.0,536.0,2.0,4.0,5.0,11.0,5.0,3.0,7.0
4,78318.0,1992.0,644.0,2.0,1.0,0.0,142.0,79.0,1.0,1.0,4.0,8.0,3.0,9.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4119,570796.0,2007.0,1928.0,24.0,1.0,1.0,0.0,1270.0,2.0,4.0,14.0,16.0,10.0,9.0,10.0
4120,307079.0,2015.0,644.0,22.0,1.0,1.0,102.0,400.0,2.0,3.0,5.0,10.0,7.0,7.0,11.0
4121,357522.0,2007.0,868.0,20.0,1.0,1.0,0.0,1270.0,2.0,4.0,14.0,16.0,10.0,9.0,10.0
4122,312389.0,1978.0,1327.0,1.0,1.0,0.0,87.0,0.0,2.0,4.0,1.0,4.0,3.0,7.0,11.0


In [12]:
for column in df.columns:
    print(type(df[column].unique()[0]))

<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>


In [13]:
numerical_data = torch.from_numpy(df.values[train_indices,1:]).float()
categorical_data0 = torch.from_numpy(categorical_values0.values[train_indices]).float()
categorical_data1 = torch.from_numpy(categorical_values1.values[train_indices]).float()
targets = torch.from_numpy(df.values[train_indices,0]).float()

test_numerical_data = torch.from_numpy(df.values[~train_indices,1:]).float()
test_categorical_data0 = torch.from_numpy(categorical_values0.values[~train_indices]).float()
test_categorical_data1 = torch.from_numpy(categorical_values1.values[~train_indices]).float()
test_targets = torch.from_numpy(df.values[~train_indices,0]).float()

In [14]:
# # Policzenie liczby wejść do sieci
# inputs_num = len(df[1:].columns)
# print(inputs_num)

print(test_numerical_data.shape)
print(numerical_data.shape)

torch.Size([1286, 14])
torch.Size([2838, 14])


In [15]:
print(numerical_data.shape)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numerical_data = scaler.fit_transform(numerical_data)
test_numerical_data = scaler.transform(test_numerical_data)
print(numerical_data.shape)


torch.Size([2838, 14])


(2838, 14)


In [16]:
train_dataset = data.TensorDataset(torch.from_numpy(numerical_data).float(),categorical_data0, categorical_data1,targets)
test_dataset = data.TensorDataset(torch.from_numpy(test_numerical_data).float(),test_categorical_data0,test_categorical_data1,test_targets)

In [17]:
num_inputs = df.shape[1] + len(categorical_columns) - 1
print(num_inputs)

16


In [34]:
class AffordabilityEstim(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(AffordabilityEstim, self).__init__()
        self.lin1 =nn.Linear(num_inputs, 120) 
        self.bn1 = nn.BatchNorm1d(120)
        self.act1 =nn.ReLU()
        self.lin2 =nn.Linear(120, 240)
        self.bn2 = nn.BatchNorm1d(240)
        self.act2 =nn.ReLU()
        self.lin3 =nn.Linear(240, num_outputs)


    def forward(self, x):
        x = self.lin1(x)
        x = self.bn1(x)
        x = self.act1(x)
        x = self.lin2(x)
        # x = self.bn2(x)
        x = self.act2(x)
        x = self.lin3(x)
        # x = 1000000 * x
        # x = abs(x)
        return x

In [20]:
def classify(x):
    # CHEAP
    if x <= 100000:
        return 0
    # AVERAGE
    if x > 100000 and x <= 350000:
        return 1
    # EXPENSIVE 
    if x > 350000:
        return 2

In [21]:
train_loader = data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = data.DataLoader(test_dataset, batch_size=64, shuffle=False)

In [53]:
def get_accuracy(model, data_loader):
    # ZMIENIĆ TO NA REGRESJĘ
    correct = 0
    total = 0
    model.eval() #*********#
    for numerical_data,categorical_data0, categorical_data1,labels in data_loader:
        inputs = torch.cat([numerical_data,categorical_data0, categorical_data1],dim=1)
        pred = model(inputs)
        for i in range(len(pred)):
            print(classify(pred[i]), classify(labels[i]))
            if classify(pred[i]) == classify(labels[i]):
                correct = correct + 1
                total = total + 1
            else:
                total = total + 1
    return correct / total

In [35]:
model = AffordabilityEstim(numerical_data.shape[1]+categorical_data0.shape[1]+categorical_data1.shape[1], 1)

epochs = 1000
opt = optim.Adam(model.parameters(), lr=10)
loss_module = nn.MSELoss()

model.train()

for epoch in range(epochs):
    for numerical_data,categorical_data0, categorical_data1,labels in train_loader:

        # inputs, labels = batch
        inputs = torch.cat([numerical_data, categorical_data0, categorical_data1],dim=1)
        preds = model(inputs)
        preds = preds.squeeze(dim=1)
        loss = loss_module(preds, labels)

        # training steps for normal model
        opt.zero_grad()
        loss.backward()
        opt.step()


In [24]:
def mae(y_true,y_pred):
    return np.absolute(np.subtract(y_true, y_pred)).mean()

In [37]:
model.eval() # Set model to eval mode

MaeList = []

with torch.no_grad(): # Deactivate gradients for the following code
    for numerical_data,categorical_data0, categorical_data1,labels in test_loader:

        # Determine prediction of model on dev set
        inputs = torch.cat([numerical_data, categorical_data0, categorical_data1],dim=1)
        preds = model(inputs)
        # print(preds)
        preds = preds.squeeze(dim=1)
        print(preds)
        # print(preds)

        # # Keep records of predictions for the accuracy metric (true_preds=TP+TN, num_preds=TP+TN+FP+FN)
        # true_preds += (preds == data_labels).sum()
        # num_preds += data_labels.shape[0]

        MaeList.append(mae(labels, preds))
MaeValue = sum(MaeList)/len(MaeList)
print(MaeValue)

tensor([202277.3281, 348053.8750,  86665.9531, 207282.2500,  55092.8281,
        207282.2500, 238214.5781, 369145.1250, 233032.8750, 159714.4062,
        104550.5000, 151620.6250, 226814.8125, 326764.4062, 325003.5938,
        321894.5312, 223481.4062,  19451.4980, 184904.8438, 241323.6094,
        255346.1562, 256382.4844, 164293.2656, 216227.0000, 100625.7188,
        244432.6406, 304749.8125, 109129.3594, 148350.0000, 244432.6406,
        335157.3750, 137286.6719, 233494.0312, 102588.1094,  71206.0156,
        162985.0469, 248091.7500, 235105.5469, 222669.4375, 349454.6250,
        198666.2344,  57293.8242, 207282.2500, 155999.3906, 308895.1875,
        361416.1875, 334330.6562, 199095.1406,  26646.8516, 100625.7188,
        207282.2500, 167563.8906, 223481.4062, 207282.2500, 188829.6094,
        333909.4375,  75008.7031, 340548.6875, 262933.5312, 105204.6094,
        196478.6562,  25338.6016, 281587.7188, 207282.2500])
tensor([207282.2500, 200628.6094,  55092.8281, 319821.8438, 288

In [54]:
print(get_accuracy(model, test_loader))

1 1
1 2
0 0
1 1
0 0
1 1
1 1
2 2
1 1
1 1
1 0
1 1
1 1
1 2
1 2
1 2
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
1 0
1 1
1 1
1 2
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 2
1 1
0 0
1 1
1 1
1 1
2 2
1 2
1 1
0 0
1 0
1 1
1 1
1 1
1 1
1 1
1 2
0 0
1 2
1 1
1 0
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 2
1 1
1 1
1 1
1 1
1 0
0 0
1 1
1 2
1 1
1 1
0 0
1 1
1 1
1 2
1 1
1 1
2 2
1 1
1 1
0 0
1 0
1 1
1 1
1 2
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
0 0
1 1
0 0
1 1
1 1
0 0
0 0
1 1
2 2
1 0
1 1
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 2
2 2
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
1 0
0 0
1 0
1 1
1 1
1 1
0 0
1 2
2 2
1 1
1 1
0 0
0 0
1 2
1 2
0 0
1 1
0 0
1 0
1 1
1 1
1 1
1 1


In [None]:
dft = pd.read_csv("train_data.csv")
categorical_columns = ["HallwayType", "SubwayStation"]
dataType = "TimeToBusStop"
dft.loc[dft[dataType] == '0~5min', dataType] = np.float64(2.0)
dft.loc[dft[dataType] == '5min~10min', dataType] = np.float64(1.0)
dft.loc[dft[dataType] == '10min~15min', dataType] = np.float64(0.0)
print(dft[dataType].unique())

dataType = "TimeToSubway"
dft.loc[dft[dataType] == 'no_bus_stop_nearby', dataType] = np.float64(0.0)
dft.loc[dft[dataType] == '0-5min', dataType] = np.float64(4.0)
dft.loc[dft[dataType] == '5min~10min', dataType] = np.float64(3.0)
dft.loc[dft[dataType] == '10min~15min', dataType] = np.float64(2.0)
dft.loc[dft[dataType] == '15min~20min', dataType] = np.float64(1.0)
print(dft[dataType].unique())

dft.HeatingType = (dft.HeatingType==dft["HeatingType"].unique()[0]).astype(int)
dft.AptManageType = (dft.AptManageType==dft["AptManageType"].unique()[0]).astype(int)

In [None]:
all_preds = []
model.cpu()
with torch.no_grad(): # Deactivate gradients for the following code
    for data_inputs in x_test_data:

        # Determine prediction of model on dev set
        # print(data_inputs)
        preds = model(data_inputs[0])
        preds = preds.cpu().numpy()
        all_preds.append(preds)
all_preds = np.concatenate(all_preds, axis=0)
np.savetxt("predykcje.csv", all_preds, delimiter=",")