In [1]:
import torch
import numpy as np
import torchvision
import torch.nn as nn
import pandas as pd
import torch.optim as optim
import torch.functional as F
import torch.utils.data as data
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("train_data.csv")

In [3]:
df

Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_manager,N_elevators,SubwayStation,N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,141592,2006,814,3,terraced,individual_heating,management_in_trust,111.0,184.0,5min~10min,10min~15min,3.0,0.0,Kyungbuk_uni_hospital,5,6.0,9.0
1,51327,1985,587,8,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
2,48672,1985,587,6,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
3,380530,2006,2056,8,terraced,individual_heating,management_in_trust,249.0,536.0,0~5min,0-5min,5.0,11.0,Sin-nam,5,3.0,7.0
4,78318,1992,644,2,mixed,individual_heating,self_management,142.0,79.0,5min~10min,15min~20min,4.0,8.0,Myung-duk,3,9.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4119,570796,2007,1928,24,terraced,individual_heating,management_in_trust,0.0,1270.0,0~5min,0-5min,14.0,16.0,Kyungbuk_uni_hospital,10,9.0,10.0
4120,307079,2015,644,22,terraced,individual_heating,management_in_trust,102.0,400.0,0~5min,5min~10min,5.0,10.0,Daegu,7,7.0,11.0
4121,357522,2007,868,20,terraced,individual_heating,management_in_trust,0.0,1270.0,0~5min,0-5min,14.0,16.0,Kyungbuk_uni_hospital,10,9.0,10.0
4122,312389,1978,1327,1,corridor,individual_heating,self_management,87.0,0.0,0~5min,0-5min,1.0,4.0,Kyungbuk_uni_hospital,3,7.0,11.0


In [4]:
print(len(df.columns))

17


In [5]:
categorical_columns = ["HallwayType", "SubwayStation"]

print(df["TimeToBusStop"].unique())
print(df["TimeToSubway"].unique())

['5min~10min' '0~5min' '10min~15min']
['10min~15min' '5min~10min' '0-5min' '15min~20min' 'no_bus_stop_nearby']


In [6]:
dataType = "TimeToBusStop"
df.loc[df[dataType] == '0~5min', dataType] = np.float64(2.0)
df.loc[df[dataType] == '5min~10min', dataType] = np.float64(1.0)
df.loc[df[dataType] == '10min~15min', dataType] = np.float64(0.0)
print(df[dataType].unique())

dataType = "TimeToSubway"
df.loc[df[dataType] == 'no_bus_stop_nearby', dataType] = np.float64(0.0)
df.loc[df[dataType] == '0-5min', dataType] = np.float64(4.0)
df.loc[df[dataType] == '5min~10min', dataType] = np.float64(3.0)
df.loc[df[dataType] == '10min~15min', dataType] = np.float64(2.0)
df.loc[df[dataType] == '15min~20min', dataType] = np.float64(1.0)
print(df[dataType].unique())

[1.0 2.0 0.0]
[2.0 3.0 4.0 1.0 0.0]


In [7]:
df.HeatingType = (df.HeatingType==df["HeatingType"].unique()[0]).astype(int)
df.AptManageType = (df.AptManageType==df["AptManageType"].unique()[0]).astype(int)

In [8]:
categorical_values0 = pd.get_dummies(df[categorical_columns[0]])
categorical_values0.head()
categorical_values1 = pd.get_dummies(df[categorical_columns[1]])
categorical_values1.head()

Unnamed: 0,Bangoge,Banwoldang,Chil-sung-market,Daegu,Kyungbuk_uni_hospital,Myung-duk,Sin-nam,no_subway_nearby
0,0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0


In [9]:
df.drop(columns=categorical_columns,inplace=True)
df = df.astype(float)

In [10]:
train_indices = np.random.rand(len(df))>0.3

In [11]:
df

Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_manager,N_elevators,N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,141592.0,2006.0,814.0,3.0,1.0,1.0,111.0,184.0,1.0,2.0,3.0,0.0,5.0,6.0,9.0
1,51327.0,1985.0,587.0,8.0,1.0,0.0,80.0,76.0,2.0,3.0,2.0,2.0,3.0,12.0,4.0
2,48672.0,1985.0,587.0,6.0,1.0,0.0,80.0,76.0,2.0,3.0,2.0,2.0,3.0,12.0,4.0
3,380530.0,2006.0,2056.0,8.0,1.0,1.0,249.0,536.0,2.0,4.0,5.0,11.0,5.0,3.0,7.0
4,78318.0,1992.0,644.0,2.0,1.0,0.0,142.0,79.0,1.0,1.0,4.0,8.0,3.0,9.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4119,570796.0,2007.0,1928.0,24.0,1.0,1.0,0.0,1270.0,2.0,4.0,14.0,16.0,10.0,9.0,10.0
4120,307079.0,2015.0,644.0,22.0,1.0,1.0,102.0,400.0,2.0,3.0,5.0,10.0,7.0,7.0,11.0
4121,357522.0,2007.0,868.0,20.0,1.0,1.0,0.0,1270.0,2.0,4.0,14.0,16.0,10.0,9.0,10.0
4122,312389.0,1978.0,1327.0,1.0,1.0,0.0,87.0,0.0,2.0,4.0,1.0,4.0,3.0,7.0,11.0


In [12]:
for column in df.columns:
    print(type(df[column].unique()[0]))

<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>


In [14]:
numerical_data = torch.from_numpy(df.values[train_indices,1:]).float()
categorical_data0 = torch.from_numpy(categorical_values0.values[train_indices]).float()
categorical_data1 = torch.from_numpy(categorical_values1.values[train_indices]).float()
targets = torch.from_numpy(df.values[train_indices,0]).float()

test_numerical_data = torch.from_numpy(df.values[~train_indices,1:]).float()
test_categorical_data0 = torch.from_numpy(categorical_values0.values[~train_indices]).float()
test_categorical_data1 = torch.from_numpy(categorical_values1.values[~train_indices]).float()
test_targets = torch.from_numpy(df.values[~train_indices,0]).float()

In [None]:
# # Policzenie liczby wejść do sieci
# inputs_num = len(df[1:].columns)
# print(inputs_num)

In [15]:
train_dataset = data.TensorDataset(numerical_data,categorical_data0, categorical_data1,targets)
test_dataset = data.TensorDataset(test_numerical_data,test_categorical_data0,test_categorical_data1,test_targets)

In [16]:
num_inputs = df.shape[1] + len(categorical_columns) - 1
print(num_inputs)

16


In [17]:
class AffordabilityEstim(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(AffordabilityEstim, self).__init__()
        self.lin1 =nn.Linear(num_inputs, 48)  # 28 x 28 = 784
        self.bn1 = nn.BatchNorm1d(48)
        self.act1 =nn.ReLU()
        self.lin2 =nn.Linear(48, 24)
        self.bn2 = nn.BatchNorm1d(24)
        self.act2 =nn.ReLU()
        self.lin3 =nn.Linear(24, num_outputs)


    def forward(self, x):
        x = self.lin1(x)
        x = self.bn1(x)
        x = self.act1(x)
        x = self.lin2(x)
        x = self.bn2(x)
        x = self.act2(x)
        x = self.lin3(x)
        return x

In [18]:
model = AffordabilityEstim(16, 1)
print(model)

AffordabilityEstim(
  (lin1): Linear(in_features=16, out_features=48, bias=True)
  (bn1): BatchNorm1d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU()
  (lin2): Linear(in_features=48, out_features=24, bias=True)
  (bn2): BatchNorm1d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act2): ReLU()
  (lin3): Linear(in_features=24, out_features=1, bias=True)
)


In [20]:
def classify(x):
    # CHEAP
    if x <= 100000:
        return 0
    # AVERAGE
    if x > 100000 and x <= 350000:
        return 1
    # EXPENSIVE 
    if x > 350000:
        return 2

In [22]:
loss_fn = nn.MSELoss()
# opt = optim.SGD(model.parameters(), lr=0.001, momentum=0.5)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [21]:
train_loader = data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = data.DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
def get_accuracy(model, data_loader):
    # ZMIENIĆ TO NA REGRESJĘ
    correct = 0
    total = 0
    model.eval() #*********#
    for x, cat_x0, cat_x1, labels in data_loader:
        output = model(x, cat_x0, cat_x1)
        pred = output>0
        correct += pred.eq(labels.view_as(pred)).sum().item()
        total += x.shape[0]
    return correct / total

In [None]:
epochs = 100

for epoch in range(epochs):
    for i, batch in enumerate(train_loader, 0):

        model.train()

        inputs, labels = batch

        # training steps for normal model
        opt.zero_grad()
        outputs = model(inputs).squeeze()
        loss = loss_fn(outputs, labels)
        loss.backward()
        opt.step()
