In [335]:
import pandas as pd
import torch
import numpy as np
from torch import nn, optim
import torch.utils.data as data
import random

In [336]:
device = torch.device('cuda')

In [337]:
seed = 8888
torch.cuda.manual_seed_all(seed)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [338]:
df = pd.read_csv('train_data.csv')
df['SalePrice'] = df['SalePrice'].apply(lambda b: 1 if b > 300000 else 0)

# Undersampling
nmin = df['SalePrice'].value_counts().min()
df = df.groupby('SalePrice').apply(lambda x: x.sample(nmin)).reset_index(drop=True)

test_df = pd.read_csv('test_data.csv')

In [339]:
df

Unnamed: 0,SalePrice,YearBuilt,YrSold,MonthSold,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_APT,N_manager,N_elevators,SubwayStation,N_FacilitiesNearBy(PublicOffice),N_FacilitiesNearBy(Hospital),N_FacilitiesNearBy(Dpartmentstore),N_FacilitiesNearBy(Mall),N_FacilitiesNearBy(ETC),N_FacilitiesNearBy(Park),N_SchoolNearBy(Elementary),N_SchoolNearBy(Middle),N_SchoolNearBy(High),N_SchoolNearBy(University),N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,0,1993,2011,8,644,3,mixed,individual_heating,management_in_trust,523.0,536.0,0~5min,15min~20min,8.0,8.0,20.0,Myung-duk,6.0,2,0.0,1.0,5.0,0.0,4.0,3.0,5.0,5.0,4,14.0,17.0
1,0,2006,2009,6,636,1,terraced,individual_heating,management_in_trust,111.0,184.0,5min~10min,10min~15min,3.0,3.0,0.0,Kyungbuk_uni_hospital,2.0,1,1.0,1.0,1.0,0.0,3.0,2.0,2.0,2.0,5,6.0,9.0
2,0,2006,2012,9,903,5,terraced,individual_heating,management_in_trust,123.0,181.0,5min~10min,0-5min,3.0,3.0,11.0,Myung-duk,3.0,1,1.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,4,8.0,11.0
3,0,2006,2013,2,1149,25,terraced,individual_heating,management_in_trust,249.0,536.0,0~5min,0-5min,6.0,5.0,11.0,Sin-nam,1.0,1,0.0,1.0,0.0,0.0,2.0,2.0,1.0,2.0,5,3.0,7.0
4,0,2013,2016,1,644,15,terraced,individual_heating,management_in_trust,8.0,930.0,0~5min,0-5min,7.0,6.0,14.0,Kyungbuk_uni_hospital,5.0,1,1.0,1.0,0.0,1.0,4.0,2.0,2.0,3.0,7,9.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1865,1,2007,2011,1,1483,21,terraced,individual_heating,management_in_trust,7.0,605.0,0~5min,0-5min,2.0,5.0,5.0,Banwoldang,4.0,1,2.0,1.0,0.0,1.0,2.0,1.0,1.0,1.0,5,9.0,5.0
1866,1,2007,2017,7,868,17,terraced,individual_heating,management_in_trust,0.0,1270.0,0~5min,0-5min,7.0,14.0,16.0,Kyungbuk_uni_hospital,3.0,1,2.0,1.0,0.0,2.0,3.0,3.0,2.0,2.0,10,9.0,10.0
1867,1,2007,2011,8,1629,17,terraced,individual_heating,management_in_trust,7.0,605.0,0~5min,0-5min,2.0,5.0,5.0,Banwoldang,4.0,1,2.0,1.0,0.0,1.0,2.0,1.0,1.0,1.0,5,9.0,5.0
1868,1,2006,2013,9,1743,11,terraced,individual_heating,management_in_trust,249.0,536.0,0~5min,0-5min,6.0,5.0,11.0,Sin-nam,1.0,1,0.0,1.0,0.0,0.0,2.0,2.0,1.0,2.0,5,3.0,7.0


In [340]:
test_df

Unnamed: 0,YearBuilt,YrSold,MonthSold,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_APT,N_manager,N_elevators,SubwayStation,N_FacilitiesNearBy(PublicOffice),N_FacilitiesNearBy(Hospital),N_FacilitiesNearBy(Dpartmentstore),N_FacilitiesNearBy(Mall),N_FacilitiesNearBy(ETC),N_FacilitiesNearBy(Park),N_SchoolNearBy(Elementary),N_SchoolNearBy(Middle),N_SchoolNearBy(High),N_SchoolNearBy(University),N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,1993,2008,5,914,14,mixed,individual_heating,management_in_trust,523.0,536.0,0~5min,15min~20min,8.0,8.0,20.0,Myung-duk,6.0,2,0.0,1.0,5.0,0.0,4.0,3.0,5.0,5.0,4,14.0,17.0
1,1993,2014,3,914,20,mixed,individual_heating,management_in_trust,523.0,536.0,0~5min,15min~20min,8.0,8.0,20.0,Myung-duk,6.0,2,0.0,1.0,5.0,0.0,4.0,3.0,5.0,5.0,4,14.0,17.0
2,2013,2016,1,644,13,terraced,individual_heating,management_in_trust,8.0,930.0,0~5min,0-5min,7.0,6.0,14.0,Kyungbuk_uni_hospital,5.0,1,1.0,1.0,0.0,1.0,4.0,2.0,2.0,3.0,7,9.0,11.0
3,2009,2017,6,910,6,terraced,individual_heating,management_in_trust,25.0,203.0,0~5min,no_bus_stop_nearby,1.0,2.0,5.0,no_subway_nearby,2.0,1,0.0,0.0,2.0,1.0,2.0,1.0,1.0,1.0,5,6.0,5.0
4,1993,2013,12,1451,14,mixed,individual_heating,management_in_trust,523.0,536.0,0~5min,15min~20min,8.0,8.0,20.0,Myung-duk,6.0,2,0.0,1.0,5.0,0.0,4.0,3.0,5.0,5.0,4,14.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1762,2008,2014,12,829,20,terraced,individual_heating,management_in_trust,197.0,475.0,5min~10min,0-5min,6.0,6.0,14.0,Sin-nam,3.0,1,1.0,1.0,0.0,1.0,2.0,3.0,2.0,2.0,8,7.0,9.0
1763,1993,2013,7,914,19,mixed,individual_heating,management_in_trust,523.0,536.0,0~5min,15min~20min,8.0,8.0,20.0,Myung-duk,6.0,2,0.0,1.0,5.0,0.0,4.0,3.0,5.0,5.0,4,14.0,17.0
1764,2007,2010,11,868,21,terraced,individual_heating,management_in_trust,0.0,1270.0,0~5min,0-5min,7.0,14.0,16.0,Kyungbuk_uni_hospital,3.0,1,2.0,1.0,0.0,2.0,3.0,3.0,2.0,2.0,10,9.0,10.0
1765,2006,2014,12,1743,10,terraced,individual_heating,management_in_trust,249.0,536.0,0~5min,0-5min,6.0,5.0,11.0,Sin-nam,1.0,1,0.0,1.0,0.0,0.0,2.0,2.0,1.0,2.0,5,3.0,7.0


problematyczne kolumny:

MonthSold - rozbić na klasy lub wywalić

HallwayType, HeatingType, AptManageType, SubwayStation - rozbić na klasy

TimeToBusStop, TimeToSubway - rozbić na klasy albo przerobić na liczby

In [341]:
def change_time(string):
    try:
        value = int(string.replace("min", "").replace("~", "-").split("-")[1])
    except IndexError:
        value = 999
    return value

df['TimeToBusStop'] = df['TimeToBusStop'].map(lambda a: change_time(a))
df['TimeToSubway'] = df['TimeToSubway'].map(lambda a: change_time(a))
categorical_columns = ['HallwayType', 'MonthSold', 'HeatingType', 'AptManageType', 'SubwayStation']
df = df.astype({'MonthSold': str})
categorical_values = pd.get_dummies(df[categorical_columns])
categorical_values.head()

Unnamed: 0,HallwayType_corridor,HallwayType_mixed,HallwayType_terraced,MonthSold_1,MonthSold_10,MonthSold_11,MonthSold_12,MonthSold_2,MonthSold_3,MonthSold_4,MonthSold_5,MonthSold_6,MonthSold_7,MonthSold_8,MonthSold_9,HeatingType_central_heating,HeatingType_individual_heating,AptManageType_management_in_trust,AptManageType_self_management,SubwayStation_Bangoge,SubwayStation_Banwoldang,SubwayStation_Chil-sung-market,SubwayStation_Daegu,SubwayStation_Kyungbuk_uni_hospital,SubwayStation_Myung-duk,SubwayStation_Sin-nam,SubwayStation_no_subway_nearby
0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0
3,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0
4,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0


In [342]:
numerical_values = df.drop(columns=categorical_columns)

train_indices = np.random.rand(len(numerical_values))>0.3

In [343]:
numerical_data = torch.from_numpy(numerical_values.values[train_indices,1:]).float()
categorical_data = torch.from_numpy(categorical_values.values[train_indices]).float()
targets = torch.from_numpy(numerical_values.values[train_indices,0]).float()

validation_numerical_data = torch.from_numpy(numerical_values.values[~train_indices,1:]).float()
validation_categorical_data = torch.from_numpy(categorical_values.values[~train_indices]).float()
validation_targets = torch.from_numpy(numerical_values.values[~train_indices,0]).float()

In [344]:
train_dataset = data.TensorDataset(numerical_data,categorical_data,targets)
validation_dataset = data.TensorDataset(validation_numerical_data,validation_categorical_data,validation_targets)

In [345]:
def get_accuracy(model, data_loader):
    correct = 0
    total = 0
    model.eval() #*********#
    for x, cat_x, labels in data_loader:
        x, cat_x, labels = x.to(device), cat_x.to(device), labels.to(device)
        output = model(x, cat_x)
        pred = torch.round(output)
        correct += pred.eq(labels.view_as(pred)).sum().item()
        total += x.shape[0]
    return correct / total

In [346]:
class Flats_classifier(nn.Module):
    def __init__(self, l2_size, l3_size):
        super().__init__()
        self.emb_layer = nn.Linear(categorical_data.shape[1], categorical_data.shape[1])
        self.act_emb = nn.Tanh()
        self.layer1 = nn.Linear(numerical_data.shape[1] + categorical_data.shape[1], l2_size)
        self.batch_norm1 = nn.BatchNorm1d(l2_size)
        self.act_1 =  nn.LeakyReLU()
        self.d1 = nn.Dropout(0.4)
        self.layer2 = nn.Linear(l2_size, l3_size)
        self.batch_norm2 = nn.BatchNorm1d(l3_size)
        self.act_2 =  nn.LeakyReLU()
        self.d2 = nn.Dropout(0.4)
        self.layer3 = nn.Linear(l3_size, 1)
        self.act_out = nn.Sigmoid()
    def forward(self, x, cat_x):
        cat_x_embedded = self.emb_layer(cat_x)
        cat_x_embedded = self.act_emb(cat_x_embedded)
        x = torch.cat([x,cat_x_embedded],dim=1)
        activation1 = self.act_1(self.batch_norm1(self.layer1(x)))
        activation1 = self.d1(activation1)
        activation2 = self.act_2(self.batch_norm2(self.layer2(activation1)))
        activation2 = self.d2(activation2)
        output = self.act_out(self.layer3(activation2))
        return output

In [352]:
layer2_size = 400
layer3_size = 200
lr = 0.0002
epochs = 500
threshold = 0.98
model = Flats_classifier(layer2_size, layer3_size).to(device)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=48, shuffle=True)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=128, shuffle=False)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

iters = []
losses = []
train_acc = []
val_acc = []
for n in range(epochs):
    epoch_losses = []
    for x, cat_x, labels in iter(train_loader):
        x, cat_x, labels = x.to(device), cat_x.to(device), labels.to(device)
        model.train() 
        out = model(x, cat_x).squeeze()           

        loss = criterion(out, labels)
        loss.backward()  
        epoch_losses.append(loss.item())
        optimizer.step()              
        optimizer.zero_grad()         

    loss_mean = np.array(epoch_losses).mean()
    iters.append(n)
    losses.append(loss_mean)
    validation_acc = get_accuracy(model, validation_loader)
    if n % 10 == 0:
      print(f"Epoch {n:>3} loss {loss_mean:5.3f} validation_acc: {validation_acc:5.3f}")
    train_acc.append(get_accuracy(model, train_loader)) # compute training accuracy 
    val_acc.append(validation_acc)  # compute validation accuracy
    if validation_acc > threshold:
        break
        

print("Final Training Accuracy: {}".format(train_acc[-1]))
print("Final Validation Accuracy: {}".format(val_acc[-1]))

Epoch   0 loss 0.652 validation_acc: 0.839
Epoch  10 loss 0.591 validation_acc: 0.836
Epoch  20 loss 0.583 validation_acc: 0.881
Epoch  30 loss 0.572 validation_acc: 0.832
Epoch  40 loss 0.573 validation_acc: 0.832
Epoch  50 loss 0.547 validation_acc: 0.890
Epoch  60 loss 0.547 validation_acc: 0.953
Epoch  70 loss 0.543 validation_acc: 0.962
Epoch  80 loss 0.542 validation_acc: 0.930
Epoch  90 loss 0.536 validation_acc: 0.962
Epoch 100 loss 0.537 validation_acc: 0.972
Final Training Accuracy: 0.9583975346687211
Final Validation Accuracy: 0.9825174825174825


In [None]:
test_df = test_df.astype({'MonthSold': str})
test_df['TimeToBusStop'] = test_df['TimeToBusStop'].map(lambda a: change_time(a))
test_df['TimeToSubway'] = test_df['TimeToSubway'].map(lambda a: change_time(a))
test_categorical_values = pd.get_dummies(test_df[categorical_columns])
test_numerical_values = test_df.drop(columns=categorical_columns)

test_numerical_data = torch.from_numpy(test_numerical_values.values).float()
test_categorical_data = torch.from_numpy(test_categorical_values.values).float()

In [None]:
model.eval()
model.to(device)
output = model(test_numerical_data.to(device), test_categorical_data.to(device))
pred = torch.round(output)

out_df = pd.DataFrame({'outputs': output.cpu().detach().numpy()[:,0], 'preds': pred.cpu().detach().numpy()[:,0]})
out_df.to_csv('results.csv', index=False)