In [1]:
import pandas as pd
import torch
import numpy as np
from torch import nn, optim
import torch.utils.data as data
import random

In [2]:
device = torch.device('cpu')

In [3]:
seed = 2137
torch.cuda.manual_seed_all(seed)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [4]:
df = pd.read_csv('train_data.csv')
test_df = pd.read_csv('test_data.csv')

In [5]:
df

Unnamed: 0,SalePrice,YearBuilt,YrSold,MonthSold,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),...,N_FacilitiesNearBy(Mall),N_FacilitiesNearBy(ETC),N_FacilitiesNearBy(Park),N_SchoolNearBy(Elementary),N_SchoolNearBy(Middle),N_SchoolNearBy(High),N_SchoolNearBy(University),N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,51327,1985,2007,8,587,8,corridor,individual_heating,self_management,80.0,...,1.0,2.0,1.0,2.0,1.0,1.0,0.0,3,12.0,4.0
1,48672,1985,2007,8,587,6,corridor,individual_heating,self_management,80.0,...,1.0,2.0,1.0,2.0,1.0,1.0,0.0,3,12.0,4.0
2,380530,2006,2007,8,2056,8,terraced,individual_heating,management_in_trust,249.0,...,1.0,0.0,0.0,2.0,2.0,1.0,2.0,5,3.0,7.0
3,221238,1993,2007,8,1761,3,mixed,individual_heating,management_in_trust,523.0,...,1.0,5.0,0.0,4.0,3.0,5.0,5.0,4,14.0,17.0
4,35840,1992,2007,8,355,5,corridor,individual_heating,management_in_trust,200.0,...,1.0,5.0,1.0,4.0,3.0,5.0,5.0,3,16.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4119,511504,2007,2017,8,1643,19,terraced,individual_heating,management_in_trust,0.0,...,1.0,0.0,2.0,3.0,3.0,2.0,2.0,10,9.0,10.0
4120,298230,2006,2017,8,903,13,terraced,individual_heating,management_in_trust,123.0,...,1.0,2.0,0.0,4.0,3.0,3.0,1.0,4,8.0,11.0
4121,357522,2007,2017,8,868,20,terraced,individual_heating,management_in_trust,0.0,...,1.0,0.0,2.0,3.0,3.0,2.0,2.0,10,9.0,10.0
4122,312389,1978,2017,8,1327,1,corridor,individual_heating,self_management,87.0,...,1.0,0.0,0.0,3.0,3.0,3.0,2.0,3,7.0,11.0


In [6]:
test_df

Unnamed: 0,YearBuilt,YrSold,MonthSold,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),...,N_FacilitiesNearBy(Mall),N_FacilitiesNearBy(ETC),N_FacilitiesNearBy(Park),N_SchoolNearBy(Elementary),N_SchoolNearBy(Middle),N_SchoolNearBy(High),N_SchoolNearBy(University),N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,1993,2008,5,914,14,mixed,individual_heating,management_in_trust,523.0,536.0,...,1.0,5.0,0.0,4.0,3.0,5.0,5.0,4,14.0,17.0
1,1993,2014,3,914,20,mixed,individual_heating,management_in_trust,523.0,536.0,...,1.0,5.0,0.0,4.0,3.0,5.0,5.0,4,14.0,17.0
2,2013,2016,1,644,13,terraced,individual_heating,management_in_trust,8.0,930.0,...,1.0,0.0,1.0,4.0,2.0,2.0,3.0,7,9.0,11.0
3,2009,2017,6,910,6,terraced,individual_heating,management_in_trust,25.0,203.0,...,0.0,2.0,1.0,2.0,1.0,1.0,1.0,5,6.0,5.0
4,1993,2013,12,1451,14,mixed,individual_heating,management_in_trust,523.0,536.0,...,1.0,5.0,0.0,4.0,3.0,5.0,5.0,4,14.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1762,2008,2014,12,829,20,terraced,individual_heating,management_in_trust,197.0,475.0,...,1.0,0.0,1.0,2.0,3.0,2.0,2.0,8,7.0,9.0
1763,1993,2013,7,914,19,mixed,individual_heating,management_in_trust,523.0,536.0,...,1.0,5.0,0.0,4.0,3.0,5.0,5.0,4,14.0,17.0
1764,2007,2010,11,868,21,terraced,individual_heating,management_in_trust,0.0,1270.0,...,1.0,0.0,2.0,3.0,3.0,2.0,2.0,10,9.0,10.0
1765,2006,2014,12,1743,10,terraced,individual_heating,management_in_trust,249.0,536.0,...,1.0,0.0,0.0,2.0,2.0,1.0,2.0,5,3.0,7.0


problematyczne kolumny:

MonthSold - rozbić na klasy lub wywalić

HallwayType, HeatingType, AptManageType, SubwayStation - rozbić na klasy

TimeToBusStop, TimeToSubway - rozbić na klasy albo przerobić na liczby

In [7]:
categorical_columns = ['HallwayType', 'MonthSold', 'HeatingType', 'AptManageType', 'SubwayStation', 'TimeToBusStop', 'TimeToSubway']
df = df.astype({'MonthSold': str})
categorical_values = pd.get_dummies(df[categorical_columns])
categorical_values.head()

Unnamed: 0,HallwayType_corridor,HallwayType_mixed,HallwayType_terraced,MonthSold_1,MonthSold_10,MonthSold_11,MonthSold_12,MonthSold_2,MonthSold_3,MonthSold_4,...,SubwayStation_Sin-nam,SubwayStation_no_subway_nearby,TimeToBusStop_0~5min,TimeToBusStop_10min~15min,TimeToBusStop_5min~10min,TimeToSubway_0-5min,TimeToSubway_10min~15min,TimeToSubway_15min~20min,TimeToSubway_5min~10min,TimeToSubway_no_bus_stop_nearby
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,0,0,1,0,0,0,0,0,0,0,...,1,0,1,0,0,1,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0


In [8]:
numerical_values = df.drop(columns=categorical_columns)
numerical_values['SalePrice'] = numerical_values['SalePrice'].apply(lambda b: 1 if b > 300000 else 0)
train_indices = np.random.rand(len(numerical_values))>0.3

In [9]:
numerical_values

Unnamed: 0,SalePrice,YearBuilt,YrSold,Size(sqf),Floor,N_Parkinglot(Ground),N_Parkinglot(Basement),N_APT,N_manager,N_elevators,...,N_FacilitiesNearBy(Mall),N_FacilitiesNearBy(ETC),N_FacilitiesNearBy(Park),N_SchoolNearBy(Elementary),N_SchoolNearBy(Middle),N_SchoolNearBy(High),N_SchoolNearBy(University),N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,0,1985,2007,587,8,80.0,76.0,1.0,2.0,2.0,...,1.0,2.0,1.0,2.0,1.0,1.0,0.0,3,12.0,4.0
1,0,1985,2007,587,6,80.0,76.0,1.0,2.0,2.0,...,1.0,2.0,1.0,2.0,1.0,1.0,0.0,3,12.0,4.0
2,1,2006,2007,2056,8,249.0,536.0,6.0,5.0,11.0,...,1.0,0.0,0.0,2.0,2.0,1.0,2.0,5,3.0,7.0
3,0,1993,2007,1761,3,523.0,536.0,8.0,8.0,20.0,...,1.0,5.0,0.0,4.0,3.0,5.0,5.0,4,14.0,17.0
4,0,1992,2007,355,5,200.0,0.0,3.0,5.0,10.0,...,1.0,5.0,1.0,4.0,3.0,5.0,5.0,3,16.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4119,1,2007,2017,1643,19,0.0,1270.0,7.0,14.0,16.0,...,1.0,0.0,2.0,3.0,3.0,2.0,2.0,10,9.0,10.0
4120,0,2006,2017,903,13,123.0,181.0,3.0,3.0,11.0,...,1.0,2.0,0.0,4.0,3.0,3.0,1.0,4,8.0,11.0
4121,1,2007,2017,868,20,0.0,1270.0,7.0,14.0,16.0,...,1.0,0.0,2.0,3.0,3.0,2.0,2.0,10,9.0,10.0
4122,1,1978,2017,1327,1,87.0,0.0,2.0,1.0,4.0,...,1.0,0.0,0.0,3.0,3.0,3.0,2.0,3,7.0,11.0


In [10]:
numerical_data = torch.from_numpy(numerical_values.values[train_indices,1:]).float()
categorical_data = torch.from_numpy(categorical_values.values[train_indices]).float()
targets = torch.from_numpy(numerical_values.values[train_indices,0]).float()

validation_numerical_data = torch.from_numpy(numerical_values.values[~train_indices,1:]).float()
validation_categorical_data = torch.from_numpy(categorical_values.values[~train_indices]).float()
validation_targets = torch.from_numpy(numerical_values.values[~train_indices,0]).float()

In [11]:
train_dataset = data.TensorDataset(numerical_data,categorical_data,targets)
validation_dataset = data.TensorDataset(validation_numerical_data,validation_categorical_data,validation_targets)

In [12]:
def get_accuracy(model, data_loader):
    correct = 0
    total = 0
    model.eval() #*********#
    for x, cat_x, labels in data_loader:
        x, cat_x, labels = x.to(device), cat_x.to(device), labels.to(device)
        output = model(x, cat_x)
        pred = torch.round(output)
        correct += pred.eq(labels.view_as(pred)).sum().item()
        total += x.shape[0]
    return correct / total

In [13]:
class Flats_classifier(nn.Module):
    def __init__(self, l2_size, l3_size):
        super().__init__()
        self.emb_layer = nn.Linear(categorical_data.shape[1], categorical_data.shape[1])
        self.act_emb = nn.Tanh()
        self.layer1 = nn.Linear(numerical_data.shape[1] + categorical_data.shape[1], l2_size)
        self.batch_norm1 = nn.BatchNorm1d(l2_size)
        self.act_1 =  nn.LeakyReLU()
        self.d1 = nn.Dropout(0.4)
        self.layer2 = nn.Linear(l2_size, l3_size)
        self.batch_norm2 = nn.BatchNorm1d(l3_size)
        self.act_2 =  nn.LeakyReLU()
        self.d2 = nn.Dropout(0.4)
        self.layer3 = nn.Linear(l3_size, 1)
        self.act_out = nn.Sigmoid()
    def forward(self, x, cat_x):
        cat_x_embedded = self.emb_layer(cat_x)
        cat_x_embedded = self.act_emb(cat_x_embedded)
        x = torch.cat([x,cat_x_embedded],dim=1)
        activation1 = self.act_1(self.batch_norm1(self.layer1(x)))
        activation1 = self.d1(activation1)
        activation2 = self.act_2(self.batch_norm2(self.layer2(activation1)))
        activation2 = self.d2(activation2)
        output = self.act_out(self.layer3(activation2))
        return output

In [14]:
layer2_size = 400
layer3_size = 200
lr = 0.0001
epochs = 500
threshold = 0.97
model = Flats_classifier(layer2_size, layer3_size).to(device)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=128, shuffle=False)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

iters = []
losses = []
train_acc = []
val_acc = []
for n in range(epochs):
    epoch_losses = []
    for x, cat_x, labels in iter(train_loader):
        x, cat_x, labels = x.to(device), cat_x.to(device), labels.to(device)
        model.train() 
        out = model(x, cat_x).squeeze()           

        loss = criterion(out, labels)
        loss.backward()  
        epoch_losses.append(loss.item())
        optimizer.step()              
        optimizer.zero_grad()         

    loss_mean = np.array(epoch_losses).mean()
    iters.append(n)
    losses.append(loss_mean)
    validation_acc = get_accuracy(model, validation_loader)
    print(f"Epoch {n} loss {loss_mean:.3} validation_acc: {validation_acc:.3}")
    train_acc.append(get_accuracy(model, train_loader)) # compute training accuracy 
    val_acc.append(validation_acc)  # compute validation accuracy
    if validation_acc > threshold:
        break
        

print("Final Training Accuracy: {}".format(train_acc[-1]))
print("Final Validation Accuracy: {}".format(val_acc[-1]))

Epoch 0 loss 0.792 validation_acc: 0.757
Epoch 1 loss 0.734 validation_acc: 0.828
Epoch 2 loss 0.712 validation_acc: 0.845
Epoch 3 loss 0.699 validation_acc: 0.854
Epoch 4 loss 0.689 validation_acc: 0.862
Epoch 5 loss 0.683 validation_acc: 0.862
Epoch 6 loss 0.678 validation_acc: 0.863
Epoch 7 loss 0.675 validation_acc: 0.853
Epoch 8 loss 0.671 validation_acc: 0.859
Epoch 9 loss 0.668 validation_acc: 0.859
Epoch 10 loss 0.666 validation_acc: 0.869
Epoch 11 loss 0.664 validation_acc: 0.873
Epoch 12 loss 0.663 validation_acc: 0.869
Epoch 13 loss 0.661 validation_acc: 0.873
Epoch 14 loss 0.66 validation_acc: 0.883
Epoch 15 loss 0.657 validation_acc: 0.889
Epoch 16 loss 0.658 validation_acc: 0.882
Epoch 17 loss 0.657 validation_acc: 0.873
Epoch 18 loss 0.656 validation_acc: 0.898
Epoch 19 loss 0.654 validation_acc: 0.897
Epoch 20 loss 0.653 validation_acc: 0.891
Epoch 21 loss 0.653 validation_acc: 0.897
Epoch 22 loss 0.652 validation_acc: 0.897
Epoch 23 loss 0.651 validation_acc: 0.903
Epo

In [15]:
test_df = test_df.astype({'MonthSold': str})
test_categorical_values = pd.get_dummies(test_df[categorical_columns])
test_numerical_values = test_df.drop(columns=categorical_columns)

test_numerical_data = torch.from_numpy(test_numerical_values.values).float()
test_categorical_data = torch.from_numpy(test_categorical_values.values).float()
test_dataset = data.TensorDataset(test_numerical_data, test_categorical_data)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

In [16]:
model.eval()
output = model(test_numerical_data, test_categorical_data)
pred = torch.round(output)
print(output.detach().numpy())

out_df = pd.DataFrame({'outputs': output.detach().numpy()[:,0], 'preds': pred.detach().numpy()[:,0]})
out_df.to_csv('results.csv', index=False)

[[1.7126629e-08]
 [1.4714068e-05]
 [1.8448581e-04]
 ...
 [3.5400717e-05]
 [9.9960655e-01]
 [7.8748288e-03]]


In [17]:
from sklearn.metrics import roc_auc_score, RocCurveDisplay, roc_curve
roc_auc = roc_auc_score(targets, preds)
fpr, tpr, thresholds = roc_curve(targets, preds)
display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name="test")
display.plot()
roc_auc

ModuleNotFoundError: No module named 'sklearn'