In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data.dataset import random_split
from ignite.contrib.metrics.regression import R2Score

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
root = "total.csv"
batch_size = 256
epochs = 500
learning_rate = 1e-3

In [3]:
data_pd = pd.read_csv(root)
data_pd.head()

Unnamed: 0,Id,Shape,Weight,Clarity,Colour,Cut,Polish,Symmetry,Fluorescence,Length,Width,Depth,Price
0,1638147,CUSHION,0.55,SI2,E,EX,EX,VG,N,5.05,4.35,2.94,1378.65
1,1630155,CUSHION,0.5,VVS1,FANCY,EX,EX,VG,F,4.6,4.31,2.92,1379.74
2,1612606,CUSHION,0.51,VS2,H,EX,EX,VG,N,4.71,4.35,2.94,1380.19
3,1638140,CUSHION,0.5,VS2,H,EX,EX,VG,N,4.91,4.26,2.88,1380.61
4,1536093,CUSHION,0.53,SI1,D,EX,VG,VG,N,4.7,4.46,3.01,1383.13


In [4]:
data_numpy = data_pd.to_numpy()

In [5]:
for i in [1, 3, 4, 5, 6, 7, 8]:
    wordset = {word: idx for idx, word in enumerate(np.unique(data_numpy[:,i]))}
    print(wordset)
    for row in range(len(data_numpy)):
        data_numpy[row][i] = wordset[data_numpy[row][i]]

{'CUSHION': 0, 'EMERALD': 1, 'HEART': 2, 'MARQUISE': 3, 'OVAL': 4, 'PEAR': 5, 'PRINCESS': 6, 'ROUND': 7}
{'FL': 0, 'I1': 1, 'I2': 2, 'I3': 3, 'IF': 4, 'SI1': 5, 'SI2': 6, 'VS1': 7, 'VS2': 8, 'VVS1': 9, 'VVS2': 10}
{'D': 0, 'E': 1, 'F': 2, 'FANCY': 3, 'G': 4, 'H': 5, 'I': 6, 'J': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'O': 12, 'O-P': 13, 'Q-R': 14, 'S-T': 15, 'U-V': 16, 'W': 17, 'W-X': 18, 'Y-Z': 19}
{'EX': 0, 'F': 1, 'GD': 2, 'VG': 3}
{'EX': 0, 'F': 1, 'GD': 2, 'VG': 3}
{'EX': 0, 'FR': 1, 'GD': 2, 'VG': 3}
{'F': 0, 'M': 1, 'N': 2, 'SL': 3, 'ST': 4, 'VS': 5, 'VSL': 6}


In [6]:
data_numpy = data_numpy[:,1:]
print(data_numpy[0])

[0 0.55 6 1 0 0 3 2 5.05 4.35 2.94 1378.65]


In [7]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self,ints, floats, target):
        super(Dataset).__init__()
        self.ints = ints
        self.floats = floats
        self.target = target
        
    def __getitem__(self,idx):
        return self.ints[idx],self.floats[idx], self.target[idx]
    
    def __len__(self):
        return len(self.ints)

In [8]:
data_int = torch.from_numpy(np.array(data_numpy[:,[0,2,3,4,5,6,7]], dtype="int"))
data_float = torch.from_numpy(np.array(data_numpy[:,[1,8,9,10]], dtype="float")).float()
data_target = torch.from_numpy(np.array(data_numpy[:,[11]], dtype="float")).float()
print(data_int[0])
print(data_float[0])
print(data_target[0])

tensor([0, 6, 1, 0, 0, 3, 2], dtype=torch.int32)
tensor([0.5500, 5.0500, 4.3500, 2.9400])
tensor([1378.6500])


In [9]:
train_length = int(len(data_numpy) * 0.6)
test_length = int(len(data_numpy) * 0.2)
val_length = len(data_numpy) - train_length - test_length

train_dataset = Dataset(data_int, data_float, data_target)
train_dataset, test_dataset = random_split(train_dataset, [train_length, test_length+val_length])
test_dataset, val_dataset = random_split(test_dataset, [test_length, val_length])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle = True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle = True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle = True)

In [10]:
print(train_length, test_length, val_length)

3038 1012 1014


In [11]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.emb1 = torch.nn.Embedding(8, 8)
        self.emb2 = torch.nn.Embedding(11, 11)
        self.emb3 = torch.nn.Embedding(20, 20)
        self.emb4 = torch.nn.Embedding(4, 4)
        self.emb5 = torch.nn.Embedding(4, 4)
        self.emb6 = torch.nn.Embedding(4, 4)
        self.emb7 = torch.nn.Embedding(7, 7)
        self.act = nn.ReLU()
        self.fc1 = nn.Linear(62, 8192)
        self.fc2 = nn.Linear(8192, 4096)
        self.fc3 = nn.Linear(4096, 2048)
        self.fc4 = nn.Linear(2048, 1024)
        self.fc5 = nn.Linear(1024, 1)
        self.dropout = nn.Dropout()
    
    def forward(self, x, y):
        x1 = self.emb1(x[:,0])
        x2 = self.emb2(x[:,1])
        x3 = self.emb3(x[:,2])
        x4 = self.emb4(x[:,3])
        x5 = self.emb5(x[:,4])
        x6 = self.emb6(x[:,5])
        x7 = self.emb7(x[:,6])
        x = torch.cat((x1, x2, x3, x4, x5, x6, x7, y), dim=1)
        x = self.dropout(self.act(self.fc1(x)))
        x = self.dropout(self.act(self.fc2(x)))
        x = self.dropout(self.act(self.fc3(x)))
        x = self.dropout(self.act(self.fc4(x)))
        return self.fc5(x)

In [12]:
model = Net().to(device)

criterion = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [13]:
for epoch in range(epochs):
    model.train()
    criterion.train()
    
    avg_loss = 0

    for X1, X2, Y in train_loader:
        X1 = X1.to(device)
        X2 = X2.to(device)
        Y = Y.to(device)

        model.zero_grad()  # why we use zero_grad?
        prediction = model(X1, X2)
        loss = criterion(prediction, Y)
        loss.backward()
        optimizer.step()
        avg_loss += loss / len(train_loader)
    print(f'[Epoch: {epoch+1:>2}] Average loss: {avg_loss:.4f}, ', end='')
    
    model.eval()
    criterion.eval()
    with torch.no_grad():
        val_avg_loss = 0.
        for X1_val, X2_val, Y_val in val_loader:
            X1_val = X1_val.to(device)
            X2_val = X2_val.to(device)
            Y_val = Y_val.to(device)
            val_prediction = model(X1_val, X2_val)
            val_loss = criterion(val_prediction, Y_val)
            val_avg_loss += val_loss / len(val_loader)
        
        print(f"val_loss: {val_avg_loss:.4f}")

[Epoch:  1] Average loss: 15653694.0000, val_loss: 23244036.0000
[Epoch:  2] Average loss: 12449179.0000, val_loss: 20351696.0000
[Epoch:  3] Average loss: 10549371.0000, val_loss: 16625547.0000
[Epoch:  4] Average loss: 8038357.0000, val_loss: 11618971.0000
[Epoch:  5] Average loss: 4678450.5000, val_loss: 6365092.5000
[Epoch:  6] Average loss: 2784146.0000, val_loss: 2743608.7500
[Epoch:  7] Average loss: 1180578.0000, val_loss: 2234449.7500
[Epoch:  8] Average loss: 1603039.3750, val_loss: 2619363.2500
[Epoch:  9] Average loss: 1419917.5000, val_loss: 3415507.5000
[Epoch: 10] Average loss: 1414233.0000, val_loss: 2334508.7500
[Epoch: 11] Average loss: 920737.5625, val_loss: 2800218.0000
[Epoch: 12] Average loss: 1155988.7500, val_loss: 2241314.7500
[Epoch: 13] Average loss: 1528677.1250, val_loss: 2788281.7500
[Epoch: 14] Average loss: 1491088.8750, val_loss: 2648633.7500
[Epoch: 15] Average loss: 1548396.6250, val_loss: 3907261.0000
[Epoch: 16] Average loss: 1179535.2500, val_loss:

[Epoch: 133] Average loss: 671322.2500, val_loss: 2355275.7500
[Epoch: 134] Average loss: 598834.8125, val_loss: 2273227.0000
[Epoch: 135] Average loss: 546362.1250, val_loss: 1785188.2500
[Epoch: 136] Average loss: 860697.1250, val_loss: 1920312.5000
[Epoch: 137] Average loss: 712019.6250, val_loss: 1831452.7500
[Epoch: 138] Average loss: 527760.0000, val_loss: 1600928.0000
[Epoch: 139] Average loss: 491990.0625, val_loss: 1627869.1250
[Epoch: 140] Average loss: 416854.7188, val_loss: 2343442.2500
[Epoch: 141] Average loss: 579758.5625, val_loss: 1834027.5000
[Epoch: 142] Average loss: 525089.8750, val_loss: 2126948.0000
[Epoch: 143] Average loss: 611464.0000, val_loss: 1529791.3750
[Epoch: 144] Average loss: 499205.2812, val_loss: 1595051.3750
[Epoch: 145] Average loss: 511622.3750, val_loss: 1592738.5000
[Epoch: 146] Average loss: 685592.5000, val_loss: 2113214.5000
[Epoch: 147] Average loss: 701698.8750, val_loss: 1629361.0000
[Epoch: 148] Average loss: 792221.2500, val_loss: 20225

[Epoch: 263] Average loss: 393548.4062, val_loss: 1562222.7500
[Epoch: 264] Average loss: 645119.1875, val_loss: 2339584.7500
[Epoch: 265] Average loss: 1009854.6875, val_loss: 2054340.7500
[Epoch: 266] Average loss: 928739.3750, val_loss: 2187628.0000
[Epoch: 267] Average loss: 574457.6875, val_loss: 2011610.0000
[Epoch: 268] Average loss: 571944.6875, val_loss: 1878091.1250
[Epoch: 269] Average loss: 599573.4375, val_loss: 1755662.2500
[Epoch: 270] Average loss: 779500.0000, val_loss: 3256818.0000
[Epoch: 271] Average loss: 1316993.1250, val_loss: 2503152.7500
[Epoch: 272] Average loss: 952772.8750, val_loss: 2719006.0000
[Epoch: 273] Average loss: 566844.8125, val_loss: 1655296.1250
[Epoch: 274] Average loss: 648189.3750, val_loss: 1366129.6250
[Epoch: 275] Average loss: 888174.8125, val_loss: 1673167.5000
[Epoch: 276] Average loss: 629519.5625, val_loss: 2016437.8750
[Epoch: 277] Average loss: 388828.9375, val_loss: 1814289.0000
[Epoch: 278] Average loss: 784676.8125, val_loss: 238

[Epoch: 393] Average loss: 579641.1250, val_loss: 1541144.0000
[Epoch: 394] Average loss: 502001.1875, val_loss: 1521365.7500
[Epoch: 395] Average loss: 534519.4375, val_loss: 1408068.7500
[Epoch: 396] Average loss: 625631.3750, val_loss: 1587959.0000
[Epoch: 397] Average loss: 747147.4375, val_loss: 3261050.5000
[Epoch: 398] Average loss: 643780.3750, val_loss: 1623770.8750
[Epoch: 399] Average loss: 537064.0000, val_loss: 1530001.0000
[Epoch: 400] Average loss: 492584.4688, val_loss: 1839105.7500
[Epoch: 401] Average loss: 833257.0000, val_loss: 1630685.5000
[Epoch: 402] Average loss: 568851.6250, val_loss: 1993416.0000
[Epoch: 403] Average loss: 732154.5000, val_loss: 1915019.6250
[Epoch: 404] Average loss: 642011.6250, val_loss: 1795976.7500
[Epoch: 405] Average loss: 625522.5000, val_loss: 1611585.6250
[Epoch: 406] Average loss: 596248.6875, val_loss: 1625098.0000
[Epoch: 407] Average loss: 522794.2812, val_loss: 2022028.1250
[Epoch: 408] Average loss: 607804.8125, val_loss: 19651

In [14]:
metric = R2Score(device=device)
metric.reset()

model.eval()
criterion.eval()
with torch.no_grad():
    test_avg_acc = 0
    for X1_test, X2_test, Y_test in test_loader:
        X1_test = X1_test.to(device)
        X2_test = X2_test.to(device)
        Y_test = Y_test.to(device)
        
        test_prediction = model(X1_test, X2_test)
        metric.update([test_prediction, Y_test])
    print(f"Accuracy: {metric.compute()*100:.2f}%")

Accuracy: 84.42%
