In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data.dataset import random_split
from ignite.contrib.metrics.regression import R2Score

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
root = "final.csv"
batch_size = 256
epochs = 500
learning_rate = 1e-3

In [3]:
data_pd = pd.read_csv(root)
data_pd.head()

Unnamed: 0,Id,Shape,Weight,Clarity,Colour,Cut,Polish,Symmetry,Fluorescence,Length,Width,Depth,Price
0,111000-5962,CUSHION,1.01,I2,FANCY,EX,VG,VG,N,5.89,5.63,3.53,1155.27
1,111000-6281,CUSHION,1.19,I2,FANCY,EX,VG,GD,M,5.97,5.59,3.8,3638.36
2,111000-6305,OVAL,1.0,SI2,U-V,EX,EX,VG,M,8.47,5.39,3.42,2237.73
3,111000-6320,PEAR,1.01,SI2,E,EX,VG,VG,N,9.39,5.52,3.24,2953.85
4,111000-6368,CUSHION,1.01,SI2,FANCY,VG,VG,GD,ST,6.1,5.36,3.48,2241.67


In [4]:
data_numpy = data_pd.to_numpy()

In [5]:
for i in [1, 3, 4, 5, 6, 7, 8]:
    wordset = {word: idx for idx, word in enumerate(np.unique(data_numpy[:,i]))}
    print(wordset)
    for row in range(len(data_numpy)):
        data_numpy[row][i] = wordset[data_numpy[row][i]]

{'CUSHION': 0, 'EMERALD': 1, 'HEART': 2, 'MARQUISE': 3, 'OVAL': 4, 'PEAR': 5, 'PRINCESS': 6, 'ROUND': 7}
{'FL': 0, 'I1': 1, 'I2': 2, 'I3': 3, 'IF': 4, 'SI1': 5, 'SI2': 6, 'VS1': 7, 'VS2': 8, 'VVS1': 9, 'VVS2': 10}
{'D': 0, 'E': 1, 'F': 2, 'FANCY': 3, 'G': 4, 'H': 5, 'I': 6, 'J': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'O': 12, 'O-P': 13, 'Q-R': 14, 'S-T': 15, 'U-V': 16, 'W': 17, 'W-X': 18, 'Y-Z': 19}
{'EX': 0, 'F': 1, 'GD': 2, 'VG': 3}
{'EX': 0, 'F': 1, 'GD': 2, 'VG': 3}
{'EX': 0, 'FR': 1, 'GD': 2, 'VG': 3}
{'F': 0, 'M': 1, 'N': 2, 'SL': 3, 'ST': 4, 'VS': 5, 'VSL': 6}


In [6]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self,ints, floats, target):
        super(Dataset).__init__()
        self.ints = ints
        self.floats = floats
        self.target = target
        
    def __getitem__(self,idx):
        return self.ints[idx],self.floats[idx], self.target[idx]
    
    def __len__(self):
        return len(self.ints)

In [7]:
data_int = torch.from_numpy(np.array(data_numpy[:,[1,3,4,5,6,7,8]], dtype="int"))
data_float = torch.from_numpy(np.array(data_numpy[:,[2,9,10,11]], dtype="float")).float()
data_target = torch.from_numpy(np.array(data_numpy[:,[12]], dtype="float")).float()
print(data_int[0])
print(data_float[0])
print(data_target[0])

tensor([0, 2, 3, 0, 3, 3, 2], dtype=torch.int32)
tensor([1.0100, 5.8900, 5.6300, 3.5300])
tensor([1155.2700])


In [8]:
train_length = int(len(data_numpy) * 0.6)
test_length = int(len(data_numpy) * 0.2)
val_length = len(data_numpy) - train_length - test_length

train_dataset = Dataset(data_int, data_float, data_target)
train_dataset, test_dataset = random_split(train_dataset, [train_length, test_length+val_length])
test_dataset, val_dataset = random_split(test_dataset, [test_length, val_length])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle = True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle = True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle = True)

In [9]:
print(train_length, test_length, val_length)

2826 942 943


In [10]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.emb1 = torch.nn.Embedding(8, 8)
        self.emb2 = torch.nn.Embedding(11, 11)
        self.emb3 = torch.nn.Embedding(20, 20)
        self.emb4 = torch.nn.Embedding(4, 4)
        self.emb5 = torch.nn.Embedding(4, 4)
        self.emb6 = torch.nn.Embedding(4, 4)
        self.emb7 = torch.nn.Embedding(7, 7)
        self.act = nn.ReLU()
        self.fc1 = nn.Linear(62, 8192)
        self.fc2 = nn.Linear(8192, 8192)
        self.fc3 = nn.Linear(8192, 4096)
        self.fc4 = nn.Linear(4096, 2048)
        self.fc5 = nn.Linear(2048, 1)
        self.dropout = nn.Dropout()
    
    def forward(self, x, y):
        x1 = self.emb1(x[:,0])
        x2 = self.emb2(x[:,1])
        x3 = self.emb3(x[:,2])
        x4 = self.emb4(x[:,3])
        x5 = self.emb5(x[:,4])
        x6 = self.emb6(x[:,5])
        x7 = self.emb7(x[:,6])
        x = torch.cat((x1, x2, x3, x4, x5, x6, x7, y), dim=1)
        x = self.dropout(self.act(self.fc1(x)))
        x = self.dropout(self.act(self.fc2(x)))
        x = self.dropout(self.act(self.fc3(x)))
        x = self.dropout(self.act(self.fc4(x)))
        return self.fc5(x)

In [11]:
model = Net().to(device)

criterion = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [12]:
for epoch in range(epochs):
    model.train()
    criterion.train()
    
    avg_loss = 0

    for X1, X2, Y in train_loader:
        X1 = X1.to(device)
        X2 = X2.to(device)
        Y = Y.to(device)

        model.zero_grad()  # why we use zero_grad?
        prediction = model(X1, X2)
        loss = criterion(prediction, Y)
        loss.backward()
        optimizer.step()
        avg_loss += loss / len(train_loader)
    print(f'[Epoch: {epoch+1:>2}] Average loss: {avg_loss:.4f}, ', end='')
    
    model.eval()
    criterion.eval()
    with torch.no_grad():
        val_avg_loss = 0.
        for X1_val, X2_val, Y_val in val_loader:
            X1_val = X1_val.to(device)
            X2_val = X2_val.to(device)
            Y_val = Y_val.to(device)
            val_prediction = model(X1_val, X2_val)
            val_loss = criterion(val_prediction, Y_val)
            val_avg_loss += val_loss / len(val_loader)
        
        print(f"val_loss: {val_avg_loss:.4f}")

[Epoch:  1] Average loss: 19819410.0000, val_loss: 11743689.0000
[Epoch:  2] Average loss: 16487014.0000, val_loss: 9200584.0000
[Epoch:  3] Average loss: 13326647.0000, val_loss: 7546290.5000
[Epoch:  4] Average loss: 7461221.0000, val_loss: 2775768.5000
[Epoch:  5] Average loss: 3694848.2500, val_loss: 2138609.7500
[Epoch:  6] Average loss: 3137309.5000, val_loss: 1292575.7500
[Epoch:  7] Average loss: 1997264.2500, val_loss: 1314987.2500
[Epoch:  8] Average loss: 1924366.0000, val_loss: 1092884.3750
[Epoch:  9] Average loss: 1747094.2500, val_loss: 1019327.9375
[Epoch: 10] Average loss: 1368860.0000, val_loss: 966998.7500
[Epoch: 11] Average loss: 1927782.2500, val_loss: 1995332.8750
[Epoch: 12] Average loss: 2786441.7500, val_loss: 1463976.2500
[Epoch: 13] Average loss: 3313342.5000, val_loss: 1767703.7500
[Epoch: 14] Average loss: 2618971.5000, val_loss: 1263442.3750
[Epoch: 15] Average loss: 1267613.8750, val_loss: 1381898.6250
[Epoch: 16] Average loss: 1174047.3750, val_loss: 12

[Epoch: 133] Average loss: 683302.6250, val_loss: 1030988.0000
[Epoch: 134] Average loss: 996865.7500, val_loss: 930610.9375
[Epoch: 135] Average loss: 749410.0000, val_loss: 791270.1875
[Epoch: 136] Average loss: 723304.2500, val_loss: 850585.0000
[Epoch: 137] Average loss: 1114959.5000, val_loss: 895399.0000
[Epoch: 138] Average loss: 1468740.3750, val_loss: 776931.7500
[Epoch: 139] Average loss: 1095280.8750, val_loss: 1125011.5000
[Epoch: 140] Average loss: 851565.4375, val_loss: 1247109.3750
[Epoch: 141] Average loss: 1373895.2500, val_loss: 1024858.6250
[Epoch: 142] Average loss: 2468091.5000, val_loss: 885125.4375
[Epoch: 143] Average loss: 2748547.0000, val_loss: 3136804.5000
[Epoch: 144] Average loss: 2613435.2500, val_loss: 1446914.3750
[Epoch: 145] Average loss: 1960609.7500, val_loss: 1651015.0000
[Epoch: 146] Average loss: 1821921.5000, val_loss: 878497.5000
[Epoch: 147] Average loss: 1004875.7500, val_loss: 763359.0625
[Epoch: 148] Average loss: 1653708.6250, val_loss: 11

[Epoch: 264] Average loss: 1023457.3125, val_loss: 1048299.3750
[Epoch: 265] Average loss: 1207557.3750, val_loss: 737010.2500
[Epoch: 266] Average loss: 998520.9375, val_loss: 771000.9375
[Epoch: 267] Average loss: 753298.6875, val_loss: 866822.0000
[Epoch: 268] Average loss: 955095.0625, val_loss: 671380.4375
[Epoch: 269] Average loss: 1089295.1250, val_loss: 649153.1250
[Epoch: 270] Average loss: 1058075.8750, val_loss: 1051339.7500
[Epoch: 271] Average loss: 906987.4375, val_loss: 804583.8125
[Epoch: 272] Average loss: 946468.3750, val_loss: 762191.7500
[Epoch: 273] Average loss: 1000018.5000, val_loss: 1175350.8750
[Epoch: 274] Average loss: 1057028.7500, val_loss: 944516.1250
[Epoch: 275] Average loss: 871559.5625, val_loss: 590988.9375
[Epoch: 276] Average loss: 890542.6875, val_loss: 688417.3750
[Epoch: 277] Average loss: 1168178.8750, val_loss: 712266.0000
[Epoch: 278] Average loss: 712269.1875, val_loss: 626743.7500
[Epoch: 279] Average loss: 1115143.1250, val_loss: 886534.06

[Epoch: 395] Average loss: 694509.6875, val_loss: 864341.1250
[Epoch: 396] Average loss: 812156.1250, val_loss: 1003165.5625
[Epoch: 397] Average loss: 932557.6875, val_loss: 802005.1875
[Epoch: 398] Average loss: 1123812.2500, val_loss: 797918.2500
[Epoch: 399] Average loss: 922911.8750, val_loss: 1306908.7500
[Epoch: 400] Average loss: 1339945.5000, val_loss: 784149.6250
[Epoch: 401] Average loss: 874877.4375, val_loss: 933198.5625
[Epoch: 402] Average loss: 1157159.6250, val_loss: 1518649.2500
[Epoch: 403] Average loss: 1448811.3750, val_loss: 878763.2500
[Epoch: 404] Average loss: 838751.0625, val_loss: 852995.6250
[Epoch: 405] Average loss: 1023073.0625, val_loss: 968316.6250
[Epoch: 406] Average loss: 1014418.9375, val_loss: 781441.8125
[Epoch: 407] Average loss: 1003898.4375, val_loss: 918278.2500
[Epoch: 408] Average loss: 1348484.0000, val_loss: 882824.3125
[Epoch: 409] Average loss: 940269.1875, val_loss: 800802.0000
[Epoch: 410] Average loss: 549489.5625, val_loss: 860943.62

In [13]:
metric = R2Score(device=device)
metric.reset()

model.eval()
criterion.eval()
with torch.no_grad():
    test_avg_acc = 0
    for X1_test, X2_test, Y_test in test_loader:
        X1_test = X1_test.to(device)
        X2_test = X2_test.to(device)
        Y_test = Y_test.to(device)
        
        test_prediction = model(X1_test, X2_test)
        metric.update([test_prediction, Y_test])
    print(f"Accuracy: {metric.compute()*100:.2f}%")

Accuracy: 86.98%
