In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data.dataset import random_split

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
root = "final.csv"
batch_size = 256
epochs = 200
learning_rate = 1e-3

In [3]:
data_pd = pd.read_csv(root)
data_pd.head()

Unnamed: 0,Id,Shape,Weight,Clarity,Colour,Cut,Polish,Symmetry,Fluorescence,Length,Width,Depth,Price
0,111000-5962,CUSHION,1.01,I2,FANCY,EX,VG,VG,N,5.89,5.63,3.53,1155.27
1,111000-6281,CUSHION,1.19,I2,FANCY,EX,VG,GD,M,5.97,5.59,3.8,3638.36
2,111000-6305,OVAL,1.0,SI2,U-V,EX,EX,VG,M,8.47,5.39,3.42,2237.73
3,111000-6320,PEAR,1.01,SI2,E,EX,VG,VG,N,9.39,5.52,3.24,2953.85
4,111000-6368,CUSHION,1.01,SI2,FANCY,VG,VG,GD,ST,6.1,5.36,3.48,2241.67


In [4]:
data_numpy = data_pd.to_numpy()

In [5]:
for i in [1, 3, 4, 5, 6, 7, 8]:
    wordset = {word: idx for idx, word in enumerate(np.unique(data_numpy[:,i]))}
    print(wordset)
    for row in range(len(data_numpy)):
        data_numpy[row][i] = wordset[data_numpy[row][i]]

{'CUSHION': 0, 'EMERALD': 1, 'HEART': 2, 'MARQUISE': 3, 'OVAL': 4, 'PEAR': 5, 'PRINCESS': 6, 'ROUND': 7}
{'FL': 0, 'I1': 1, 'I2': 2, 'I3': 3, 'IF': 4, 'SI1': 5, 'SI2': 6, 'VS1': 7, 'VS2': 8, 'VVS1': 9, 'VVS2': 10}
{'D': 0, 'E': 1, 'F': 2, 'FANCY': 3, 'G': 4, 'H': 5, 'I': 6, 'J': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'O': 12, 'O-P': 13, 'Q-R': 14, 'S-T': 15, 'U-V': 16, 'W': 17, 'W-X': 18, 'Y-Z': 19}
{'EX': 0, 'F': 1, 'GD': 2, 'VG': 3}
{'EX': 0, 'F': 1, 'GD': 2, 'VG': 3}
{'EX': 0, 'FR': 1, 'GD': 2, 'VG': 3}
{'F': 0, 'M': 1, 'N': 2, 'SL': 3, 'ST': 4, 'VS': 5, 'VSL': 6}


In [6]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self,ints, floats, target):
        super(Dataset).__init__()
        self.ints = ints
        self.floats = floats
        self.target = target
        
    def __getitem__(self,idx):
        return self.ints[idx],self.floats[idx], self.target[idx]
    
    def __len__(self):
        return len(self.ints)

In [7]:
data_int = torch.from_numpy(np.array(data_numpy[:,[1,3,4,5,6,7,8]], dtype="int"))
data_float = torch.from_numpy(np.array(data_numpy[:,[2,9,10,11]], dtype="float")).float()
data_target = torch.from_numpy(np.array(data_numpy[:,[12]], dtype="float")).float()
print(data_int[0])
print(data_float[0])
print(data_target[0])

tensor([0, 2, 3, 0, 3, 3, 2], dtype=torch.int32)
tensor([1.0100, 5.8900, 5.6300, 3.5300])
tensor([1155.2700])


In [8]:
train_length = int(len(data_numpy) * 0.6)
test_length = int(len(data_numpy) * 0.2)
val_length = len(data_numpy) - train_length - test_length

train_dataset = Dataset(data_int, data_float, data_target)
train_dataset, test_dataset = random_split(train_dataset, [train_length, test_length+val_length])
test_dataset, val_dataset = random_split(test_dataset, [test_length, val_length])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle = True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle = True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle = True)

In [9]:
print(train_length, test_length, val_length)

2823 941 942


In [10]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.emb1 = torch.nn.Embedding(8, 20)
        self.emb2 = torch.nn.Embedding(11, 20)
        self.emb3 = torch.nn.Embedding(20, 20)
        self.emb4 = torch.nn.Embedding(4, 20)
        self.emb5 = torch.nn.Embedding(4, 20)
        self.emb6 = torch.nn.Embedding(4, 20)
        self.emb7 = torch.nn.Embedding(7, 20)
        self.act = nn.ReLU()
        self.fc = nn.Linear(4, 80)
        self.fc1 = nn.Linear(220, 8192)
        self.fc2 = nn.Linear(8192, 8192)
        self.fc3 = nn.Linear(8192, 4096)
        self.fc4 = nn.Linear(4096, 2048)
        self.fc5 = nn.Linear(2048, 1)
        self.dropout = nn.Dropout()
    
    def forward(self, x, y):
        x1 = self.emb1(x[:,0])
        x2 = self.emb2(x[:,1])
        x3 = self.emb3(x[:,2])
        x4 = self.emb4(x[:,3])
        x5 = self.emb5(x[:,4])
        x6 = self.emb6(x[:,5])
        x7 = self.emb7(x[:,6])
        y = self.fc(y)
        x = torch.cat((x1, x2, x3, x4, x5, x6, x7, y), dim=1)
        x = self.dropout(self.act(self.fc1(x)))
        x = self.dropout(self.act(self.fc2(x)))
        x = self.dropout(self.act(self.fc3(x)))
        x = self.dropout(self.act(self.fc4(x)))
        return self.fc5(x)

In [11]:
model = Net().to(device)

criterion = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [12]:
for epoch in range(epochs):
    model.train()
    criterion.train()
    
    avg_loss = 0

    for X1, X2, Y in train_loader:
        X1 = X1.to(device)
        X2 = X2.to(device)
        Y = Y.to(device)

        model.zero_grad()  # why we use zero_grad?
        prediction = model(X1, X2)
        loss = torch.sqrt(criterion(prediction, Y)).to(device)
        loss.backward()
        optimizer.step()
        avg_loss += loss / len(train_loader)
    print(f'[Epoch: {epoch+1:>2}] Average loss: {avg_loss:.4f}, ', end='')
    
    model.eval()
    criterion.eval()
    with torch.no_grad():
        val_avg_loss = 0.
        for X1_val, X2_val, Y_val in val_loader:
            X1_val = X1_val.to(device)
            X2_val = X2_val.to(device)
            Y_val = Y_val.to(device)
            val_prediction = model(X1_val, X2_val)
            val_loss = torch.sqrt(criterion(val_prediction, Y_val)).to(device)
            val_avg_loss += val_loss / len(val_loader)
        
        print(f"val_loss: {val_avg_loss:.4f}")

[Epoch:  1] Average loss: 4499.2910, val_loss: 3936.0359
[Epoch:  2] Average loss: 4075.3438, val_loss: 2781.2327
[Epoch:  3] Average loss: 3587.3677, val_loss: 2510.3118
[Epoch:  4] Average loss: 3054.9487, val_loss: 2544.4656
[Epoch:  5] Average loss: 2444.0068, val_loss: 1622.6880
[Epoch:  6] Average loss: 1913.0565, val_loss: 1399.6243
[Epoch:  7] Average loss: 2369.5996, val_loss: 2470.9812
[Epoch:  8] Average loss: 2059.4697, val_loss: 1363.5911
[Epoch:  9] Average loss: 1770.1033, val_loss: 1183.0544
[Epoch: 10] Average loss: 1634.8228, val_loss: 964.7881
[Epoch: 11] Average loss: 1741.6593, val_loss: 1025.1941
[Epoch: 12] Average loss: 1486.2272, val_loss: 1699.4443
[Epoch: 13] Average loss: 1830.7837, val_loss: 1188.8289
[Epoch: 14] Average loss: 1495.6873, val_loss: 1119.8545
[Epoch: 15] Average loss: 1450.7186, val_loss: 972.4531
[Epoch: 16] Average loss: 1592.2949, val_loss: 2023.9189
[Epoch: 17] Average loss: 2172.3059, val_loss: 1174.6162
[Epoch: 18] Average loss: 1631.03

[Epoch: 147] Average loss: 862.0826, val_loss: 1018.4038
[Epoch: 148] Average loss: 867.0449, val_loss: 587.1291
[Epoch: 149] Average loss: 777.7991, val_loss: 643.0063
[Epoch: 150] Average loss: 820.5073, val_loss: 677.5059
[Epoch: 151] Average loss: 1280.9557, val_loss: 857.8763
[Epoch: 152] Average loss: 978.0987, val_loss: 690.2148
[Epoch: 153] Average loss: 847.6796, val_loss: 733.9296
[Epoch: 154] Average loss: 878.9752, val_loss: 754.2997
[Epoch: 155] Average loss: 806.6649, val_loss: 620.6897
[Epoch: 156] Average loss: 706.4814, val_loss: 689.5688
[Epoch: 157] Average loss: 870.4234, val_loss: 840.2589
[Epoch: 158] Average loss: 841.9087, val_loss: 659.7205
[Epoch: 159] Average loss: 908.7751, val_loss: 739.4746
[Epoch: 160] Average loss: 822.1196, val_loss: 809.2917
[Epoch: 161] Average loss: 914.5387, val_loss: 931.6133
[Epoch: 162] Average loss: 963.8137, val_loss: 780.8416
[Epoch: 163] Average loss: 1030.1266, val_loss: 912.0414
[Epoch: 164] Average loss: 839.0438, val_loss

In [14]:
model.eval()
criterion.eval()
ss_tot = 0
ss_res = 0
with torch.no_grad():
    for X1_test, X2_test, Y_test in test_loader:
        X1_test = X1_test.to(device)
        X2_test = X2_test.to(device)
        Y_test = Y_test.to(device)
        prediction = model(X1_test, X2_test)
        prices_mean = torch.mean(Y_test)
        ss_tot += torch.sum((Y_test - prices_mean) ** 2)
        ss_res += torch.sum((Y_test - prediction) ** 2)
    
    accuracy = 1 - ss_res/ss_tot
    print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 90.27%
