In [74]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data.dataset import random_split
from ignite.contrib.metrics.regression import R2Score

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

In [2]:
root = "total.csv"
batch_size = 64
epochs = 200
learning_rate = 1e-4

In [3]:
data_pd = pd.read_csv(root)
data_pd.head()

Unnamed: 0,Id,Shape,Weight,Clarity,Colour,Cut,Polish,Symmetry,Fluorescence,Length,Width,Depth,Price
0,1638147,CUSHION,0.55,SI2,E,EX,EX,VG,N,5.05,4.35,2.94,1378.65
1,1630155,CUSHION,0.5,VVS1,FANCY,EX,EX,VG,F,4.6,4.31,2.92,1379.74
2,1612606,CUSHION,0.51,VS2,H,EX,EX,VG,N,4.71,4.35,2.94,1380.19
3,1638140,CUSHION,0.5,VS2,H,EX,EX,VG,N,4.91,4.26,2.88,1380.61
4,1536093,CUSHION,0.53,SI1,D,EX,VG,VG,N,4.7,4.46,3.01,1383.13


In [4]:
data_numpy = data_pd.to_numpy()

In [5]:
for i in [1, 3, 4, 5, 6, 7, 8]:
    wordset = {word: idx for idx, word in enumerate(np.unique(data_numpy[:,i]))}
    print(wordset)
    for row in range(len(data_numpy)):
        data_numpy[row][i] = wordset[data_numpy[row][i]]

{'CUSHION': 0, 'EMERALD': 1, 'HEART': 2, 'MARQUISE': 3, 'OVAL': 4, 'PEAR': 5, 'PRINCESS': 6, 'ROUND': 7}
{'FL': 0, 'I1': 1, 'I2': 2, 'I3': 3, 'IF': 4, 'SI1': 5, 'SI2': 6, 'VS1': 7, 'VS2': 8, 'VVS1': 9, 'VVS2': 10}
{'D': 0, 'E': 1, 'F': 2, 'FANCY': 3, 'G': 4, 'H': 5, 'I': 6, 'J': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'O': 12, 'O-P': 13, 'Q-R': 14, 'S-T': 15, 'U-V': 16, 'W': 17, 'W-X': 18, 'Y-Z': 19}
{'EX': 0, 'F': 1, 'GD': 2, 'VG': 3}
{'EX': 0, 'F': 1, 'GD': 2, 'VG': 3}
{'EX': 0, 'FR': 1, 'GD': 2, 'VG': 3}
{'F': 0, 'M': 1, 'N': 2, 'SL': 3, 'ST': 4, 'VS': 5, 'VSL': 6}


In [6]:
data_numpy = data_numpy[:,1:]
print(data_numpy[0])

[0 0.55 6 1 0 0 3 2 5.05 4.35 2.94 1378.65]


In [58]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self,ints, floats, target):
        super(Dataset).__init__()
        self.ints = ints
        self.floats = floats
        self.target = target
        
    def __getitem__(self,idx):
        return self.ints[idx],self.floats[idx], self.target[idx]
    
    def __len__(self):
        return len(self.ints)

In [66]:
data_int = torch.from_numpy(np.array(data_numpy[:,[0,2,3,4,5,6,7]], dtype="int"))
data_float = torch.from_numpy(np.array(data_numpy[:,[1,8,9,10]], dtype="float")).float()
data_target = torch.from_numpy(np.array(data_numpy[:,[11]], dtype="float")).float()
print(data_int[0])
print(data_float[0])
print(data_target[0])

tensor([0, 6, 1, 0, 0, 3, 2], dtype=torch.int32)
tensor([0.5500, 5.0500, 4.3500, 2.9400])
tensor([1378.6500])


In [67]:
train_length = int(len(data_numpy) * 0.6)
test_length = int(len(data_numpy) * 0.2)
val_length = len(data_numpy) - train_length - test_length

train_dataset = Dataset(data_int, data_float, data_target)
train_dataset, test_dataset = random_split(train_dataset, [train_length, test_length+val_length])
test_dataset, val_dataset = random_split(test_dataset, [test_length, val_length])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle = True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle = True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle = True)

In [68]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.emb1 = torch.nn.Embedding(8, 3)
        self.emb2 = torch.nn.Embedding(11, 3)
        self.emb3 = torch.nn.Embedding(20, 3)
        self.emb4 = torch.nn.Embedding(4, 3)
        self.emb5 = torch.nn.Embedding(4, 3)
        self.emb6 = torch.nn.Embedding(4, 3)
        self.emb7 = torch.nn.Embedding(7, 3)
        self.act = nn.ReLU()
        self.fc1 = nn.Linear(7*3+4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 1)
    
    def forward(self, x, y):
        x1 = self.emb1(x[:,0])
        x2 = self.emb2(x[:,1])
        x3 = self.emb3(x[:,2])
        x4 = self.emb4(x[:,3])
        x5 = self.emb5(x[:,4])
        x6 = self.emb6(x[:,5])
        x7 = self.emb7(x[:,6])
        x = torch.cat((x1, x2, x3, x4, x5, x6, x7, y), dim=1)
        x = self.act(self.fc1(x))
        x = self.act(self.fc2(x))
        x = self.fc3(x)
        return x

In [69]:
model = Net().to(device)

criterion = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [72]:
for epoch in range(epochs):
    avg_loss = 0

    for X1, X2, Y in train_loader:
        X1 = X1.to(device)
        X2 = X2.to(device)
        Y = Y.to(device)

        model.zero_grad()  # why we use zero_grad?
        prediction = model(X1, X2)
        loss = criterion(prediction, Y)
        loss.backward()
        optimizer.step()
        avg_loss += loss / len(train_loader)
    print(f'[Epoch: {epoch+1:>2}] Average loss: {avg_loss:.4f}, ', end='')
    with torch.no_grad():
        val_avg_loss = 0.
        for X1_val, X2_val, Y_val in val_loader:
            X1_val = X1_val.to(device)
            X2_val = X2_val.to(device)
            Y_val = Y_val.to(device)
            val_prediction = model(X1_val, X2_val)
            val_loss = criterion(val_prediction, Y_val)
            val_avg_loss += val_loss / len(val_loader)
        
        print(f"test_loss: {val_avg_loss:.4f}")

[Epoch:  1] Average loss: 13493655.0000, test_loss: 10915603.0000
[Epoch:  2] Average loss: 13440445.0000, test_loss: 10813842.0000
[Epoch:  3] Average loss: 13426317.0000, test_loss: 10807034.0000
[Epoch:  4] Average loss: 15054504.0000, test_loss: 11466920.0000
[Epoch:  5] Average loss: 15036650.0000, test_loss: 11115783.0000
[Epoch:  6] Average loss: 13365760.0000, test_loss: 10733439.0000
[Epoch:  7] Average loss: 13343042.0000, test_loss: 10710504.0000
[Epoch:  8] Average loss: 13326338.0000, test_loss: 10798163.0000
[Epoch:  9] Average loss: 13332097.0000, test_loss: 10690954.0000
[Epoch: 10] Average loss: 13288410.0000, test_loss: 10674899.0000
[Epoch: 11] Average loss: 13273007.0000, test_loss: 10644811.0000
[Epoch: 12] Average loss: 13254641.0000, test_loss: 10640309.0000
[Epoch: 13] Average loss: 13236126.0000, test_loss: 10636708.0000
[Epoch: 14] Average loss: 13213132.0000, test_loss: 10604341.0000
[Epoch: 15] Average loss: 13206429.0000, test_loss: 11285371.0000
[Epoch: 16

[Epoch: 126] Average loss: 11227356.0000, test_loss: 8961954.0000
[Epoch: 127] Average loss: 11175677.0000, test_loss: 9217221.0000
[Epoch: 128] Average loss: 11157528.0000, test_loss: 9248045.0000
[Epoch: 129] Average loss: 11147063.0000, test_loss: 8926854.0000
[Epoch: 130] Average loss: 11124203.0000, test_loss: 9044459.0000
[Epoch: 131] Average loss: 11113030.0000, test_loss: 8926342.0000
[Epoch: 132] Average loss: 11105127.0000, test_loss: 8889637.0000
[Epoch: 133] Average loss: 11063978.0000, test_loss: 8856850.0000
[Epoch: 134] Average loss: 11064783.0000, test_loss: 8852748.0000
[Epoch: 135] Average loss: 11031049.0000, test_loss: 8965847.0000
[Epoch: 136] Average loss: 11001482.0000, test_loss: 8905648.0000
[Epoch: 137] Average loss: 10982686.0000, test_loss: 8793037.0000
[Epoch: 138] Average loss: 11133085.0000, test_loss: 8789279.0000
[Epoch: 139] Average loss: 10954764.0000, test_loss: 8784596.0000
[Epoch: 140] Average loss: 10932207.0000, test_loss: 8834406.0000
[Epoch: 14

In [130]:
metric = R2Score(device=device)
metric.reset()
with torch.no_grad():
    test_avg_acc = 0
    for X1_test, X2_test, Y_test in test_loader:
        X1_test = X1_test.to(device)
        X2_test = X2_test.to(device)
        Y_test = Y_test.to(device)
        
        test_prediction = model(X1_test, X2_test)
        metric.update([test_prediction, Y_test])
#         test_avg_acc += state.metrics['r2'] / len(test_loader)
    print(metric.compute())

0.4687736714892158
