In [1]:
import pickle
import numpy as np

from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader 

SEED = 42

In [2]:
models = ['glove-wiki-gigaword-200', 'word2vec-google-news-300']
model_type = models[1]
wv_path = ['./data/train_data/train', './data/test_data/test', './data/dev_data/dev']

with open(wv_path[0] + f'_{model_type}_label_sst2.pkl', 'rb') as f:
    y_train = pickle.load(f)
with open(wv_path[1] + f'_{model_type}_label_sst2.pkl', 'rb') as f:
    y_test = pickle.load(f)
with open(wv_path[2] + f'_{model_type}_label_sst2.pkl', 'rb') as f:
    y_dev = pickle.load(f)

X_train_tensor = torch.load(wv_path[0] + f'_{model_type}_tensor_sst2.pt')
X_test_tensor = torch.load(wv_path[1] + f'_{model_type}_tensor_sst2.pt')
X_dev_tensor = torch.load(wv_path[2] + f'_{model_type}_tensor_sst2.pt')

In [3]:
wv_num = X_train_tensor.shape[1]
max_length = X_train_tensor.shape[2]

In [4]:
print(f"Shape of training data {[X_train_tensor.shape[i] for i in range(3)]}")
print(f"Shape of validation data {[X_dev_tensor.shape[i] for i in range(3)]}")

Shape of training data [67349, 300, 56]
Shape of validation data [872, 300, 56]


In [5]:
# process labels
labels = list(set(y_train + y_dev))
label_num = len(labels)
print(f"Number of label types: {label_num}")

Number of label types: 2


In [6]:
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X =torch.tensor(X, dtype=torch.float)
        self.y =torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self,idx):
        return self.X[idx], self.y[idx]

def initialize_loader(X_train_tensor, X_dev_tensor, y_train, y_dev, batch_size=64):
    # no need to do the scale since original wv already did
    train_data = CustomDataset(X_train_tensor, y_train)
    dev_data = CustomDataset(X_dev_tensor, y_dev)

    # convert to DataLoader for batch processing and shuffling
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
    dev_dataloader = DataLoader(dev_data, batch_size=batch_size, shuffle=False)
    for inputs, targets in dev_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
    return train_dataloader, dev_dataloader

train_dataloader, dev_dataloader = initialize_loader(X_train_tensor, X_dev_tensor, y_train, y_dev)

  self.X =torch.tensor(X, dtype=torch.float)


In [7]:
# model
class CNN(nn.Module):
    def __init__(self, dim_in, dim_conv, dim_out, dropout_rate=0.5):
        super(CNN, self).__init__()
        self.conv1_3 = nn.Conv1d(dim_in, dim_conv, 3, padding=5)  # 33
        self.conv1_4 = nn.Conv1d(dim_in, dim_conv, 4, padding=5)  # 32
        self.conv1_5 = nn.Conv1d(dim_in, dim_conv, 5, padding=5)  # 31
        self.bn = nn.BatchNorm1d(dim_conv * 3)
        self.ReLU = nn.ReLU()
        self.maxpool_1 = nn.MaxPool1d(kernel_size=max_length+8)
        self.maxpool_2 = nn.MaxPool1d(kernel_size=max_length+7)
        self.maxpool_3 = nn.MaxPool1d(kernel_size=max_length+6)
        self.fc = nn.Linear(dim_conv * 3, dim_out)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.softmax = nn.Softmax()


    def forward(self, x):
        x1 = self.conv1_3(x)
        x1 = self.ReLU(x1)
        x1 = self.maxpool_1(x1)


        x2 = self.conv1_4(x)
        x2 = self.ReLU(x2)
        x2 = self.maxpool_2(x2)

        x3 = self.conv1_5(x)
        x3 = self.ReLU(x3)
        x3 = self.maxpool_3(x3)

        x = torch.cat((x1, x2, x3), dim=1)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        x = self.dropout(x)
        x = self.softmax(x)
        return x


In [8]:
torch.manual_seed(SEED)
dim_in = wv_num
dim_conv = 100
dim_out = label_num
lr = 0.001
model = CNN(dim_in, dim_conv, dim_out)
model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [9]:
# training
max_norm = 3
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    optimizer.zero_grad()
    train_loss, correct_num = 0, 0
    model.train()
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        pred = model(X)
        loss = loss_fn(pred, y)

        loss.backward()

        for _, module in model.named_modules():
            if isinstance(module, nn.Linear):
                for _, param in module.named_parameters():
                    param_norm = param.data.norm(2)
                    if param_norm > max_norm:
                        param.data.mul_(max_norm / (param_norm + 1e-6))

        optimizer.step()
        model.eval()
        with torch.no_grad():
            pred = model(X)
            loss = loss_fn(pred, y)
            train_loss += loss.item() * X.size(0)
            correct_num += (torch.eq(torch.argmax(pred, dim=1), y)).type(torch.float).sum().item()

    train_loss /= size
    train_acc = correct_num / size
    return train_loss, train_acc

def val_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    val_loss, correct_num = 0, 0
    model.eval()  # inform no dropout and fix bn during testing

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)

            pred = model(X)
            val_loss += loss_fn(pred, y).item() * X.size(0)
            correct_num += (torch.eq(torch.argmax(pred, dim=1), y)).type(torch.float).sum().item()

    val_loss /= size
    val_acc = correct_num / size
    return val_loss, val_acc
        

In [10]:
patience = 15
best_val_loss = np.Inf
train_loss_, train_acc_, val_loss_, val_acc_ = [], [], [], []
no_epochs = 100

# start training
for epoch in tqdm(range(no_epochs)):
    train_loss, train_acc = train_loop(train_dataloader, model, loss_fn, optimizer)
    val_loss, val_acc = val_loop(dev_dataloader, model, loss_fn)

    train_loss_.append(train_loss), train_acc_.append(train_acc)
    val_loss_.append(val_loss), val_acc_.append(val_acc)

    # early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        best_model = model.state_dict()
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f'early stopping after {epoch+1} epochs')
            print(f'best val loss: {val_loss}')
            break


    print(f"epoch {epoch+1}, train_loss {train_loss:>7f} train_acc {train_acc:>4f}, val_loss {val_loss:>7f}, val_acc {val_acc:>4f}")
torch.save(best_model, './model/CNN_vector_model.pth')

  return self._call_impl(*args, **kwargs)
  1%|          | 1/100 [00:09<15:54,  9.65s/it]

epoch 1, train_loss 0.431358 train_acc 0.886249, val_loss 0.469893, val_acc 0.832569


  2%|▏         | 2/100 [00:15<12:09,  7.44s/it]

epoch 2, train_loss 0.385579 train_acc 0.931298, val_loss 0.461705, val_acc 0.845183


  3%|▎         | 3/100 [00:21<10:36,  6.56s/it]

epoch 3, train_loss 0.370042 train_acc 0.947289, val_loss 0.465974, val_acc 0.831422


  4%|▍         | 4/100 [00:26<09:46,  6.11s/it]

epoch 4, train_loss 0.361085 train_acc 0.956198, val_loss 0.463066, val_acc 0.842890


  5%|▌         | 5/100 [00:32<09:21,  5.91s/it]

epoch 5, train_loss 0.355694 train_acc 0.960549, val_loss 0.463370, val_acc 0.842890


  6%|▌         | 6/100 [00:37<09:04,  5.80s/it]

epoch 6, train_loss 0.351695 train_acc 0.964201, val_loss 0.465605, val_acc 0.839450


  7%|▋         | 7/100 [00:42<08:43,  5.63s/it]

epoch 7, train_loss 0.349004 train_acc 0.966726, val_loss 0.472666, val_acc 0.830275


  8%|▊         | 8/100 [00:48<08:28,  5.52s/it]

epoch 8, train_loss 0.346294 train_acc 0.969012, val_loss 0.453411, val_acc 0.855505


  9%|▉         | 9/100 [00:53<08:15,  5.45s/it]

epoch 9, train_loss 0.344604 train_acc 0.970096, val_loss 0.464998, val_acc 0.839450


 10%|█         | 10/100 [00:58<08:11,  5.47s/it]

epoch 10, train_loss 0.343155 train_acc 0.971373, val_loss 0.462270, val_acc 0.841743


 11%|█         | 11/100 [01:04<08:10,  5.51s/it]

epoch 11, train_loss 0.341945 train_acc 0.972175, val_loss 0.458475, val_acc 0.852064


 12%|█▏        | 12/100 [01:10<08:03,  5.49s/it]

epoch 12, train_loss 0.341294 train_acc 0.972843, val_loss 0.459817, val_acc 0.844037


 13%|█▎        | 13/100 [01:15<07:57,  5.49s/it]

epoch 13, train_loss 0.340417 train_acc 0.973318, val_loss 0.455443, val_acc 0.854358


 14%|█▍        | 14/100 [01:21<07:52,  5.49s/it]

epoch 14, train_loss 0.339823 train_acc 0.973734, val_loss 0.482728, val_acc 0.824541


 15%|█▌        | 15/100 [01:26<07:43,  5.45s/it]

epoch 15, train_loss 0.339235 train_acc 0.974313, val_loss 0.455529, val_acc 0.854358


 16%|█▌        | 16/100 [01:32<07:42,  5.51s/it]

epoch 16, train_loss 0.338554 train_acc 0.974862, val_loss 0.456579, val_acc 0.852064


 17%|█▋        | 17/100 [01:37<07:36,  5.50s/it]

epoch 17, train_loss 0.338075 train_acc 0.975040, val_loss 0.455148, val_acc 0.853211


 18%|█▊        | 18/100 [01:43<07:34,  5.54s/it]

epoch 18, train_loss 0.337791 train_acc 0.975293, val_loss 0.455334, val_acc 0.857798


 19%|█▉        | 19/100 [01:48<07:34,  5.61s/it]

epoch 19, train_loss 0.337593 train_acc 0.975501, val_loss 0.454895, val_acc 0.854358


 20%|██        | 20/100 [01:54<07:27,  5.59s/it]

epoch 20, train_loss 0.337186 train_acc 0.975872, val_loss 0.461222, val_acc 0.840596


 21%|██        | 21/100 [02:01<07:50,  5.95s/it]

epoch 21, train_loss 0.337096 train_acc 0.975872, val_loss 0.459218, val_acc 0.846330


 22%|██▏       | 22/100 [02:10<08:50,  6.80s/it]

epoch 22, train_loss 0.336701 train_acc 0.976199, val_loss 0.453687, val_acc 0.853211


 22%|██▏       | 22/100 [02:18<08:11,  6.30s/it]

early stopping after 23 epochs
best val loss: 0.4584506386463795



