In [1]:
import pickle
import random
import numpy as np

from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader 

SEED = 42

In [2]:
models = ['glove-wiki-gigaword-200', 'word2vec-google-news-300']
model_type = models[1]
wv_path = ['./data/train_data/train', './data/test_data/test', './data/dev_data/dev']

with open(wv_path[0] + f'_{model_type}_label_sst2.pkl', 'rb') as f:
    y_train = pickle.load(f)
with open(wv_path[1] + f'_{model_type}_label_sst2.pkl', 'rb') as f:
    y_test = pickle.load(f)
with open(wv_path[2] + f'_{model_type}_label_sst2.pkl', 'rb') as f:
    y_dev = pickle.load(f)

X_train_tensor = torch.load(wv_path[0] + f'_{model_type}_tensor_sst2.pt')
X_test_tensor = torch.load(wv_path[1] + f'_{model_type}_tensor_sst2.pt')
X_dev_tensor = torch.load(wv_path[2] + f'_{model_type}_tensor_sst2.pt')

In [3]:
wv_num = X_train_tensor.shape[1]
max_length = X_train_tensor.shape[2]

In [4]:
print(f"Shape of training data {[X_train_tensor.shape[i] for i in range(3)]}")
print(f"Shape of training data {[X_dev_tensor.shape[i] for i in range(3)]}")

Shape of training data [67349, 300, 56]
Shape of training data [872, 300, 56]


In [5]:
# process labels
labels = list(set(y_train + y_dev))
label_num = len(labels)
for i in range(len(y_train)):
    y_train[i] = labels.index(y_train[i])
for i in range(len(y_dev)):
    y_dev[i] = labels.index(y_dev[i])
print(f"Number of label types: {label_num}")

Number of label types: 2


In [6]:
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X =torch.tensor(X, dtype=torch.float)
        self.y =torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self,idx):
        return self.X[idx], self.y[idx]

def initialize_loader(X_train_tensor, X_dev_tensor, y_train, y_dev, batch_size=64):
    # no need to do the scale since original wv already did
    train_data = CustomDataset(X_train_tensor, y_train)
    dev_data = CustomDataset(X_dev_tensor, y_dev)

    # convert to DataLoader for batch processing and shuffling
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
    dev_dataloader = DataLoader(dev_data, batch_size=batch_size, shuffle=False)
    for inputs, targets in dev_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
    return train_dataloader, dev_dataloader

In [7]:
# model
class CNN(nn.Module):
    def __init__(self, dim_in, dim_conv, dim_out, dropout_rate=0.5):
        super(CNN, self).__init__()
        self.conv1_3 = nn.Conv1d(dim_in, dim_conv, 3, padding=5)  # 33
        self.conv1_4 = nn.Conv1d(dim_in, dim_conv, 4, padding=5)  # 32
        self.conv1_5 = nn.Conv1d(dim_in, dim_conv, 5, padding=5)  # 31
        self.bn = nn.BatchNorm1d(dim_conv * 3)
        self.ReLU = nn.ReLU()
        self.maxpool_1 = nn.MaxPool1d(kernel_size=max_length+8)
        self.maxpool_2 = nn.MaxPool1d(kernel_size=max_length+7)
        self.maxpool_3 = nn.MaxPool1d(kernel_size=max_length+6)
        self.fc = nn.Linear(dim_conv * 3, dim_out)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.softmax = nn.Softmax()


    def forward(self, x):
        x1 = self.conv1_3(x)
        x1 = self.ReLU(x1)
        x1 = self.maxpool_1(x1)


        x2 = self.conv1_4(x)
        x2 = self.ReLU(x2)
        x2 = self.maxpool_2(x2)

        x3 = self.conv1_5(x)
        x3 = self.ReLU(x3)
        x3 = self.maxpool_3(x3)

        x = torch.cat((x1, x2, x3), dim=1)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        x = self.dropout(x)
        x = self.softmax(x)
        return x


In [8]:
# training
max_norm = 3
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    optimizer.zero_grad()
    train_loss, correct_num = 0, 0
    model.train()
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        pred = model(X)
        loss = loss_fn(pred, y)

        loss.backward()

        for _, module in model.named_modules():
            if isinstance(module, nn.Linear):
                for _, param in module.named_parameters():
                    param_norm = param.data.norm(2)
                    if param_norm > max_norm:
                        param.data.mul_(max_norm / (param_norm + 1e-6))

        optimizer.step()
        model.eval()
        with torch.no_grad():
            pred = model(X)
            loss = loss_fn(pred, y)
            train_loss += loss.item() * X.size(0)
            correct_num += (torch.eq(torch.argmax(pred, dim=1), y)).type(torch.float).sum().item()

    train_loss /= size
    train_acc = correct_num / size
    return train_loss, train_acc

def val_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    val_loss, correct_num = 0, 0
    model.eval()  # inform no dropout and fix bn during testing

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)

            pred = model(X)
            val_loss += loss_fn(pred, y).item() * X.size(0)
            correct_num += (torch.eq(torch.argmax(pred, dim=1), y)).type(torch.float).sum().item()

    val_loss /= size
    val_acc = correct_num / size
    return val_loss, val_acc
        

In [10]:
torch.manual_seed(SEED)
patience = 10
no_epochs = 100
lr = 0.001
dim_in = wv_num
dim_out = label_num

batch_sizes = [32, 64, 128]
dim_convs = [128, 256, 512, 1024]
for batch_size in batch_sizes:
    train_dataloader, dev_dataloader = initialize_loader(
        X_train_tensor, X_dev_tensor, y_train, y_dev, batch_size=batch_size)
    for dim_conv in dim_convs:
        print(f'batch size: {batch_size}; conv layer dimension: {dim_conv}')

        model = CNN(dim_in, dim_conv, dim_out)
        model.to(device)
        best_val_loss = np.Inf
        best_val_acc = 0
        train_loss_, train_acc_, val_loss_, val_acc_ = [], [], [], []
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        loss_fn = nn.CrossEntropyLoss()

        # start training
        for epoch in tqdm(range(no_epochs)):
            train_loss, train_acc = train_loop(
                train_dataloader, model, loss_fn, optimizer)
            val_loss, val_acc = val_loop(dev_dataloader, model, loss_fn)

            train_loss_.append(train_loss), train_acc_.append(train_acc)
            val_loss_.append(val_loss), val_acc_.append(val_acc)

            # early stopping
            if val_acc > best_val_acc:
                best_val_acc = val_acc
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= patience:
                    print(f'Early stopping after {epoch+1} epochs')
                    print(f'Best validation accuracy: {best_val_acc}')
                    break

            if (epoch+1) % 5 == 0:
                print(
                    f"Epoch {epoch+1}, train_loss {train_loss:>7f} train_acc {train_acc:>4f}, val_loss {val_loss:>7f}, val_acc {val_acc:>4f}")

  self.X =torch.tensor(X, dtype=torch.float)


batch size: 32; conv layer dimension: 128


  2%|▏         | 2/100 [00:31<25:43, 15.75s/it]


KeyboardInterrupt: 