In [1]:
from tqdm import tqdm
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import string
from data_preprocessing import load_sst
from utils import remove_special_content, replace_punct, preprocess
import numpy as np
import pandas as pd
import gensim.downloader as api
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import text_to_word_sequence
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
SEED = 42

torch.manual_seed(SEED)

[nltk_data] Downloading package stopwords to C:\Users\Chihao
[nltk_data]     Shen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Chihao
[nltk_data]     Shen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<torch._C.Generator at 0x22369873cd0>

## Validation for choosing hyperparameters

### Load dataset

In [2]:
train_data, train_labels, val_data, val_labels, dev_data, dev_labels = load_sst()

INFO: SST-2 loaded


In [3]:
df_train = pd.DataFrame({'content':train_data, 'sentiment':train_labels})
df_train.reset_index(drop=True, inplace=True)
X_train, X_val, y_train, y_val = train_test_split(df_train['content'], df_train['sentiment'], test_size=0.2, random_state=42)


In [4]:
model_type = 'word2vec-google-news-300'  # 25 100 200 word2vec 300 
w2vModel = api.load(model_type)

INFO: loading projection weights from C:\Users\Chihao Shen/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
INFO: KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from C:\\Users\\Chihao Shen/gensim-data\\word2vec-google-news-300\\word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-04-15T01:47:11.501574', 'gensim': '4.3.2', 'python': '3.10.11 (tags/v3.10.11:7d4cc5a, Apr  5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'load_word2vec_format'}


In [5]:
texts = X_train
val_text = X_val

### Do word vector embedding

In [6]:
# do tokenization
def tokenize(texts, max_length, tokenizer):
    texts = [s.lower() for s in texts]
    data = torch.zeros((len(texts), max_length), dtype=torch.int)
    for i, sentences in enumerate(texts):
        word_sequence = text_to_word_sequence(sentences)
        j = 0
        for word in word_sequence:
            try:
                if j < max_length:
                    data[i, j] = tokenizer[word]
                    j += 1
            except:
                data[i, j] = 0
                j += 1
    return data

In [7]:
# load word model
words = list(w2vModel.key_to_index.keys())

def yield_tokens(data_iter):
    for text in data_iter:
        yield text_to_word_sequence(text)

vocab = build_vocab_from_iterator(yield_tokens(words))

In [8]:
max_length_1 = 56
max_length_2 = 1506

word_to_idx = {word: idx for idx, word in enumerate(vocab.get_itos())}
data = tokenize(texts, max_length_1, word_to_idx)
val_data = tokenize(val_text, max_length_1, word_to_idx)

In [9]:
def process(data, ori_df):
    labels = ori_df.values
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
    
    return data, labels

X_train_tensor, y_train= process(data, y_train)
X_val_tensor, y_val = process(val_data, y_val)

Shape of data tensor: torch.Size([53879, 56])
Shape of label tensor: (53879,)
Shape of data tensor: torch.Size([13470, 56])
Shape of label tensor: (13470,)


In [10]:
GLOVE_DIM = int(model_type.split('-')[-1])

def embed(model, word_to_idx):
    embed_size = GLOVE_DIM
    embedding_matrix = np.zeros((len(word_to_idx)+1, embed_size), dtype=np.float32)

    hits = 0
    misses = 0

    for word, i in word_to_idx.items():
        try:
            embedding_vector = model.get_vector(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                hits += 1
            else:
                misses += 1
        except:
            misses += 1
            
    print("Converted %d words (%d misses)" % (hits, misses))
    return torch.tensor(embedding_matrix)


embedding_matrix = embed (w2vModel, word_to_idx)

Converted 159622 words (613542 misses)


In [11]:
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X =torch.tensor(X, dtype=torch.int)
        self.y =torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self,idx):
        return self.X[idx], self.y[idx]

def initialize_loader(X_train_tensor, X_val_tensor, y_train, y_val, batch_size=50):
    # no need to do the scale since original wv already did
    train_data = CustomDataset(X_train_tensor, y_train)
    val_data = CustomDataset(X_val_tensor, y_val)

    # convert to DataLoader for batch processing and shuffling
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
    val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    for inputs, targets in val_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
    return train_dataloader, val_dataloader

### Model structure

In [12]:
# model
class CNN(nn.Module):
    def __init__(self, embedding_matrix, dim_in, dim_conv, dim_out, dropout_rate=0.5, max_length=max_length_1):
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(embedding_matrix.shape[0], dim_in).from_pretrained(embedding_matrix, freeze=True)
        self.conv1_1 = nn.Conv1d(dim_in, dim_conv, 3, padding=5)
        self.conv1_2 = nn.Conv1d(dim_in, dim_conv, 4, padding=5)
        self.conv1_3 = nn.Conv1d(dim_in, dim_conv, 5, padding=5)
        self.conv1_4 = nn.Conv1d(dim_in, dim_conv, 6, padding=5)
        self.ReLU = nn.ReLU()
        self.maxpool_1 = nn.MaxPool1d(kernel_size=max_length+8)
        self.maxpool_2 = nn.MaxPool1d(kernel_size=max_length+7)
        self.maxpool_3 = nn.MaxPool1d(kernel_size=max_length+6)
        self.maxpool_4 = nn.MaxPool1d(kernel_size=max_length+5)
        self.fc_1 = nn.Linear(dim_conv * 4, dim_out)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.softmax = nn.Softmax()


    def forward(self, x):
        x = self.embedding(x).transpose(1, 2)
        x1 = self.conv1_1(x)
        x1 = self.ReLU(x1)
        x1 = self.maxpool_1(x1)


        x2 = self.conv1_2(x)
        x2 = self.ReLU(x2)
        x2 = self.maxpool_2(x2)

        x3 = self.conv1_3(x)
        x3 = self.ReLU(x3)
        x3 = self.maxpool_3(x3)

        x4 = self.conv1_4(x)
        x4 = self.ReLU(x4)
        x4 = self.maxpool_4(x4)

        x = torch.cat((x1, x2, x3, x4), dim=1)
        x = torch.flatten(x, 1)
        x = self.fc_1(x)
        x = self.dropout(x)
        x = self.softmax(x)
        return x

### Do validation to choose the best hyperparameters

In [13]:
# training
max_norm = 3
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    optimizer.zero_grad()
    train_loss, correct_num = 0, 0
    model.train()
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        pred = model(X)
        loss = loss_fn(pred, y)

        loss.backward()

        for _, module in model.named_modules():
            if isinstance(module, nn.Linear):
                for _, param in module.named_parameters():
                    param_norm = param.data.norm(2)
                    if param_norm > max_norm:
                        param.data.mul_(max_norm / (param_norm + 1e-6))

        optimizer.step()
        model.eval()
        with torch.no_grad():
            pred = model(X)
            loss = loss_fn(pred, y)
            train_loss += loss.item() * X.size(0)
            correct_num += (torch.eq(torch.argmax(pred, dim=1), y)).type(torch.float).sum().item()

    train_loss /= size
    train_acc = correct_num / size
    return train_loss, train_acc

# validation
def val_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    val_loss, correct_num = 0, 0
    model.eval()  # inform no dropout and fix bn during testing

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)

            pred = model(X)
            val_loss += loss_fn(pred, y).item() * X.size(0)
            correct_num += (torch.eq(torch.argmax(pred, dim=1), y)).type(torch.float).sum().item()

    val_loss /= size
    val_acc = correct_num / size
    return val_loss, val_acc
        

In [14]:
patience = 10
train_loss_, train_acc_, val_loss_, val_acc_ = [], [], [], []
no_epochs = 100


dim_in = GLOVE_DIM
dim_out = 2
lr = 0.0005
batch_sizes = [64, 128, 256]
dim_convs = [128, 256, 512, 1024]

for batch_size in batch_sizes:
    for dim_conv in dim_convs:
        # set the seed to avoid dataloader difference
        torch.manual_seed(SEED)
        train_dataloader, val_dataloader = initialize_loader(X_train_tensor, X_val_tensor, y_train, y_val, batch_size=batch_size)

        print(f'batch size: {batch_size}; conv layer dimension: {dim_conv}')
        epochs_without_improvement = 0
        best_val_loss = np.Inf

        model = CNN(embedding_matrix, dim_in, dim_conv, dim_out)
        model.to(device)
        loss_fn = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        # start training
        for epoch in tqdm(range(no_epochs)):
            train_loss, train_acc = train_loop(train_dataloader, model, loss_fn, optimizer)
            val_loss, val_acc = val_loop(val_dataloader, model, loss_fn)

            train_loss_.append(train_loss), train_acc_.append(train_acc)
            val_loss_.append(val_loss), val_acc_.append(val_acc)

            # early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= patience:
                    print(f'early stopping after {epoch+1} epochs')
                    print(f'best test loss: {best_val_loss}')
                    break


            if (epoch+1) % 5 == 0:
                print(
                    f"Epoch {epoch+1}, train_loss {train_loss:>7f} train_acc {train_acc:>4f}, val_loss {val_loss:>7f}, val_acc {val_acc:>4f}")

  self.X =torch.tensor(X, dtype=torch.int)


batch size: 64; conv layer dimension: 128


  return self._call_impl(*args, **kwargs)
  5%|▌         | 5/100 [00:41<11:57,  7.56s/it]

Epoch 5, train_loss 0.355872 train_acc 0.963047, val_loss 0.383526, val_acc 0.930067


 10%|█         | 10/100 [01:24<13:32,  9.03s/it]

Epoch 10, train_loss 0.339517 train_acc 0.977078, val_loss 0.376589, val_acc 0.937936


 15%|█▌        | 15/100 [02:02<10:42,  7.56s/it]

Epoch 15, train_loss 0.333998 train_acc 0.981254, val_loss 0.375096, val_acc 0.937268


 20%|██        | 20/100 [02:39<09:45,  7.32s/it]

Epoch 20, train_loss 0.331455 train_acc 0.983018, val_loss 0.374154, val_acc 0.937936


 25%|██▌       | 25/100 [03:16<09:00,  7.20s/it]

Epoch 25, train_loss 0.330315 train_acc 0.983797, val_loss 0.373505, val_acc 0.937639


 30%|███       | 30/100 [03:53<08:35,  7.37s/it]

Epoch 30, train_loss 0.328868 train_acc 0.984966, val_loss 0.375330, val_acc 0.935412


 35%|███▌      | 35/100 [04:29<07:46,  7.18s/it]

Epoch 35, train_loss 0.328518 train_acc 0.985300, val_loss 0.374394, val_acc 0.936823


 40%|████      | 40/100 [05:07<07:19,  7.32s/it]

Epoch 40, train_loss 0.328060 train_acc 0.985634, val_loss 0.373656, val_acc 0.938382


 40%|████      | 40/100 [05:15<07:53,  7.89s/it]

early stopping after 41 epochs
best test loss: 0.37297010057576074



  self.X =torch.tensor(X, dtype=torch.int)


batch size: 64; conv layer dimension: 256


  5%|▌         | 5/100 [00:38<12:15,  7.74s/it]

Epoch 5, train_loss 0.351174 train_acc 0.966388, val_loss 0.381358, val_acc 0.931626


 10%|█         | 10/100 [01:16<11:09,  7.43s/it]

Epoch 10, train_loss 0.337131 train_acc 0.978879, val_loss 0.375340, val_acc 0.937639


 15%|█▌        | 15/100 [01:54<10:46,  7.60s/it]

Epoch 15, train_loss 0.333419 train_acc 0.981217, val_loss 0.374174, val_acc 0.938085


 20%|██        | 20/100 [02:32<10:04,  7.56s/it]

Epoch 20, train_loss 0.331256 train_acc 0.982721, val_loss 0.375393, val_acc 0.936526


 25%|██▌       | 25/100 [03:10<09:17,  7.44s/it]

Epoch 25, train_loss 0.330210 train_acc 0.983741, val_loss 0.374918, val_acc 0.936600


 28%|██▊       | 28/100 [03:42<09:33,  7.96s/it]

early stopping after 29 epochs
best test loss: 0.37397878350995606





batch size: 64; conv layer dimension: 512


  5%|▌         | 5/100 [00:53<16:54, 10.68s/it]

Epoch 5, train_loss 0.347810 train_acc 0.969042, val_loss 0.380144, val_acc 0.932220


 10%|█         | 10/100 [01:50<17:00, 11.34s/it]

Epoch 10, train_loss 0.336096 train_acc 0.978971, val_loss 0.375301, val_acc 0.936526


 15%|█▌        | 15/100 [02:47<16:09, 11.40s/it]

Epoch 15, train_loss 0.333192 train_acc 0.981013, val_loss 0.377378, val_acc 0.933333


 20%|██        | 20/100 [03:42<14:34, 10.93s/it]

Epoch 20, train_loss 0.331331 train_acc 0.982516, val_loss 0.372697, val_acc 0.938901


 25%|██▌       | 25/100 [04:37<13:40, 10.94s/it]

Epoch 25, train_loss 0.330820 train_acc 0.982962, val_loss 0.374608, val_acc 0.937491


 30%|███       | 30/100 [05:32<12:45, 10.93s/it]

Epoch 30, train_loss 0.330185 train_acc 0.983407, val_loss 0.373816, val_acc 0.937416


 31%|███       | 31/100 [05:54<13:09, 11.45s/it]

early stopping after 32 epochs
best test loss: 0.37231875157126165





batch size: 64; conv layer dimension: 1024


  5%|▌         | 5/100 [01:32<29:04, 18.37s/it]

Epoch 5, train_loss 0.349104 train_acc 0.966480, val_loss 0.383283, val_acc 0.928211


 10%|█         | 10/100 [03:12<29:25, 19.62s/it]

Epoch 10, train_loss 0.338073 train_acc 0.976466, val_loss 0.381310, val_acc 0.930661


 15%|█▌        | 15/100 [04:43<26:00, 18.36s/it]

Epoch 15, train_loss 0.334911 train_acc 0.979046, val_loss 0.378293, val_acc 0.933333


 20%|██        | 20/100 [06:14<24:12, 18.16s/it]

Epoch 20, train_loss 0.333628 train_acc 0.980085, val_loss 0.374185, val_acc 0.936526


 25%|██▌       | 25/100 [07:45<22:39, 18.13s/it]

Epoch 25, train_loss 0.333005 train_acc 0.980568, val_loss 0.375572, val_acc 0.935189


 30%|███       | 30/100 [10:11<34:42, 29.74s/it]

Epoch 30, train_loss 0.331526 train_acc 0.982071, val_loss 0.376197, val_acc 0.935561


 35%|███▌      | 35/100 [12:49<29:23, 27.14s/it]

Epoch 35, train_loss 0.331246 train_acc 0.982275, val_loss 0.375705, val_acc 0.935486


 35%|███▌      | 35/100 [13:07<24:23, 22.51s/it]

early stopping after 36 epochs
best test loss: 0.37410328097492124





batch size: 128; conv layer dimension: 128


  5%|▌         | 5/100 [00:42<14:06,  8.91s/it]

Epoch 5, train_loss 0.360562 train_acc 0.959576, val_loss 0.385859, val_acc 0.929621


 10%|█         | 10/100 [01:14<08:51,  5.91s/it]

Epoch 10, train_loss 0.343139 train_acc 0.975278, val_loss 0.378104, val_acc 0.935857


 15%|█▌        | 15/100 [01:49<09:50,  6.95s/it]

Epoch 15, train_loss 0.336353 train_acc 0.980048, val_loss 0.375697, val_acc 0.936897


 20%|██        | 20/100 [02:29<10:08,  7.61s/it]

Epoch 20, train_loss 0.332990 train_acc 0.982535, val_loss 0.373776, val_acc 0.938382


 25%|██▌       | 25/100 [03:12<10:34,  8.46s/it]

Epoch 25, train_loss 0.330947 train_acc 0.983816, val_loss 0.373722, val_acc 0.938307


 30%|███       | 30/100 [03:36<05:58,  5.13s/it]

Epoch 30, train_loss 0.329427 train_acc 0.984911, val_loss 0.374919, val_acc 0.936080


 35%|███▌      | 35/100 [03:57<04:47,  4.42s/it]

Epoch 35, train_loss 0.328837 train_acc 0.985208, val_loss 0.373321, val_acc 0.938159


 40%|████      | 40/100 [04:18<04:16,  4.28s/it]

Epoch 40, train_loss 0.328001 train_acc 0.985876, val_loss 0.374067, val_acc 0.937194


 45%|████▌     | 45/100 [04:40<04:03,  4.43s/it]

Epoch 45, train_loss 0.327602 train_acc 0.986173, val_loss 0.373452, val_acc 0.938010


 50%|█████     | 50/100 [05:24<06:38,  7.98s/it]

Epoch 50, train_loss 0.327220 train_acc 0.986581, val_loss 0.375024, val_acc 0.936377


 55%|█████▌    | 55/100 [06:03<05:42,  7.60s/it]

Epoch 55, train_loss 0.326954 train_acc 0.986674, val_loss 0.373392, val_acc 0.937788


 55%|█████▌    | 55/100 [06:08<05:01,  6.70s/it]

early stopping after 56 epochs
best test loss: 0.37316124169757475





batch size: 128; conv layer dimension: 256


  5%|▌         | 5/100 [00:28<08:59,  5.68s/it]

Epoch 5, train_loss 0.354309 train_acc 0.964365, val_loss 0.383700, val_acc 0.930512


 10%|█         | 10/100 [01:15<15:02, 10.03s/it]

Epoch 10, train_loss 0.338567 train_acc 0.978248, val_loss 0.378311, val_acc 0.936006


 15%|█▌        | 15/100 [02:24<18:41, 13.20s/it]

Epoch 15, train_loss 0.333564 train_acc 0.981830, val_loss 0.374824, val_acc 0.936674


 20%|██        | 20/100 [03:25<16:40, 12.51s/it]

Epoch 20, train_loss 0.331051 train_acc 0.983649, val_loss 0.375194, val_acc 0.936971


 25%|██▌       | 25/100 [04:23<14:30, 11.60s/it]

Epoch 25, train_loss 0.329594 train_acc 0.984502, val_loss 0.375820, val_acc 0.934744


 30%|███       | 30/100 [05:22<14:11, 12.17s/it]

Epoch 30, train_loss 0.328456 train_acc 0.985449, val_loss 0.374731, val_acc 0.935486


 35%|███▌      | 35/100 [06:27<14:01, 12.94s/it]

Epoch 35, train_loss 0.328029 train_acc 0.985802, val_loss 0.373205, val_acc 0.937862


 40%|████      | 40/100 [07:28<12:14, 12.25s/it]

Epoch 40, train_loss 0.327865 train_acc 0.985709, val_loss 0.373517, val_acc 0.937936


 45%|████▌     | 45/100 [08:31<11:22, 12.41s/it]

Epoch 45, train_loss 0.327408 train_acc 0.986117, val_loss 0.374766, val_acc 0.936229


 47%|████▋     | 47/100 [09:03<10:12, 11.56s/it]

early stopping after 48 epochs
best test loss: 0.3720446774095806





batch size: 128; conv layer dimension: 512


  5%|▌         | 5/100 [01:07<19:41, 12.44s/it]

Epoch 5, train_loss 0.349012 train_acc 0.968503, val_loss 0.379279, val_acc 0.933556


 10%|█         | 10/100 [01:58<14:38,  9.76s/it]

Epoch 10, train_loss 0.336318 train_acc 0.979287, val_loss 0.375711, val_acc 0.937120


 15%|█▌        | 15/100 [02:56<16:06, 11.37s/it]

Epoch 15, train_loss 0.332161 train_acc 0.982498, val_loss 0.375725, val_acc 0.936080


 20%|██        | 20/100 [03:37<11:37,  8.72s/it]

Epoch 20, train_loss 0.330532 train_acc 0.983686, val_loss 0.374963, val_acc 0.935932


 21%|██        | 21/100 [03:54<14:40, 11.14s/it]

early stopping after 22 epochs
best test loss: 0.37342219456266984





batch size: 128; conv layer dimension: 1024


  5%|▌         | 5/100 [01:23<26:48, 16.93s/it]

Epoch 5, train_loss 0.347627 train_acc 0.968745, val_loss 0.384616, val_acc 0.927617


 10%|█         | 10/100 [02:46<25:00, 16.67s/it]

Epoch 10, train_loss 0.336446 train_acc 0.978693, val_loss 0.374692, val_acc 0.936897


 15%|█▌        | 15/100 [04:09<23:37, 16.67s/it]

Epoch 15, train_loss 0.333399 train_acc 0.980809, val_loss 0.378456, val_acc 0.932814


 20%|██        | 20/100 [05:33<22:06, 16.58s/it]

Epoch 20, train_loss 0.331793 train_acc 0.982071, val_loss 0.374533, val_acc 0.936600


 25%|██▌       | 25/100 [06:55<20:46, 16.62s/it]

Epoch 25, train_loss 0.330669 train_acc 0.983185, val_loss 0.375129, val_acc 0.936897


 30%|███       | 30/100 [08:18<19:18, 16.54s/it]

Epoch 30, train_loss 0.330306 train_acc 0.983370, val_loss 0.374977, val_acc 0.935635


 35%|███▌      | 35/100 [09:40<17:48, 16.43s/it]

Epoch 35, train_loss 0.329690 train_acc 0.983927, val_loss 0.375371, val_acc 0.935115


 40%|████      | 40/100 [11:04<16:39, 16.66s/it]

Epoch 40, train_loss 0.329123 train_acc 0.984410, val_loss 0.374028, val_acc 0.936971


 45%|████▌     | 45/100 [12:28<15:29, 16.91s/it]

Epoch 45, train_loss 0.328409 train_acc 0.985022, val_loss 0.378813, val_acc 0.933482


 48%|████▊     | 48/100 [13:37<18:51, 21.75s/it]

Parameter of the best model: 

batch size: 128; conv layer dimension: 256