In [1]:
from imdb import *
from utils import *
from config import *
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

# Dataset

In [2]:
# Folder holding the IMDB review data.
# Raw string: "\." is not a valid escape sequence, so a plain literal triggers an
# invalid-escape warning on recent Python versions. The runtime value is unchanged.
# NOTE(review): the backslash is kept to preserve the existing path — confirm it is intended.
path = r"../\.data/imdb"
imdb_reviews = IMDBMovieReviews(path)
train_data, test_data = imdb_reviews.get_data()

In [3]:
# Split the loaded training data into a training part and a dev part.
# NOTE(review): this rebinds train_data, so re-running the cell alone splits the
# already-split training data again. Restart and run all cells to get a clean split.
train_data, dev_data = imdb_reviews.split_data(train_data)

In [4]:
# Tokenize every split with the same MAX_SEQ_LEN setting.
# NOTE(review): the return value is discarded, so tokenize() presumably updates
# each split in place — confirm against imdb.py.
for data in (train_data, dev_data, test_data):
    imdb_reviews.tokenize(data, max_seq_len=MAX_SEQ_LEN) 

In [5]:
# Raw string: "\." is not a valid escape sequence, so a plain literal triggers an
# invalid-escape warning on recent Python versions. The runtime value is unchanged.
# NOTE(review): the backslash is kept to preserve the existing path — confirm it is intended.
glove_path = r"../\.vector_cache/glove.6B.100d.txt"

# The vector size is encoded in the file name: "...glove.6B.100d.txt" -> "100d" -> 100.
glove_dim_tag = glove_path.split(".")[-2]
glove = GloVeWordEmbeddings(glove_path, int(glove_dim_tag.split("d")[0]))

# The vocabulary is built from the training split only.
token_to_index_mapping = imdb_reviews.create_vocab(train_data, unk_threshold=UNK_THRESHOLD)
token_to_glove_mapping = glove.get_token_to_embedding()
indices_found, embedding_matrix = imdb_reviews.get_embeds(
    token_to_index_mapping, token_to_glove_mapping, glove.get_num_dims()
)

In [6]:
# Integer class ids used for the two sentiment labels.
label_to_idx = {"neg": 0, "pos": 1}
# Apply the token vocabulary and the label mapping to every split.
# NOTE(review): return values are discarded, so both helpers presumably modify the
# split in place; re-running this cell on already-mapped data may not be safe — confirm.
for data in (train_data, dev_data, test_data):
    imdb_reviews.apply_vocab(data, token_to_index_mapping)
    imdb_reviews.apply_label_map(data, label_to_idx)

to be continued

In [7]:
# Index of the padding token in the vocabulary; shared by all datasets and the models.
pad_idx = token_to_index_mapping[PAD]

# One dataset wrapper per split, built in train / dev / test order.
train_dataset, dev_dataset, test_dataset = (
    SentimentDataset(split, pad_idx) for split in (train_data, dev_data, test_data)
)

# Only the training loader shuffles; dev and test keep the loader's default ordering.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=train_dataset.collate_fn,
)
dev_dataloader, test_dataloader = (
    DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=dataset.collate_fn)
    for dataset in (dev_dataset, test_dataset)
)

In [8]:
# Sanity check: shape of the embedding matrix built above (the models below use
# shape[0] as the vocabulary size).
embedding_matrix.shape

(30590, 100)

# Model

In [9]:
class RNN(nn.Module):
    """Bidirectional vanilla-RNN classifier on top of pre-initialised embeddings.

    Token ids are embedded and run through ``nn.RNN``. The final hidden states of
    the last layer (forward and backward direction) are concatenated and passed
    through a small two-layer classification head.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: hidden size of the RNN, per direction.
        n_labels: number of output classes.
        n_rnn_layers: number of stacked RNN layers.
        pad_idx: index of the padding token, forwarded as ``padding_idx``.
        embedding_matrix: NumPy array copied into the embedding weights.
        freeze: accepted but currently not used; the embedding weights stay trainable.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_labels, n_rnn_layers, pad_idx, embedding_matrix, freeze=True):
        super().__init__()
        self.pad_idx = pad_idx

        # Embedding table, overwritten with the supplied pretrained vectors.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        pretrained_vectors = torch.from_numpy(embedding_matrix)
        with torch.no_grad():
            self.embedding.weight.copy_(pretrained_vectors)

        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_rnn_layers,
            batch_first=True,
            bidirectional=True,
        )

        # Classification head; its input is the forward + backward state, hence 2 * hidden_dim.
        head_width = 16 * 2
        self.fc1 = nn.Linear(hidden_dim * 2, head_width)
        self.dropout = nn.Dropout(0.15)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(head_width, n_labels)

    def forward(self, text):
        """Return unnormalised class scores, one row per sequence in ``text``.

        Assumes ``text`` is a (batch, seq_len) tensor of token ids, as
        ``batch_first=True`` suggests — TODO confirm against the collate function.
        """
        embedded = self.embedding(text)
        _, final_hidden = self.rnn(embedded)

        # The last two entries of the final hidden state belong to the top layer:
        # index -2 is the forward direction, index -1 the backward direction.
        sentence_repr = torch.cat((final_hidden[-2], final_hidden[-1]), dim=-1)

        projected = self.fc1(sentence_repr)
        return self.fc2(self.relu(self.dropout(projected)))

In [10]:
class LSTM(nn.Module):
    """Bidirectional LSTM classifier on top of pre-initialised embeddings.

    Token ids are embedded and run through ``nn.LSTM``. The final hidden states of
    the last layer (forward and backward direction) are concatenated and passed
    through a small two-layer classification head.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: hidden size of the LSTM, per direction.
        n_labels: number of output classes.
        pad_idx: index of the padding token, forwarded as ``padding_idx``.
        embedding_matrix: NumPy array copied into the embedding weights.
        freeze: accepted but currently not used; the embedding weights stay trainable.
        n_rnn_layers: number of stacked LSTM layers. Defaults to 2, the value that
            was previously hard-coded, so existing calls behave as before.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_labels, pad_idx, embedding_matrix, freeze=True, n_rnn_layers=2):
        super().__init__()
        self.pad_idx = pad_idx

        # Embedding table, overwritten with the supplied pretrained vectors.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_rnn_layers,
            batch_first=True,
            bidirectional=True,
        )

        # Classification head; its input is the forward + backward state, hence 2 * hidden_dim.
        self.fc1 = nn.Linear(hidden_dim * 2, 16 * 2)
        self.dropout = nn.Dropout(0.15)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16 * 2, n_labels)

    def forward(self, text):
        """Return unnormalised class scores, one row per sequence in ``text``.

        Assumes ``text`` is a (batch, seq_len) tensor of token ids, as
        ``batch_first=True`` suggests — TODO confirm against the collate function.
        """
        embedded = self.embedding(text)
        # Only the final hidden state is used; per-step outputs and the cell state are dropped.
        _, (final_hidden, _) = self.lstm(embedded)

        # The last two entries belong to the top layer: -2 is forward, -1 is backward.
        output_f = final_hidden[-2, :, :]
        output_b = final_hidden[-1, :, :]
        features = torch.cat([output_f, output_b], dim=-1)

        features = self.fc1(features)
        features = self.dropout(features)
        features = self.relu(features)
        return self.fc2(features)

In [11]:
class GRU(nn.Module):
    """Bidirectional GRU classifier on top of pre-initialised embeddings.

    Token ids are embedded and run through ``nn.GRU``. The final hidden states of
    the last layer (forward and backward direction) are concatenated and passed
    through a small two-layer classification head.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: hidden size of the GRU, per direction.
        n_labels: number of output classes.
        pad_idx: index of the padding token, forwarded as ``padding_idx``.
        embedding_matrix: NumPy array copied into the embedding weights.
        freeze: accepted but currently not used; the embedding weights stay trainable.
        n_rnn_layers: number of stacked GRU layers. Defaults to 2, the value that
            was previously hard-coded, so existing calls behave as before.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_labels, pad_idx, embedding_matrix, freeze=True, n_rnn_layers=2):
        super().__init__()
        self.pad_idx = pad_idx

        # Embedding table, overwritten with the supplied pretrained vectors.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))

        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_rnn_layers,
            batch_first=True,
            bidirectional=True,
        )

        # Classification head; its input is the forward + backward state, hence 2 * hidden_dim.
        self.fc1 = nn.Linear(hidden_dim * 2, 16 * 2)
        self.dropout = nn.Dropout(0.15)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16 * 2, n_labels)

    def forward(self, text):
        """Return unnormalised class scores, one row per sequence in ``text``.

        Assumes ``text`` is a (batch, seq_len) tensor of token ids, as
        ``batch_first=True`` suggests — TODO confirm against the collate function.
        """
        embedded = self.embedding(text)
        _, final_hidden = self.gru(embedded)

        # The final hidden state has one entry per (layer, direction) with hidden_dim
        # features each, so it cannot go into fc1 (which expects 2 * hidden_dim) as is.
        # Take the top layer's forward (-2) and backward (-1) states and concatenate
        # them, exactly as the RNN and LSTM models in this notebook do.
        output_f = final_hidden[-2, :, :]
        output_b = final_hidden[-1, :, :]
        features = torch.cat([output_f, output_b], dim=-1)

        features = self.fc1(features)
        features = self.dropout(features)
        features = self.relu(features)
        return self.fc2(features)

In [12]:
# Model under training. Other variants tried earlier with the same remaining
# arguments: RNN(..., n_rnn_layers=1) and RNN(..., n_rnn_layers=2).
# Both embedding sizes are read from the matrix itself, so the model cannot drift
# out of sync with the GloVe file that produced it.
model = LSTM(
    vocab_size=embedding_matrix.shape[0],
    embedding_dim=embedding_matrix.shape[1],
    hidden_dim=256,
    n_labels=2,
    pad_idx=pad_idx,
    embedding_matrix=embedding_matrix,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Adam with a learning rate that is multiplied by 0.9 on every scheduler step.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

# Training

In [13]:
# model = rnn2
# model.to(device)

In [14]:
# torch.cuda.empty_cache()

In [15]:
# for epoch in range(N_EPOCHS):
#     pass
# h0 = torch.zeros(1, BATCH_SIZE, 100)
def _module_device(module):
    """Return the device of the module's first parameter (CPU if it has none)."""
    first_param = next(module.parameters(), None)
    return first_param.device if first_param is not None else torch.device("cpu")


def _count_correct(logits, labels):
    """Return how many rows of `logits` have their arg-max class equal to `labels`."""
    return int(logits.argmax(dim=1).eq(labels).sum().item())


def fit(model, epochs):
    """Train `model` for `epochs` epochs and record the accuracy after each one.

    Relies on the notebook-level `train_dataloader`, `dev_dataloader`, `optimizer`,
    `criterion` and `scheduler`. The optimizer must have been built from this
    model's parameters, otherwise the update step does not touch the model.

    Batches are moved to whatever device the model lives on, so the function runs
    on CPU-only machines as well as on GPU.

    Args:
        model: module mapping a batch of inputs to (batch, n_classes) scores.
        epochs: number of passes over `train_dataloader`.

    Returns:
        (ac_train, ac_validate): two lists holding one accuracy in [0, 1] per epoch,
        measured on the training batches (during training) and on `dev_dataloader`.
        An empty loader yields an accuracy of 0.0 instead of a division by zero.
    """
    device = _module_device(model)
    ac_train = []
    ac_validate = []

    for epoch in range(epochs):
        print('epoch:{}'.format(epoch))

        # ---- training pass ----
        model.train()
        correct_train = 0
        num_train = 0
        for X, Y in train_dataloader:
            X = X.to(device)
            Y = Y.to(device)

            optimizer.zero_grad()
            # The scores are used as returned: squeezing them would collapse the
            # batch dimension of a final batch holding a single example.
            out = model(X)
            loss = criterion(out, Y)
            loss.backward()
            optimizer.step()

            correct_train += _count_correct(out, Y)
            num_train += X.shape[0]
        # Learning-rate decay is applied once per epoch.
        scheduler.step()
        ac_train.append(correct_train / num_train if num_train else 0.0)

        # ---- validation pass (no gradients, dropout disabled) ----
        model.eval()
        correct_validate = 0
        num_validate = 0
        with torch.no_grad():
            for X, Y in dev_dataloader:
                X = X.to(device)
                Y = Y.to(device)
                correct_validate += _count_correct(model(X), Y)
                num_validate += X.shape[0]
        ac_validate.append(correct_validate / num_validate if num_validate else 0.0)

    return ac_train, ac_validate

# ac=float(correct)/float(num_train)
# print('ac:{}'.format(ac))

epoch:0


KeyboardInterrupt: 

In [None]:
# NOTE(review): ac_train / ac_validate are not assigned at notebook level in the
# visible cells; presumably they come from `ac_train, ac_validate = fit(model, N_EPOCHS)` — confirm.
# Draw on the axes created by plt.subplots() instead of adding a second, overlapping
# subplot(121) on top of it, which left a stray empty axes in the figure.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ac_train, label='Training')
ax.plot(ac_validate, label='Validation')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy per epoch')
ax.legend()
plt.show()

In [None]:
# Training accuracy values.
# NOTE(review): presumably the first list returned by fit() — confirm where ac_train is assigned.
print(ac_train)

In [None]:
# Validation accuracy values.
# NOTE(review): presumably the second list returned by fit() — confirm where ac_validate is assigned.
print(ac_validate)

In [None]:
'验证集测试'
# The bare string above is a no-op cell label meaning "validation-set evaluation".
# This cell measures the accuracy of `model` on dev_dataloader and prints it.
correct_validate = 0
num_validate = 0
# NOTE(review): this reset discards any per-epoch history stored in ac_validate;
# kept because other cells may expect the name to exist — confirm it is wanted.
ac_validate = []

# Move batches to the device the model lives on instead of assuming a GPU.
eval_device = next(model.parameters()).device

model.eval()
with torch.no_grad():
    for X, Y in dev_dataloader:
        X = X.to(eval_device)
        Y = Y.to(eval_device)
        # Scores are used as returned: squeezing them would collapse the batch
        # dimension of a final batch holding a single example.
        out = model(X)
        pred = out.argmax(dim=1)
        correct_validate += int(pred.eq(Y).sum().item())
        num_validate += X.shape[0]
print(float(correct_validate)/float(num_validate))

In [None]:
# Saves the whole model object (not just its state_dict) to 'lstm.pkl' in the
# current working directory.
torch.save(model, 'lstm.pkl')

In [None]:
model_file = 'lstm.pkl'
# SECURITY: the checkpoint is a pickled module and unpickling can execute arbitrary
# code — only load files you created yourself.
# map_location places the tensors on the device selected earlier, so a checkpoint
# saved on a GPU can still be loaded on a CPU-only machine.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects a fully pickled module; pass weights_only=False there if loading fails.
model_copy = torch.load(model_file, map_location=device)

In [None]:
# Print the reloaded model to check that the checkpoint round-trip worked.
print(model_copy)

In [None]:
'测试集'
def test(model_to_test):
    correct_test=0
    num_test=0
    # ac_validate=[]
    model_to_test.eval()
    with torch.no_grad():
        for batch, (X, Y)  in enumerate(test_dataloader):
            X = X.cuda()
            Y = Y.cuda()
            num_test+=X.shape[0]
            out = model_to_test(X)
            out = torch.squeeze(out)
            pred = out.argmax(dim=1)
            # print(pred)

            correct_test+=pred.eq(Y).sum()
            loss = criterion(out, Y)
            # optimizer.step()
    print(float(correct_test)/float(num_test))