# If the preprocessed files are already present, start from here

In [13]:
import pandas as pd
import os
import pickle

# Directory from which the vocabulary pickle and the CSV splits are read.
data_dir = './data/pytorch'

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load a pickle file that you produced yourself or otherwise trust.
word_dict_path = os.path.join(data_dir, 'word_dict_amazon.pkl')
with open(word_dict_path, mode="rb") as f:
    word_dict = pickle.load(f)

In [14]:
import pandas as pd
import os
# Read both header-less CSV files from data_dir; columns are therefore
# labelled with integer positions (0, 1, 2, ...).
train, test_sample = (
    pd.read_csv(os.path.join(data_dir, csv_name), header=None, names=None)
    for csv_name in ('train_amazon.csv', 'test_amazon.csv')
)
print(train.shape, test_sample.shape)

(83000, 502) (21975, 502)


In [15]:
from sklearn.model_selection import train_test_split
# Split the sample read from test_amazon.csv in half: one half becomes the
# test set, the other the validation set.
# A fixed random_state makes the shuffle deterministic, so the same rows land
# in each half after every kernel restart and reported metrics stay comparable.
test, val = train_test_split(test_sample, test_size=0.5, random_state=42)
train.shape, test.shape, val.shape

((83000, 502), (10987, 502), (10988, 502))

In [16]:
import torch
import torch.utils.data

def _build_loader(frame, batch_size=50, shuffle=False):
    """
    Convert a DataFrame into tensors, a TensorDataset and a DataLoader.

    Column 0 of `frame` is used as the label (converted to float); every
    remaining column is used as an input feature (converted to long).

    Returns the tuple (features, labels, dataset, loader).
    """
    labels = torch.from_numpy(frame[[0]].values).float().squeeze()
    features = torch.from_numpy(frame.drop([0], axis=1).values).long()
    dataset = torch.utils.data.TensorDataset(features, labels)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return features, labels, dataset, loader


# The same recipe is applied to each split; one helper replaces the three
# copy-pasted variants so they cannot drift apart.
# NOTE(review): the training loader iterates in file order (shuffle=False, as
# before). If train_amazon.csv is not already shuffled, consider shuffle=True.
train_X, train_y, train_ds, train_dl = _build_loader(train)

# Validation data
val_X, val_y, val_ds, val_dl = _build_loader(val)

# Test data
test_X, test_y, test_ds, test_dl = _build_loader(test)
print(test_y.shape)

torch.Size([10987])


In [17]:
import torch.nn as nn
import numpy as np

class LSTMClassifier(nn.Module):
    """
    LSTM-based binary classifier used to perform Sentiment Analysis.

    The network is: embedding -> LSTM -> linear layer with one output ->
    sigmoid, so every input row yields a single value in (0, 1).
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        """
        Initialize the model by setting up the various layers.

        embedding_dim: size of each word embedding vector.
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of rows in the embedding table.
        """
        super(LSTMClassifier, self).__init__()

        # Index 0 is the padding token; its embedding is not updated.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.dense = nn.Linear(in_features=hidden_dim, out_features=1)
        self.sig = nn.Sigmoid()

        # Placeholder attribute; this class never assigns anything else to it.
        self.word_dict = None

    def forward(self, x):
        """
        Perform a forward pass of our model on some input.

        x: integer tensor with one row per sample. The first column holds the
           sequence length and the remaining columns hold the token ids.

        Returns a 1-D tensor with one probability per sample.
        """
        # The LSTM is not batch_first, so switch to (1 + seq_len, batch).
        x = x.t()
        lengths = x[0,:]
        reviews = x[1:,:]
        embeds = self.embedding(reviews)
        lstm_out, _ = self.lstm(embeds)
        out = self.dense(lstm_out)
        # For each sample pick the output at its last real token.
        # A stored length of 0 indexes position -1, i.e. the final time step.
        out = out[lengths - 1, range(len(lengths))]
        # squeeze(-1) drops only the trailing feature dimension. A bare
        # squeeze() would also drop the batch dimension when the batch holds
        # one sample, returning a 0-d tensor that no longer matches the labels.
        return self.sig(out.squeeze(-1))

In [24]:
# write to file.
# CSV file that receives the per-epoch metrics.
filename = "LSTM_amazon.csv"
def write_to_csv(epochs, train_loss, val_loss, val_acc, time_train):
    """
    Save the per-epoch metrics to `filename`, one row per epoch.

    epochs: number of epochs; rows are numbered 0 .. epochs - 1.
    train_loss, val_loss, val_acc, time_train: per-epoch sequences. Rows are
        built with zip, so the shortest sequence decides the row count.
    """
    header = ['Epoch', 'train_loss', 'val_loss', 'val_acc', 'train_time']
    rows = list(zip(range(epochs), train_loss, val_loss, val_acc, time_train))
    pd.DataFrame(rows, columns=header).to_csv(filename)
    
def append_to_csv(epochs, accuracy):
    """
    Add a 'Test_Accuracy' column to the metrics CSV stored at `filename`.

    epochs: kept for backward compatibility; the scalar accuracy is
        broadcast to every existing row, so the value is not needed.
    accuracy: test accuracy written into every row.
    """
    # The file is written with its index as the first column, so read it back
    # as the index; otherwise each round trip adds an extra 'Unnamed: 0' column.
    df_csv = pd.read_csv(filename, index_col=0)
    df_csv['Test_Accuracy'] = accuracy
    df_csv.to_csv(filename)

In [29]:
import time
def train(model, train_dl, val_dl, epochs, optimizer, loss_fn, device):
    """
    Train `model` for `epochs` epochs, validating after every epoch.

    model: network to optimise; its outputs are rounded to obtain class labels.
    train_dl, val_dl: iterables yielding (inputs, labels) batches.
    epochs: number of passes over train_dl.
    optimizer: optimizer already bound to the model parameters.
    loss_fn: callable taking (prediction, target) and returning a loss tensor.
    device: device that each batch is moved to.

    The per-epoch mean training loss, mean validation loss, validation
    accuracy and elapsed time are passed to write_to_csv after the last
    epoch. Returns the trained model.

    Note: defining this function rebinds the name `train`, which the
    data-loading cells use for the training DataFrame.
    """
    train_loss = []
    val_loss_epoch = []
    val_accuracy_epoch = []
    time_train = []
    for epoch in range(epochs):
        start = time.time()

        # Training pass
        model.train()
        total_loss = 0.0
        for batch_X, batch_y in train_dl:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            optimizer.zero_grad()
            prediction = model(batch_X)
            loss = loss_fn(prediction, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        mean_train_loss = total_loss / len(train_dl)
        train_loss.append(mean_train_loss)
        print("Epoch: {}, BCELoss: {}".format(epoch, mean_train_loss))

        # Validation pass: no gradients are tracked, so there is nothing for
        # the optimizer to reset here.
        model.eval()
        correct = 0
        total = 0
        val_loss = []
        with torch.no_grad():
            for inputs, labels in val_dl:
                inputs_val, labels_val = inputs.to(device), labels.to(device)
                prediction = model(inputs_val)
                val_loss.append(loss_fn(prediction, labels_val).item())
                # Round the outputs to the nearest integer to get class labels.
                result = torch.round(prediction)
                total += labels_val.size(0)
                correct += (result == labels_val).sum().item()
        mean_val_loss = np.mean(val_loss)
        val_accuracy = correct / total
        val_accuracy_epoch.append(val_accuracy)
        val_loss_epoch.append(mean_val_loss)

        # The recorded time covers both the training and the validation pass.
        elapsed = time.time() - start
        print("Val Loss: {:.3f}".format(mean_val_loss), "\tVal Acc: {:.3f}".format(val_accuracy))
        time_train.append(elapsed)
    write_to_csv(epochs, train_loss, val_loss_epoch, val_accuracy_epoch, time_train)
    return model


def test(model, test_dl, epochs):
    model.eval()
    correct = 0
    total = 0
#     results = []
#     labels = []
    with torch.no_grad():
        for batch in test_dl:         
            batch_X, batch_y = batch
            batch_X = batch_X.to(device)
            prediction = model(batch_X)
            result = np.round(prediction.cpu())
#             results.extend(list(result.numpy()))
#             labels.extend(list(batch_y.numpy()))
            total += batch_y.size(0)
            correct += (result == batch_y).sum().item()
    acc = correct/total
    append_to_csv(epochs, correct/total)
    print("Accuracy:", correct/total)

In [30]:
import torch.optim as optim
# Number of passes over the training data.
epochs = 10

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Keyword arguments make the meaning of each hyper-parameter explicit.
model = LSTMClassifier(embedding_dim=32, hidden_dim=100, vocab_size=10000).to(device)
optimizer = optim.Adam(model.parameters())
loss_fn = torch.nn.BCELoss()

# train(...) returns the model, so as the last expression it is displayed.
train(
    model,
    train_dl,
    val_dl,
    epochs=epochs,
    optimizer=optimizer,
    loss_fn=loss_fn,
    device=device,
)

Epoch: 1, BCELoss: 0.3734857620963131
Val Loss: 0.303 	Val Acc: 0.873
Epoch: 2, BCELoss: 0.277388487989644
Val Loss: 0.284 	Val Acc: 0.884
Epoch: 3, BCELoss: 0.24980635277657623
Val Loss: 0.276 	Val Acc: 0.888
Epoch: 4, BCELoss: 0.22989511548664915
Val Loss: 0.273 	Val Acc: 0.887
Epoch: 5, BCELoss: 0.20950295685913908
Val Loss: 0.281 	Val Acc: 0.886
Epoch: 6, BCELoss: 0.2005195675462664
Val Loss: 0.304 	Val Acc: 0.883
Epoch: 7, BCELoss: 0.17621557888832975
Val Loss: 0.319 	Val Acc: 0.883
Epoch: 8, BCELoss: 0.15397531381437937
Val Loss: 0.349 	Val Acc: 0.879
Epoch: 9, BCELoss: 0.13480635696618135
Val Loss: 0.371 	Val Acc: 0.877
Epoch: 10, BCELoss: 0.11911533274351203
Val Loss: 0.428 	Val Acc: 0.876


LSTMClassifier(
  (embedding): Embedding(10000, 32, padding_idx=0)
  (lstm): LSTM(32, 100)
  (dense): Linear(in_features=100, out_features=1, bias=True)
  (sig): Sigmoid()
)

In [33]:
# Evaluate the trained model on the test loader. `model`, `test_dl` and
# `epochs` must already exist in the kernel, so run the earlier cells first.
test(model, test_dl, epochs)

NameError: name 'epochs' is not defined

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]