In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import precision_score, recall_score, f1_score
import nltk
from nltk.corpus import stopwords


inputFile = "../ergasia_1/imdb-reviews.csv"

# Insert the path of the file you want to test HERE
testFile = None

df = pd.read_csv(inputFile, sep='\t')

# Drop the first column (url)
df = df.drop(columns=['url'])

positive = 0
negative = 0
neutral = 0

nltk.download('stopwords')


for index, row in df.iterrows():
    # if score is greater than 7, it is positive so change its value to 1, else 0
    if row["rating"] >= 7:
        positive += 1
        df.at[index, "rating"] = 1
    elif row["rating"] <= 4:
        negative += 1
        df.at[index, "rating"] = 0
    else:
        neutral += 1
    # To Remove the stop words uncomment the below line. Does not improve accuracy
    # df.at[index, "review"] = ' '.join([word for word in row["review"].split() if word not in (stopwords.words('english'))])

# Get the GloVe word embeddings
gloveFile = "../glove.twitter.27B.200d.txt"

embeddings = {}
with open(gloveFile, 'r', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings[word] = coefs

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/stathis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Split the data into train 80% , test 10% and validation 10% sets

train = df.sample(frac=0.8, random_state=42)
test = df.drop(train.index)
test = test.sample(frac=0.5, random_state=42)
validation = df.drop(train.index)
validation = validation.drop(test.index)

if testFile is not None:
    test = pd.read_csv(testFile, sep='\t')
    test = test.drop(columns=['url'])

# print df, train, test, validation sizes
print("df size: ", df.shape)
print("train size: ", train.shape)
print("test size: ", test.shape)
print("validation size: ", validation.shape)

df size:  (45008, 2)
train size:  (36006, 2)
test size:  (4501, 2)
validation size:  (4501, 2)


In [3]:
# Get X_train, y_train, X_test, y_test, X_validation, y_validation
X_train = train["review"]
y_train = train["rating"]
X_test = test["review"]
y_test = test["rating"]
X_validation = validation["review"]
y_validation = validation["rating"]


# Create word vectors for train, test and validation sets
def createWordVectors(data):
  wordVectors = [];
  for row in data:
    sum = np.zeros(len(embeddings["hello"]));
    word_count = 0;
    for word in row.split():
      if word in embeddings:
        sum = sum + embeddings[word];
        word_count = word_count+1;
    if word_count != 0:
      wordVectors.append(sum/word_count);
    else:
      wordVectors.append(np.zeros(len(embeddings["hello"])))

  wordVectors = np.array(wordVectors, dtype=np.float32)
  return wordVectors


X_train = createWordVectors(X_train)
X_test = createWordVectors(X_test)
X_validation = createWordVectors(X_validation)

# Convert the word vectors from ndarrays to tensors
X_train = torch.tensor(X_train, dtype=torch.float)
X_test = torch.tensor(X_test, dtype=torch.float)
X_validation = torch.tensor(X_validation, dtype=torch.float)

# Convert the labels to tensors
y_train = torch.tensor(y_train.values, dtype=torch.float)
y_test = torch.tensor(y_test.values, dtype=torch.float)
y_validation = torch.tensor(y_validation.values, dtype=torch.float)


In [7]:
# Create an sentiment classifier using bidirectional stacked RNNs with LSTM/GRU cells. Use the Adam optimizer and the cross-entropy loss function.

class SentimentClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(SentimentClassifier, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)

    def forward(self, x):
        # Set initial hidden and cell states
        h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)

        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size*2)

        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out
    


Epoch [1/10], Step [100/36006], Loss: 0.6942
Epoch [1/10], Step [200/36006], Loss: 0.6281
Epoch [1/10], Step [300/36006], Loss: 0.5907
Epoch [1/10], Step [400/36006], Loss: 0.9674
Epoch [1/10], Step [500/36006], Loss: 0.7307
Epoch [1/10], Step [600/36006], Loss: 0.4721
Epoch [1/10], Step [700/36006], Loss: 1.3581
Epoch [1/10], Step [800/36006], Loss: 0.6551
Epoch [1/10], Step [900/36006], Loss: 0.8071
Epoch [1/10], Step [1000/36006], Loss: 1.2123
Epoch [1/10], Step [1100/36006], Loss: 0.1029
Epoch [1/10], Step [1200/36006], Loss: 0.7776
Epoch [1/10], Step [1300/36006], Loss: 0.3442
Epoch [1/10], Step [1400/36006], Loss: 0.4274
Epoch [1/10], Step [1500/36006], Loss: 0.2070
Epoch [1/10], Step [1600/36006], Loss: 0.6865
Epoch [1/10], Step [1700/36006], Loss: 0.6760
Epoch [1/10], Step [1800/36006], Loss: 0.2190
Epoch [1/10], Step [1900/36006], Loss: 0.2420
Epoch [1/10], Step [2000/36006], Loss: 0.1146
Epoch [1/10], Step [2100/36006], Loss: 0.1593
Epoch [1/10], Step [2200/36006], Loss: 1.63

KeyboardInterrupt: 

In [None]:

# Hyperparameters
input_size = 200
hidden_size = 128
num_layers = 2
num_classes = 1
learning_rate = 0.001
num_epochs = 10

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = SentimentClassifier(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(X_train)
for epoch in range(num_epochs):
    for i, (reviews, labels) in enumerate(zip(X_train, y_train)):
        reviews = reviews.reshape(1, 1, -1).to(device)
        labels = labels.reshape(1, 1).to(device)

        # Forward pass
        outputs = model(reviews)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        

        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
    with torch.no_grad():
    Y_train_pred = model(X_train);
    loss_train = loss_func(Y_train_pred, y_train.unsqueeze(-1))/Y_train_pred.shape[0]
    train_loss.append(loss_train.item());
    Y_val_pred = model(X_validation);
    loss_val = loss_func(Y_val_pred, y_validation.unsqueeze(-1))/Y_val_pred.shape[0]
    val_loss.append(loss_val.item());

    print("Epoch: ", epoch, "Step: ", i, "Loss: ", loss.item())

# Test the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for reviews, labels in zip(X_test, y_test):
        reviews = reviews.reshape(1, 1, -1).to(device)
        labels = labels.reshape(1, 1).to(device)
        outputs = model(reviews)
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Test Accuracy of the model on the 10000 test reviews: {} %'.format(100 * correct / total))

# Find the precision, recall and F1 score
model.eval()
with torch.no_grad():
    y_true = []
    y_pred = []
    for reviews, labels in zip(X_test, y_test):
        reviews = reviews.reshape(1, 1, -1).to(device)
        labels = labels.reshape(1, 1).to(device)
        outputs = model(reviews)
        predicted = (outputs > 0.5).float()
        y_true.append(labels.item())
        y_pred.append(predicted.item())

    print("Precision: ", precision_score(y_true, y_pred))
    print("Recall: ", recall_score(y_true, y_pred))
    print("F1 score: ", f1_score(y_true, y_pred))



        


In [8]:
# Find the precision, recall and F1 score
model.eval()
with torch.no_grad():
    y_true = []
    y_pred = []
    for reviews, labels in zip(X_test, y_test):
        reviews = reviews.reshape(1, 1, -1).to(device)
        labels = labels.reshape(1, 1).to(device)
        outputs = model(reviews)
        predicted = (outputs > 0.5).float()
        y_true.append(labels.item())
        y_pred.append(predicted.item())

    print("Precision: ", precision_score(y_true, y_pred))
    print("Recall: ", recall_score(y_true, y_pred))
    print("F1 score: ", f1_score(y_true, y_pred))


Precision:  0.8692598595353863
Recall:  0.7020069808027923
F1 score:  0.7767318368332126
