In [38]:
# install nltk
! pip install nltk



In [39]:
# Libraries
import pandas as pd
import numpy as np
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
from collections import Counter

In [40]:
# Load drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [42]:
# Load dataset
df = pd.read_csv(r'/content/drive/MyDrive/Datasets/twitter_training.csv')
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [43]:
# Create columns
df.columns = ['id', 'name', 'sentiment', 'tweet']
df.head()

Unnamed: 0,id,name,sentiment,tweet
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [44]:
# Shape
df.shape

(74681, 4)

In [45]:
# Remove rows with missing values
df = df.dropna()
df.shape

(73995, 4)

In [46]:
# Remove unneccesary columns
df = df.drop(columns = ['id', 'name'], axis = 1)
df.head()

Unnamed: 0,sentiment,tweet
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [47]:
# Unique classes
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Negative,22358
Positive,20654
Neutral,18108
Irrelevant,12875


In [48]:
# basic nltk modules
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [49]:
# Tokenize
all_tokens = []
for tweet in df['tweet']:
    all_tokens.extend(word_tokenize(str(tweet).lower()))

In [50]:
len(all_tokens)

1670058

In [51]:
# Build vocab
vocab = {
    '<PAD>': 0,
    '<UNK>': 1
}

In [52]:
for token in Counter(all_tokens).keys():
    if token not in vocab:
        vocab[token] = len(vocab)

In [53]:
len(vocab)

41261

In [54]:
# Convert text to indices
def text_to_indices(sentence, vocab, max_len = 50):
    numerical_sentence = []
    token_sentence = word_tokenize(str(sentence).lower())
    for token in token_sentence:
        if token in vocab:
            numerical_sentence.append(vocab[token])
        else:
            numerical_sentence.append(vocab['<UNK>'])

    # padding
    if len(numerical_sentence) < max_len:
        numerical_sentence += [vocab['<PAD>']] * (max_len - len(numerical_sentence))
    else:
        numerical_sentence = numerical_sentence[:max_len]
    return numerical_sentence

In [55]:
text_to_indices('This movie is good', vocab)

[128,
 908,
 41,
 126,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [56]:
df['tweet_indices'] = df['tweet'].apply(lambda x: text_to_indices(x, vocab))
df.head()

Unnamed: 0,sentiment,tweet,tweet_indices
0,Positive,I am coming to the borders and I will kill you...,"[2, 3, 4, 5, 6, 7, 8, 2, 9, 10, 11, 12, 13, 0,..."
1,Positive,im getting on borderlands and i will kill you ...,"[14, 15, 16, 17, 8, 2, 9, 10, 11, 12, 13, 0, 0..."
2,Positive,im coming on borderlands and i will murder you...,"[14, 4, 16, 17, 8, 2, 9, 18, 11, 12, 13, 0, 0,..."
3,Positive,im getting on borderlands 2 and i will murder ...,"[14, 15, 16, 17, 19, 8, 2, 9, 18, 11, 20, 12, ..."
4,Positive,im getting into borderlands and i can murder y...,"[14, 15, 21, 17, 8, 2, 22, 18, 11, 12, 13, 0, ..."


In [57]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Negative,22358
Positive,20654
Neutral,18108
Irrelevant,12875


In [58]:
# Convert sentiment into numerics
sentiment_encode = {
    'Negative': 0,
    'Positive': 1,
    'Neutral': 2,
    'Irrelevant': 3
}

In [59]:
df['sentiment'] = df['sentiment'].map(sentiment_encode)
df.head()

Unnamed: 0,sentiment,tweet,tweet_indices
0,1,I am coming to the borders and I will kill you...,"[2, 3, 4, 5, 6, 7, 8, 2, 9, 10, 11, 12, 13, 0,..."
1,1,im getting on borderlands and i will kill you ...,"[14, 15, 16, 17, 8, 2, 9, 10, 11, 12, 13, 0, 0..."
2,1,im coming on borderlands and i will murder you...,"[14, 4, 16, 17, 8, 2, 9, 18, 11, 12, 13, 0, 0,..."
3,1,im getting on borderlands 2 and i will murder ...,"[14, 15, 16, 17, 19, 8, 2, 9, 18, 11, 20, 12, ..."
4,1,im getting into borderlands and i can murder y...,"[14, 15, 21, 17, 8, 2, 22, 18, 11, 12, 13, 0, ..."


In [60]:
df.iloc[9]

Unnamed: 0,9
sentiment,1
tweet,2010 So I spent a few hours making something f...
tweet_indices,"[77, 23, 2, 24, 25, 26, 27, 28, 29, 30, 31, 32..."


In [61]:
# Create X and y
X = torch.tensor(df['tweet_indices'], dtype = torch.long)
y = torch.tensor(df['sentiment'], dtype = torch.long)

In [62]:
# Split data
from sklearn.model_selection import train_test_split

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [64]:
# Custom Dataset Class
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [65]:
# Dataset object
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)

In [66]:
# DataLoader class
train_loader = DataLoader(train_dataset, batch_size = 64, shuffle = True, pin_memory = True, num_workers = 2)
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = False, pin_memory = True, num_workers = 2)

In [67]:
# LSTM architecture
class LSTMmodel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim = 100) # embedding layer
        self.lstm = nn.LSTM(input_size = 100, hidden_size = 150, batch_first = True) # lstm unit
        self.fc = nn.Linear(150, 4) # 4 classes

    def forward(self, x):
        x = self.embedding(x)
        int_hidden_state, (final_hidden_state, final_cell_state) = self.lstm(x)
        final_hidden_state = final_hidden_state[-1]
        output = self.fc(final_hidden_state)
        return output

In [71]:
# Define parameters
learning_rate = 0.001
epochs = 50

In [72]:
# Define model, loss and optimizer
model = LSTMmodel(len(vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

In [73]:
# Define training loop
for epoch in range(epochs):
    total_loss = 0
    for batch_features, batch_labels in train_loader:
        batch_features = batch_features.to(device)
        batch_labels = batch_labels.to(device)
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch: {epoch + 1}, Loss: {avg_loss}')

Epoch: 1, Loss: 1.3094838666327207
Epoch: 2, Loss: 1.0480274059154369
Epoch: 3, Loss: 0.8118790872303057
Epoch: 4, Loss: 0.5789377766810817
Epoch: 5, Loss: 0.3903428962937108
Epoch: 6, Loss: 0.2661992371174288
Epoch: 7, Loss: 0.18910651988361352
Epoch: 8, Loss: 0.1511599384466715
Epoch: 9, Loss: 0.12424848633300926
Epoch: 10, Loss: 0.10937399792680402
Epoch: 11, Loss: 0.09253105321571369
Epoch: 12, Loss: 0.08327365617014836
Epoch: 13, Loss: 0.08157187989757707
Epoch: 14, Loss: 0.07644226876336127
Epoch: 15, Loss: 0.07049141230728523
Epoch: 16, Loss: 0.06813535906872453
Epoch: 17, Loss: 0.06300639233316215
Epoch: 18, Loss: 0.0642550726717246
Epoch: 19, Loss: 0.05718840514755245
Epoch: 20, Loss: 0.061629877896421804
Epoch: 21, Loss: 0.055932466392056804
Epoch: 22, Loss: 0.05337500472468278
Epoch: 23, Loss: 0.05538438203199163
Epoch: 24, Loss: 0.05568697578406657
Epoch: 25, Loss: 0.05260505218455499
Epoch: 26, Loss: 0.050477761956651115
Epoch: 27, Loss: 0.04836813357067086
Epoch: 28, Loss

In [83]:
batch_labels.shape

torch.Size([55])

In [74]:
# Evaluation
model.eval()
with torch.no_grad():
    total = 0
    correct = 0
    for batch_features, batch_labels in test_loader:
        batch_features = batch_features.to(device)
        batch_labels = batch_labels.to(device)
        outputs = model(batch_features)
        _, predicted = torch.max(outputs, dim = 1)
        total += batch_labels.shape[0]
        correct += (predicted == batch_labels).sum()
acc = correct.float() / total
print(f'Accuracy: {acc}')

Accuracy: 0.8561196327209473


In [80]:
# Sample predictions
reverse_mapping = {v: k for k, v in sentiment_encode.items()}
def predict(text, model, vocab, max_len = 50):
    model.eval() # eval mode
    indices = text_to_indices(text, vocab) # indices
    if len(indices) < max_len:
        indices += [vocab['<PAD>']] * (max_len - len(indices)) # padding
    else:
        indices = indices[:max_len] # truncate if indices > threshold

    X = torch.tensor([indices]).to(device)
    with torch.no_grad():
        output = model(X)
        pred_class = torch.argmax(output, dim = 1).item()
        pred_label = reverse_mapping[pred_class]

    return pred_label

In [82]:
text = "This a very bad movie, I don't' like it"
pred = predict(text, model, vocab)
print('Sentiment:', pred)

Sentiment: Negative


In [84]:
text = "I've had great experience, the movie was very good"
pred = predict(text, model, vocab)
print('Sentiment:', pred)

Sentiment: Positive
