In [1]:
!python -m spacy download en
# Download spacy's English corpus

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py): started
  Building wheel for en-core-web-sm (setup.py): finished with status 'done'
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.3.1-py3-none-any.whl size=12047105 sha256=95e26889384df81d9115b789d74a22b5567857414f8fb35aeabf6718b0258a8c
  Stored in directory: C:\Users\SAFIUD~1\AppData\Local\Temp\pip-ephem-wheel-cache-b7e2tcld\wheels\b7\0d\f0\7ecae8427c515065d75410989e15e5785dd3975fe06e795cd9
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.3.1
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
✘ Couldn't link model to 'en'
Creating a symlink in spacy/data failed. Make sure you have the req

You do not have sufficient privilege to perform this operation.


In [1]:
import spacy
spacy.__version__

'2.3.5'

In [2]:
import torch
import torchtext
from torchtext import datasets

import re
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [4]:
tweets = pd.read_csv('data/tweets/tweets.csv', on_bad_lines='skip')
tweets.shape

(50000, 4)

In [5]:
tweets.head()

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,neg,Sentiment140,is so sad for my APL frie...
1,2,neg,Sentiment140,I missed the New Moon trail...
2,3,pos,Sentiment140,omg its already 7:30 :O
3,4,neg,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I've been at...
4,5,neg,Sentiment140,i think mi bf is cheating on me!!! ...


In [6]:
tweets = tweets[['Sentiment', 'SentimentText']]
tweets.head()

Unnamed: 0,Sentiment,SentimentText
0,neg,is so sad for my APL frie...
1,neg,I missed the New Moon trail...
2,pos,omg its already 7:30 :O
3,neg,.. Omgaga. Im sooo im gunna CRy. I've been at...
4,neg,i think mi bf is cheating on me!!! ...


In [7]:
tweets.Sentiment.value_counts()

pos    26921
neg    23079
Name: Sentiment, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(tweets, test_size=0.2, random_state=42)

In [9]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [10]:
train.shape, test.shape

((40000, 2), (10000, 2))

In [11]:
train.to_csv('data/tweets/train_tweets.csv', index=False)
test.to_csv('data/tweets/test_tweets.csv', index=False)

In [12]:
!ls data/tweets/

test_tweets.csv
train_tweets.csv
tweets.csv


In [13]:
def tweet_clean(text):

    text = re.sub(r'[^A-Za-z0-9]+', ' ', text) # remove non-alphanumeric characters
    text = re.sub(r'https?:/\/\S+', ' ', text) # remove links

    return text.strip()

In [14]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'tagger'])

def tokenizer(s):
    return [w.text.lower() for w in nlp(tweet_clean(s))]

In [17]:
# Field, LabelField etc have been deprecated
TEXT = torchtext.legacy.data.Field(tokenize=tokenizer)
LABEL = torchtext.legacy.data.LabelField(dtype=torch.float)

In [18]:
datafields = [('Sentiment', LABEL), ('SentimentText', TEXT)]

In [19]:
trn, tst = torchtext.legacy.data.TabularDataset.splits(path='data/tweets/',
                                                        train='train_tweets.csv',
                                                        test='test_tweets.csv',
                                                        format='csv',
                                                        skip_header=True,
                                                        fields=datafields)

In [20]:
print(f'Number of training examples: {len(trn)}')
print(f'Number of testing examples: {len(tst)}')

Number of training examples: 40000
Number of testing examples: 10000


In [21]:
vars(trn.examples[0])

{'Sentiment': 'pos',
 'SentimentText': ['amyrenea',
  'omg',
  'so',
  'am',
  'i',
  'lol',
  'i',
  'fell',
  'asleep',
  'when',
  'it',
  'was',
  'on',
  'last',
  'night',
  'so',
  'now',
  'i',
  'get',
  'to',
  'finish',
  'it']}

In [22]:
TEXT.build_vocab(trn, max_size=25000, vectors="glove.6B.100d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(trn)

In [23]:
TEXT.vocab.freqs.most_common(20)

[('i', 25644),
 ('the', 12219),
 ('to', 12111),
 ('you', 10723),
 ('a', 9197),
 ('it', 8440),
 ('and', 6889),
 ('my', 6208),
 ('quot', 5582),
 ('s', 5565),
 ('that', 5306),
 ('is', 5203),
 ('for', 4971),
 ('in', 4852),
 ('t', 4844),
 ('m', 4683),
 ('me', 4588),
 ('of', 4331),
 ('on', 3918),
 ('have', 3752)]

In [24]:
train_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits(
    (trn, tst),
    batch_size=64,
    sort_key=lambda x: len(x.SentimentText),
    sort_within_batch=False)

In [25]:
import torch.nn as nn

class RNN(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers,
                            bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):

        embedded = self.dropout(self.embedding(text))
        output, hidden = self.rnn(embedded)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))

        return self.fc(hidden.squeeze(0))

In [26]:
input_dim = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 20
output_dim = 1
n_layers = 2
bidirectional = True
dropout = 0.5

In [27]:
model = RNN(
    vocab_size=input_dim,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    bidirectional=bidirectional,
    dropout=dropout
)

In [28]:
model

RNN(
  (embedding): Embedding(25002, 100)
  (rnn): GRU(100, 20, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=40, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [29]:
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings)

tensor([[ 0.0845,  0.1299,  1.0558,  ...,  1.6125, -1.7121, -2.0358],
        [-0.0477, -0.6343,  0.6997,  ...,  0.0605,  0.2213, -0.6617],
        [-0.0465,  0.6197,  0.5665,  ..., -0.3762, -0.0325,  0.8062],
        ...,
        [-1.3180, -1.1452, -0.1318,  ..., -0.1025,  0.9116, -1.0183],
        [-0.4787, -1.8472, -1.8026,  ..., -0.2249, -0.2704,  0.7180],
        [-1.1346, -1.2243, -0.1198,  ...,  0.4250, -1.2294,  0.6927]])


In [30]:
pretrained_embeddings.shape

torch.Size([25002, 100])

In [31]:
# Initialize embedding layer with GloVe vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0845,  0.1299,  1.0558,  ...,  1.6125, -1.7121, -2.0358],
        [-0.0477, -0.6343,  0.6997,  ...,  0.0605,  0.2213, -0.6617],
        [-0.0465,  0.6197,  0.5665,  ..., -0.3762, -0.0325,  0.8062],
        ...,
        [-1.3180, -1.1452, -0.1318,  ..., -0.1025,  0.9116, -1.0183],
        [-0.4787, -1.8472, -1.8026,  ..., -0.2249, -0.2704,  0.7180],
        [-1.1346, -1.2243, -0.1198,  ...,  0.4250, -1.2294,  0.6927]])

In [32]:
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0465,  0.6197,  0.5665,  ..., -0.3762, -0.0325,  0.8062],
        ...,
        [-1.3180, -1.1452, -0.1318,  ..., -0.1025,  0.9116, -1.0183],
        [-0.4787, -1.8472, -1.8026,  ..., -0.2249, -0.2704,  0.7180],
        [-1.1346, -1.2243, -0.1198,  ...,  0.4250, -1.2294,  0.6927]])


In [33]:
import torch.optim as optim
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

In [34]:
def train(model, iterator, optimizer, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:

        optimizer.zero_grad()
        predictions = model(batch.SentimentText).squeeze(1)
        loss = criterion(predictions, batch.Sentiment)
        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.Sentiment).float()

        acc = correct.sum() / len(correct)

        loss.backward()
        optimizer.step()


        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [35]:
num_epochs = 10

for epoch in range(1, num_epochs + 1):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

    print(f'Epoch: {epoch:02} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}%')

Epoch: 01 | Train Loss: 0.6416 | Train Acc: 62.33%
Epoch: 02 | Train Loss: 0.5432 | Train Acc: 73.14%
Epoch: 03 | Train Loss: 0.5005 | Train Acc: 76.16%
Epoch: 04 | Train Loss: 0.4687 | Train Acc: 78.20%
Epoch: 05 | Train Loss: 0.4464 | Train Acc: 79.50%
Epoch: 06 | Train Loss: 0.4271 | Train Acc: 80.62%
Epoch: 07 | Train Loss: 0.4131 | Train Acc: 81.45%
Epoch: 08 | Train Loss: 0.3994 | Train Acc: 82.24%
Epoch: 09 | Train Loss: 0.3780 | Train Acc: 83.42%
Epoch: 10 | Train Loss: 0.3661 | Train Acc: 84.08%


In [36]:
epoch_loss = 0
epoch_acc = 0

model.eval()

with torch.no_grad():

    for batch in test_iterator:

        predictions = model(batch.SentimentText).squeeze(1)
        loss = criterion(predictions, batch.Sentiment)
        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.Sentiment).float()

        acc = correct.sum()/len(correct)

        epoch_loss += loss.item()
        epoch_acc += acc.item()

test_loss = epoch_loss / len(test_iterator)
test_acc = epoch_acc / len(test_iterator)

print(f'Test Loss: {test_loss:.4f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.4955 | Test Acc: 76.60%


In [43]:
sentence = input("Enter a sentence: ")
print(f'Sentence: {sentence}')
tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
indexed = [TEXT.vocab.stoi[t] for t in tokenized]
tensor = torch.LongTensor(indexed)
tensor = tensor.unsqueeze(1) # Add the batch dimension
prediction = torch.sigmoid(model(tensor))
result = LABEL.vocab.itos[int(prediction.item()>0.5)]
print(f'Sentiment: {result}')

Sentence: Awful show. Do not recommend
Sentiment: neg


In [45]:
sentence = input("Enter a sentence: ")
print(f'Sentence: {sentence}')
tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
indexed = [TEXT.vocab.stoi[t] for t in tokenized]
tensor = torch.LongTensor(indexed)
tensor = tensor.unsqueeze(1) # Add the batch dimension
prediction = torch.sigmoid(model(tensor))
result = LABEL.vocab.itos[int(prediction.item()>0.5)]
print(f'Sentiment: {result}')

Sentence: Marvelous show
Sentiment: pos


In [46]:
import gc
gc.collect()

199