In [None]:
!pip install emoji



In [None]:
import re
import numpy as np
import pandas as pd

from lxml import html
from emoji import demojize
from tqdm.notebook import tqdm

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import torch
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Flag read by later cells to decide whether the model and batches are moved to CUDA.
is_gpu = torch.cuda.is_available()

In [None]:
# Load the tweet dataset and preview the first rows.
# NOTE(review): the absolute path looks like a Colab Google Drive mount — confirm
# the drive is mounted (or make the path configurable) before re-running.
df = pd.read_csv('/content/drive/MyDrive/tweet_data.csv')
df.head()

Unnamed: 0,text,sentiment
0,Sooo SAD I will miss you here in San Diego!!!,negative
1,my boss is bullying me...,negative
2,what interview! leave me alone,negative
3,"Sons of ****, why couldn`t they put them on t...",negative
4,2am feedings for the baby are fun when he is a...,positive


In [None]:
# Check how many examples each sentiment class has.
df['sentiment'].value_counts()

positive    8582
negative    7781
Name: sentiment, dtype: int64

# Data Pre-Processing

In [None]:
# NLTK Porter stemmer and English stop-word list.
# NOTE(review): neither `stemmer` nor `stop` is referenced by any cell visible in
# this notebook — confirm whether they are still needed.
stemmer = PorterStemmer()
stop = stopwords.words('english')

def clean_text(text):
    """Normalise raw text into lowercase, single-space-separated alphabetic words.

    Steps, in order:
      1. Replace emoji with strings via ``demojize``.
      2. Strip HTML markup with lxml, keeping only the text content
         (best effort: if parsing fails the text is left as-is).
      3. Replace hyperlinks (``http`` followed by non-whitespace) with a space.
      4. Replace every run of characters other than ASCII letters and spaces
         with a single space.
      5. Lowercase, split on whitespace and re-join with single spaces.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text; may be an empty string.
    """
    # Convert Emoji to strings
    text = demojize(text)

    # Remove HTML Tags
    try:
        text = html.document_fromstring(text).text_content()
    except Exception:
        # Best effort: keep the un-parsed text when lxml cannot handle it.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        pass

    # Remove Hyperlinks (raw string: `\S` is not a valid plain-string escape)
    text = re.sub(r'http\S+', ' ', text)

    # Remove non alphabets
    text = re.sub(r'[^a-zA-Z ]+', ' ', text)

    # Lowercase, collapse whitespace and return
    return ' '.join(text.lower().split())

In [None]:
# Sanity-check clean_text on a string containing punctuation, an emoji and a URL.
sample_text = "Hi there! i've been trying this product: for a while now it's 🔥 https://www.amazon.in/s?k=keyboard&page=2"
print(f'Original String: {sample_text}')
print(f'Cleaned String: {clean_text(sample_text)}')

Original String: Hi there! i've been trying this product: for a while now it's 🔥 https://www.amazon.in/s?k=keyboard&page=2
Cleaned String: hi there i ve been trying this product for a while now it s fire


In [None]:
# Clean every tweet and encode the sentiment as an integer (positive -> 1, negative -> 0).
texts = df['text'].apply(clean_text)
labels = df['sentiment'].map({'positive':1, 'negative': 0})

In [None]:
# Inspect the cleaned texts.
texts

0               sooo sad i will miss you here in san diego
1                                   my boss is bullying me
2                            what interview leave me alone
3        sons of why couldn t they put them on the rele...
4        am feedings for the baby are fun when he is al...
                               ...                        
16358                                       enjoy ur night
16359    wish we could come see u on denver husband los...
16360    i ve wondered about rake to the client has mad...
16361    yay good for both of you enjoy the break you p...
16362                                  but it was worth it
Name: text, Length: 16363, dtype: object

In [None]:
# Inspect the integer-encoded labels.
labels

0        0
1        0
2        0
3        0
4        1
        ..
16358    1
16359    0
16360    0
16361    1
16362    1
Name: sentiment, Length: 16363, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# First split: training set vs. a held-out portion (train_test_split's default
# test size is used), stratified by label with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(texts, labels, random_state = 0, stratify = labels)
# Second split: divide the held-out portion in half into validation and test sets,
# again stratified; note that x_test / y_test are rebound to the final test half.
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, random_state = 0, test_size = 0.5, stratify = y_test)

# Sentences as Sequence

## Data Modelling

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenises and pads one text per item.

    Parameters
    ----------
    texts : pandas.Series
        Input texts; only ``texts.values`` is kept.
    labels : pandas.Series
        Labels aligned position-by-position with ``texts``; only
        ``labels.values`` is kept.
    tokenizer
        Fitted tokenizer exposing ``texts_to_sequences``.
    maxlen : int, default 50
        Value passed to ``pad_sequences`` as ``maxlen``; sequences are padded
        at the end (``padding='post'``).
    """

    def __init__(self, texts, labels, tokenizer, maxlen = 50):
        self.X = texts.values
        self.Y = labels.values
        self.tokenizer = tokenizer
        self.maxlen = maxlen

    def __len__(self):
        """Return the number of examples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as ``{'X': token ids, 'Y': label}``.

        ``X`` is an int64 tensor of padded token ids and ``Y`` is a float32
        scalar tensor (the dtype expected by a binary cross-entropy loss).
        """
        text = self.X[idx]
        tokens = self.tokenizer.texts_to_sequences([text])[0]
        padded_tokens = pad_sequences([tokens], maxlen = self.maxlen, padding = 'post')[0]

        return {
            # Cast explicitly to int64: pad_sequences yields int32 by default, and
            # int64 indices are accepted by embedding layers on every PyTorch version.
            'X': torch.tensor(padded_tokens, dtype = torch.long),
            'Y': torch.tensor(self.Y[idx], dtype = torch.float32)
        }

In [None]:
# Fit the tokenizer on the training texts only, so validation/test texts do not
# contribute to the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

In [None]:
# Settings shared by every split.
MAX_LEN = 30      # padded sequence length passed to TextDataset
BATCH_SIZE = 32   # examples per mini-batch

train_data = TextDataset(x_train, y_train, tokenizer, MAX_LEN)
valid_data = TextDataset(x_valid, y_valid, tokenizer, MAX_LEN)
test_data = TextDataset(x_test, y_test, tokenizer, MAX_LEN)

# Reshuffle the training examples every epoch so mini-batches differ between
# epochs; validation only accumulates a loss, so its order is left fixed.
trainloader = DataLoader(train_data, batch_size = BATCH_SIZE, shuffle = True)
validloader = DataLoader(valid_data, batch_size = BATCH_SIZE)

## Model Building

In [None]:
from torch import nn, optim

In [None]:
class LSTMClassifier(nn.Module):
    """Embedding -> stacked LSTM -> linear head producing one probability per input.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table.
    embed_dims : int
        Size of each embedding vector (LSTM input size).
    hidden_dims : int
        LSTM hidden-state size.
    num_layers : int, default 3
        Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_dims, hidden_dims, num_layers = 3):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dims)
        self.lstm = nn.LSTM(input_size = embed_dims,
                            hidden_size = hidden_dims,
                            num_layers = num_layers,
                            batch_first = True)
        self.out = nn.Linear(hidden_dims, 1)

    def forward(self, x):
        """Return sigmoid probabilities of shape ``(batch, 1)``.

        ``x`` holds integer token ids laid out batch-first (the LSTM is built
        with ``batch_first = True``).
        """
        x = self.embed(x)
        _, (h_n, c_n) = self.lstm(x)
        # h_n is indexed by layer first; take the final hidden state of the last layer.
        lstm_out = h_n[-1,:,:]
        return torch.sigmoid(self.out(lstm_out))

In [None]:
# Number of entries in the fitted tokenizer's word index.
vocab_size = len(tokenizer.word_index)

In [None]:
# Build the classifier: vocab_size + 1 embedding rows, 300-d embeddings, 128 hidden units.
# presumably the extra row accounts for index 0 used as padding — confirm against
# the tokenizer's index range.
model = LSTMClassifier(vocab_size + 1, 300, 128)
if is_gpu:
    model = model.cuda()

In [None]:
# Adam with its default hyperparameters; BCELoss matches the model's sigmoid output.
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss()

## Training and Inference

In [None]:
# Train for a fixed number of epochs; after each epoch measure the validation
# loss and checkpoint the weights whenever it reaches a new minimum.
epochs = 5
min_valid_loss = np.inf

for i in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for batch in tqdm(trainloader, desc = f'Epoch {i+1} Training'):
        text = batch['X']
        label = batch['Y']
        if is_gpu:
            text = text.cuda()
            label = label.cuda()

        optimizer.zero_grad()
        output = model(text)
        # The model emits shape (batch, 1); flatten to line up with the labels.
        loss = criterion(output.flatten(), label)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # ---- validation pass (no gradients) ----
    with torch.no_grad():
        model.eval()
        valid_loss = 0.0
        for batch in tqdm(validloader, desc = f'Epoch {i+1} Validation'):
            text = batch['X']
            label = batch['Y']
            if is_gpu:
                text = text.cuda()
                label = label.cuda()

            output = model(text)
            loss = criterion(output.flatten(), label)
            valid_loss += loss.item()

    # Turn the summed batch losses into per-batch averages.
    train_loss = train_loss / len(trainloader)
    valid_loss = valid_loss / len(validloader)
    print(f'Epoch {i+1} \t Training Loss:{train_loss} \t Validation Loss:{valid_loss} ')

    # Keep only the weights with the best validation loss seen so far.
    if valid_loss < min_valid_loss:
        print(f'Validation Loss Decreased({min_valid_loss:.6f}--------->{valid_loss:.6f})\t... Saving Model')
        torch.save(model.state_dict(), 'model.pth')
        print('Model Weights Saved!')
        min_valid_loss = valid_loss

HBox(children=(FloatProgress(value=0.0, description='Epoch 1 Training', max=384.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Epoch 1 Validation', max=64.0, style=ProgressStyle(descri…


Epoch 1 	 Training Loss:0.6920807673595846 	 Validation Loss:0.6919466825202107 
Validation Loss Decreased(inf--------->0.691947)	... Saving Model
Model Weights Saved!


HBox(children=(FloatProgress(value=0.0, description='Epoch 2 Training', max=384.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Epoch 2 Validation', max=64.0, style=ProgressStyle(descri…


Epoch 2 	 Training Loss:0.5748208113169918 	 Validation Loss:0.4169403382111341 
Validation Loss Decreased(0.691947--------->0.416940)	... Saving Model
Model Weights Saved!


HBox(children=(FloatProgress(value=0.0, description='Epoch 3 Training', max=384.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Epoch 3 Validation', max=64.0, style=ProgressStyle(descri…


Epoch 3 	 Training Loss:0.33213401461640996 	 Validation Loss:0.3659438705071807 
Validation Loss Decreased(0.416940--------->0.365944)	... Saving Model
Model Weights Saved!


HBox(children=(FloatProgress(value=0.0, description='Epoch 4 Training', max=384.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Epoch 4 Validation', max=64.0, style=ProgressStyle(descri…


Epoch 4 	 Training Loss:0.2095660559280077 	 Validation Loss:0.3861286483006552 


HBox(children=(FloatProgress(value=0.0, description='Epoch 5 Training', max=384.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Epoch 5 Validation', max=64.0, style=ProgressStyle(descri…


Epoch 5 	 Training Loss:0.13287503902150397 	 Validation Loss:0.45659927965607494 


In [None]:
# Restore the checkpoint written by the training loop (lowest validation loss).
model.load_state_dict(torch.load('model.pth'))

<All keys matched successfully>

In [None]:
# Evaluate the restored model on the test set one example at a time, collecting
# predictions and true labels for the classification report.
correct = 0
pred_labels = []
test_labels = []
with torch.no_grad():
    model.eval()
    for batch in tqdm(test_data):
        text, label = batch['X'], batch['Y']

        if is_gpu:
            text, label = text.cuda(), label.cuda()
        # The dataset yields single examples, so add a leading batch dimension.
        output = model(text.unsqueeze(dim = 0))
        # Threshold the sigmoid output at 0.5 to obtain the predicted class.
        pred = 0 if output[0].item() < 0.5 else 1
        true_label = label.item()
        pred_labels.append(pred)
        test_labels.append(true_label)
        if pred == true_label:
            correct += 1

correct

HBox(children=(FloatProgress(value=0.0, max=2046.0), HTML(value='')))




1730

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the test-set predictions.
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

         0.0       0.83      0.85      0.84       973
         1.0       0.86      0.84      0.85      1073

    accuracy                           0.85      2046
   macro avg       0.85      0.85      0.85      2046
weighted avg       0.85      0.85      0.85      2046

