In [1]:
! pip install -q "gdown==4.6"

In [2]:
!gdown --id 19TGf1A2MwlBlYM1ORKJREjP77Y1dGoB7

Downloading...
From: https://drive.google.com/uc?id=19TGf1A2MwlBlYM1ORKJREjP77Y1dGoB7
To: /content/training.1600000.processed.noemoticon.csv
100% 239M/239M [00:01<00:00, 121MB/s]


In [3]:
import pandas as pd
import numpy as np
import re
import re
import string
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import torch
from sklearn.model_selection import train_test_split
from tokenizers import Tokenizer, pre_tokenizers, trainers
from tokenizers.models import BPE
from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

In [4]:
df = pd.read_csv("/content/training.1600000.processed.noemoticon.csv", encoding='latin')

In [5]:
df.columns = ['polarity','id', 'date','query', 'user', 'text']

In [6]:
df['text']

0          is upset that he can't update his Facebook by ...
1          @Kenichan I dived many times for the ball. Man...
2            my whole body feels itchy and like its on fire 
3          @nationwideclass no, it's not behaving at all....
4                              @Kwesidei not the whole crew 
                                 ...                        
1599994    Just woke up. Having no school is the best fee...
1599995    TheWDB.com - Very cool to hear old Walt interv...
1599996    Are you ready for your MoJo Makeover? Ask me f...
1599997    Happy 38th Birthday to my boo of alll time!!! ...
1599998    happy #charitytuesday @theNSPCC @SparksCharity...
Name: text, Length: 1599999, dtype: object

In [7]:
urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
userPattern = '@[^\s]+'
alphaPattern = "[^a-zA-Z0-9]"

def process_tweet(tweet):
  tweet = tweet.lower()

  #Replace all URls
  tweet = re.sub(urlPattern,'',tweet)
  #Removing all userName
  tweet = re.sub(userPattern, '', tweet)
  #Remove punctuations
  tweet = tweet.translate(str.maketrans("","", string.punctuation))

  return tweet

In [8]:
df['preprocessed_tweets'] = df['text'].apply(lambda x: process_tweet(x))
print("Text pre-processing is done")

Text pre-processing is done


In [9]:
df['preprocessed_tweets']

0          is upset that he cant update his facebook by t...
1           i dived many times for the ball managed to sa...
2            my whole body feels itchy and like its on fire 
3           no its not behaving at all im mad why am i he...
4                                        not the whole crew 
                                 ...                        
1599994    just woke up having no school is the best feel...
1599995    thewdbcom  very cool to hear old walt intervie...
1599996    are you ready for your mojo makeover ask me fo...
1599997    happy 38th birthday to my boo of alll time tup...
1599998                             happy charitytuesday    
Name: preprocessed_tweets, Length: 1599999, dtype: object

In [10]:
df['polarity'] = df['polarity'].replace({4: 1})

In [11]:
from tokenizers import Tokenizer, trainers, pre_tokenizers
from tokenizers.models import BPE

In [12]:
# Initialize the tokenizer
tokenizer = Tokenizer(BPE())

# Set pre-tokenizer
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Prepare a trainer for the tokenizer
trainer = BpeTrainer()

# Train the tokenizer on the text data
tokenizer.train_from_iterator(df['preprocessed_tweets'].tolist(), trainer=trainer)

# Enable padding to a length of 60 tokens and truncation to a maximum length of 60 tokens
tokenizer.enable_padding(length=100)
tokenizer.enable_truncation(max_length=100)

# Encode the text column and get the IDs with padding and truncation
df['encoded'] = df['preprocessed_tweets'].apply(lambda x: tokenizer.encode(x).ids)

In [13]:
df['encoded']

0          [136, 1655, 178, 176, 269, 1326, 564, 1295, 40...
1          [18, 13, 2982, 786, 728, 155, 130, 973, 3289, ...
2          [147, 1104, 893, 1191, 5004, 144, 244, 209, 12...
3          [142, 209, 210, 14041, 129, 165, 164, 523, 425...
4          [210, 130, 1104, 3987, 0, 0, 0, 0, 0, 0, 0, 0,...
                                 ...                        
1599994    [201, 937, 190, 601, 142, 503, 136, 130, 555, ...
1599995    [130, 32, 7305, 337, 265, 624, 123, 707, 669, ...
1599996    [214, 143, 427, 155, 257, 10881, 11284, 1135, ...
1599997    [439, 3, 4987, 785, 123, 147, 779, 162, 7325, ...
1599998    [439, 25661, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: encoded, Length: 1599999, dtype: object

In [14]:
df['encoded'].apply(len).max()

100

In [15]:
df['encoded'].apply(len).min()

100

In [16]:
print(df.loc[1, 'encoded'])

[18, 13, 2982, 786, 728, 155, 130, 973, 3289, 123, 1983, 1675, 130, 932, 158, 183, 162, 27911, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [17]:
# Decode a specific row, e.g., the second row (index 1)
encoded_row = df.loc[1, 'encoded']
decoded_text = tokenizer.decode(encoded_row)

# Display the decoded text
print("Decoded text from the second row:")
print(decoded_text)

Decoded text from the second row:
i d ived many times for the ball managed to save 50 the rest go out of bounds 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


In [18]:
x = df['encoded'].values

In [19]:
X = np.array([np.array(lst) for lst in x])

In [20]:
X.shape

(1599999, 100)

In [21]:
y = df['polarity'].values

In [22]:
y.shape

(1599999,)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [24]:
X_train.shape

(1119999, 100)

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Convert your numpy arrays to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [26]:
# Create TensorDatasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)

In [27]:
# Define the LSTM model with an embedding layer
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, num_classes):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [28]:
# Hyperparameters
vocab_size = max(X_train.max(), X_test.max()) + 1  # assuming X_train and X_test contain token IDs starting from 0
embed_size = 128
hidden_size = 512
num_layers = 2
num_classes = len(torch.unique(y_train_tensor))

print(num_classes)

2


In [29]:
model = LSTMClassifier(vocab_size, embed_size, hidden_size, num_layers, num_classes)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
model

LSTMClassifier(
  (embedding): Embedding(30000, 128)
  (lstm): LSTM(128, 512, num_layers=2, batch_first=True)
  (fc): Linear(in_features=512, out_features=2, bias=True)
)

In [30]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [31]:
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to('cuda' if torch.cuda.is_available() else 'cpu'), y_batch.to('cuda' if torch.cuda.is_available() else 'cpu')

        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate on the test set
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to('cuda' if torch.cuda.is_available() else 'cpu'), y_batch.to('cuda' if torch.cuda.is_available() else 'cpu')
            outputs = model(X_batch)
            _, predicted = torch.max(outputs.data, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

        accuracy = 100 * correct / total
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy:.2f}%')

Epoch [1/5], Loss: 0.6931, Accuracy: 50.13%
Epoch [2/5], Loss: 0.3849, Accuracy: 80.73%
Epoch [3/5], Loss: 0.3718, Accuracy: 82.59%
Epoch [4/5], Loss: 0.3750, Accuracy: 83.01%
Epoch [5/5], Loss: 0.2615, Accuracy: 82.83%


In [32]:
# Function to predict polarity for a given text
def predict_polarity(text, tokenizer, model):
    # Encode the input text
    encoded_text = tokenizer.encode(text).ids
    # Pad/truncate to the required length
    encoded_text = encoded_text[:100] + [0] * (100 - len(encoded_text))
    # Convert to tensor
    input_tensor = torch.tensor(encoded_text, dtype=torch.long).unsqueeze(0)
    input_tensor = input_tensor.to('cuda' if torch.cuda.is_available() else 'cpu')
    # Predict polarity
    model.eval()
    with torch.no_grad():
        output = model(input_tensor)
        _, predicted = torch.max(output.data, 1)
    return predicted.item()

data = "This is a good example"
polarity = predict_polarity(data, tokenizer, model)
print(f'The polarity of the given text "{data}" is {polarity}')

The polarity of the given text "This is a good example" is 1
