# Text Classfication

- [Dataset Link](https://huggingface.co/datasets/rotten_tomatoes)

In [1]:
!pip install datasets --quiet

[0m

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from datasets import load_dataset

train = load_dataset('rotten_tomatoes', split = 'train')

In [4]:
val = load_dataset('rotten_tomatoes', split = 'validation')
test = load_dataset('rotten_tomatoes', split = 'test')

In [5]:
import pandas as pd

traindf = pd.DataFrame(train)
valdf = pd.DataFrame(val)
testdf = pd.DataFrame(test)

In [6]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

engStopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer(language = 'english')

In [8]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def tokenize(text):
    return [stemmer.stem(word) for word in word_tokenize(text)]

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(lowercase=True, 
                             tokenizer=tokenize,
                             stop_words=engStopwords,
                             max_features=2000)

In [10]:
vectorizer.fit(traindf.text)

In [11]:
", ".join(vectorizer.get_feature_names_out().tolist()[:100])

"!, $, &, ', 'd, 'm, 's, (, ), *, ,, -, --, ., /, 10, 2, 2002, 60s, 70s, 90, :, ;, ?, [, ], ``, abandon, abil, abl, abov, absolut, absorb, absurd, abus, accent, accept, access, accomplish, account, accur, achiev, across, act, action, actor, actress, actual, ad, adam, adapt, add, addit, admir, admit, adolesc, adult, adventur, affair, affect, afraid, after-school, age, aggress, ago, aim, aimless, air, alabama, alien, aliv, allen, allow, almost, alon, along, alreadi, also, altern, although, alway, amateurish, amaz, ambigu, ambit, ambiti, america, american, ami, among, amount, amus, analyz, anchor, angel, angst, ani, anim, annoy, anoth"

In [12]:
%%time

trainInputs = vectorizer.transform(traindf.text).toarray()
trainTargets = traindf.label.to_numpy()

valInputs = vectorizer.transform(valdf.text).toarray()
valTargets = valdf.label.to_numpy()

testInputs = vectorizer.transform(testdf.text).toarray()
testTargets = testdf.label.to_numpy()

CPU times: user 6.32 s, sys: 73 ms, total: 6.39 s
Wall time: 6.4 s


In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [14]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

Device: cuda


In [15]:
from torch.utils.data import TensorDataset, DataLoader


train_data = TensorDataset(torch.from_numpy(trainInputs), torch.from_numpy(trainTargets))
valid_data = TensorDataset(torch.from_numpy(valInputs), torch.from_numpy(valTargets))
test_data = TensorDataset(torch.from_numpy(testInputs), torch.from_numpy(testTargets))

In [16]:
BATCH_SIZE = 64


trainLoader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
validLoader = DataLoader(valid_data, shuffle=True, batch_size=BATCH_SIZE)
testLoader = DataLoader(test_data, shuffle=True, batch_size=BATCH_SIZE)

In [17]:
!pip install pytorch_lightning --quiet

[0m

In [18]:
import pytorch_lightning as pl

In [19]:
VOCAB_SIZE = len(vectorizer.get_feature_names_out().tolist())
VOCAB_SIZE

2000

In [20]:
emb = nn.Embedding(VOCAB_SIZE, 256, 1)
rnn = nn.RNN(256, 128, 1, batch_first = True)

In [21]:
for batch in trainLoader:
    b_inputs, b_targets = batch
    print('b_input.shape', b_inputs.shape)
    print('b_targets.shape', b_targets.shape)

    emb_out = emb(b_inputs)
    print('emb_out.shape', emb_out.shape)

    rnn_out, hn = rnn(emb_out)
    print('rnn_out.shape', rnn_out.shape)
    print('hn.shape', hn.shape)

    break

b_input.shape torch.Size([64, 2000])
b_targets.shape torch.Size([64])
emb_out.shape torch.Size([64, 2000, 256])
rnn_out.shape torch.Size([64, 2000, 128])
hn.shape torch.Size([1, 64, 128])


In [22]:
class Classifier(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(VOCAB_SIZE, 256, 1)
        self.lstm = nn.LSTM(256, 128, 1, batch_first = True)
        self.linear = nn.Linear(128, 1)
        self.learning_rate = 0.001
    
    def forward(self, x):
        out = self.emb(x)
        out, hn = self.lstm(out)
        out = F.relu(out[:,-1,:])
        out = self.linear(out)
        return out

In [27]:
sample = Classifier()

for batch in trainLoader:
    value, targets = batch
    print('b_input.shape', value.shape)
    print('b_targets.shape', targets.shape)

    output = sample(value)
    print(output.shape)
    probs = torch.sigmoid(output)
    print(probs.shape)
    break

b_input.shape torch.Size([64, 2000])
b_targets.shape torch.Size([64])
torch.Size([64, 1])
torch.Size([64, 1])
