In [None]:
# Logistic regression using PyTorch

In [1]:
from torchtext import data
import torch
from torch import nn
from torch.nn import functional as F

In [2]:
if torch.cuda.is_available():
  DEVICE = torch.device("cuda")
else:
  DEVICE = torch.device("cpu")

In [3]:
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
import nltk
import re
import os
import pandas as pd

nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [5]:
POS = "pos"
NEG = "neg"

In [6]:
text_sentiments = (POS, NEG)

example = []

for sentiment in text_sentiments:
  for filename in os.listdir(os.path.join(nltk.corpus.movie_reviews.root.path, sentiment)):
    with open(os.path.join(nltk.corpus.movie_reviews.root.path, sentiment, filename), "r", encoding="utf-8") as file:
      example.append({"text": file.read().strip(),
                      "sentiment": int(sentiment == POS)})

In [7]:
examples_df = pd.DataFrame(example)
examples_df.sample(5)

Unnamed: 0,text,sentiment
1612,"absolute power , the new film produced and dir...",0
1091,you think that these people only exist in the ...,0
1603,running time approximately 1hr 40mins \nreview...,0
836,i had been expecting more of this movie than t...,1
1219,it is movies like these that make a jaded movi...,0


In [8]:
examples_df = examples_df.sample(frac=1)
train_df = examples_df.sample(frac=0.7)
test_df = examples_df.drop(index=train_df.index)
train_texts, train_labels = train_df["text"].values, train_df["sentiment"].values
test_texts, test_labels = test_df["text"].values, test_df["sentiment"].values

In [9]:
test_labels[:5]

array([1, 1, 1, 1, 0])

In [10]:
len(test_df.text.values), len(test_df.sentiment.values), len(test_labels)

(600, 600, 600)

In [11]:
from typing import List, Dict, Any, Iterable
from collections import Counter, OrderedDict
import math
from itertools import islice

In [12]:
class TfIdfVectorizer:
  

  def __init__(self, lower=True, tokenizer_pattern=r"(?i)\b[a-z]{2,}\b"):
    self.lower = lower
    self.tokenizer_pattern = re.compile(tokenizer_pattern)
    self.vocab_df = OrderedDict()

  def __tokenize(self, text: str) -> List[str]:
    return self.tokenizer_pattern.findall(text.lower() if self.lower else text)

  def fit(self, texts: Iterable[str]):
    term_id = 0
    for doc_idx, doc in enumerate(texts):
      tokenized = self.__tokenize(doc)
      for term in tokenized:
        if term not in self.vocab_df:
          self.vocab_df[term] = {}
          self.vocab_df[term]["doc_ids"] = {doc_idx}
          self.vocab_df[term]["doc_count"] = 1
          self.vocab_df[term]["id"] = term_id
          term_id += 1
        elif doc_idx not in self.vocab_df[term]["doc_ids"]:
          self.vocab_df[term]["doc_ids"].add(doc_idx)
          self.vocab_df[term]["doc_count"] += 1
    texts_len = len(texts)
    for term in self.vocab_df:
      self.vocab_df[term]["idf"] = math.log(texts_len / self.vocab_df[term]["doc_count"])

  def transform(self, texts: Iterable[str]) -> torch.sparse.LongTensor:
    values = []
    doc_indices = []
    term_indices = []
    for doc_idx, raw_doc in enumerate(texts):
      term_counter = {}
      for token in self.__tokenize(raw_doc):
        if token in self.vocab_df:
          term = self.vocab_df[token]
          term_idx = term["id"]
          term_idf = term["idf"]
          if term_idx not in term_counter:
            term_counter[term_idx] = term_idf
          else:
            term_counter[term_idx] += term_idf
      term_indices.extend(term_counter.keys())
      values.extend(term_counter.values())
      doc_indices.extend([doc_idx] * len(term_counter))
    indices = torch.LongTensor([doc_indices, term_indices], device=DEVICE)
    values_tensor = torch.LongTensor(values, device=DEVICE)
    tf_idf = torch.sparse.LongTensor(indices,
                                     values_tensor,
                                     torch.Size([len(texts), len(self.vocab_df)]),
                                     device=DEVICE)
    return tf_idf

In [13]:
%%time
vectorizer = TfIdfVectorizer()
vectorizer.fit(train_texts)

CPU times: user 943 ms, sys: 28.5 ms, total: 971 ms
Wall time: 976 ms


In [14]:
%%time
train_data = vectorizer.transform(train_texts)
test_data = vectorizer.transform(test_texts)

CPU times: user 1.18 s, sys: 25.4 ms, total: 1.21 s
Wall time: 1.27 s


In [15]:
# Making the dataset iterable
from torch.utils.data import DataLoader, Dataset

In [16]:
BATCH_SIZE = 64
train_data_loader = DataLoader(train_texts, batch_size=BATCH_SIZE)
test_data_loader = DataLoader(test_texts, batch_size=BATCH_SIZE)

In [17]:
def batch(iterable, n=1):
  l = len(iterable)
  for ndx in range(0, l, n):
    yield iterable[ndx:min(ndx+n, l)]

In [18]:
# Build the model
class LogisticRegressionModel(nn.Module):

  def __init__(self, input_dim, output_dim):
    super(LogisticRegressionModel, self).__init__()
    self.linear_1 = nn.Linear(input_dim, output_dim)

  def forward(self, x):
    x = F.softmax(self.linear_1(x))
    return x

In [19]:
model = LogisticRegressionModel(len(vectorizer.vocab_df), 2)

In [20]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [21]:
print(model.parameters())
print(len(list(model.parameters())))
print(list(model.parameters())[0].size())
print(list(model.parameters())[1].size())

<generator object Module.parameters at 0x7fe95fe1e200>
2
torch.Size([2, 33771])
torch.Size([2])


In [22]:
NUM_EPOCHS = 5
iteration = 0
for epoch in range(NUM_EPOCHS):
  print(f"Epoch # {epoch}")
  for i, (texts, labels) in enumerate(zip(train_data_loader, batch(train_labels, BATCH_SIZE))):
    labels = torch.LongTensor(labels)
    texts = F.normalize(vectorizer.transform(texts).to(torch.float).to_dense()).requires_grad_()
    #print(texts.size(), labels.size())
    optimizer.zero_grad()
    outputs = model(texts)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    iteration += 1

    if iteration % 50 == 0:
      correct = 0
      total = 0
      for test_texts, test_labels_batch in zip(test_data_loader, batch(test_labels, BATCH_SIZE)):
        test_texts = F.normalize(vectorizer.transform(test_texts).to(torch.float).to_dense())
        test_labels_batch = torch.Tensor(test_labels_batch).to(torch.long)

        outputs = model(test_texts)
        _, predicted = torch.max(outputs.data, 1)
        total += test_labels_batch.size(0)
        correct += (predicted == test_labels_batch).sum()
      accuracy = 100 * correct / total

      print(f"Iteration: {iteration}. Loss {loss.item()}. Accuracy {accuracy}")

Epoch # 0


  if __name__ == '__main__':


Epoch # 1
Epoch # 2
Iteration: 50. Loss 0.6931905150413513. Accuracy 47.5
Epoch # 3
Epoch # 4
Iteration: 100. Loss 0.6933770775794983. Accuracy 47.16666793823242
