In [1]:
!pip install lightning

Collecting lightning
  Downloading lightning-2.2.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.8.0 (from lightning)
  Downloading lightning_utilities-0.10.1-py3-none-any.whl (24 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.3.1-py3-none-any.whl (840 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.4/840.4 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.2.1-py3-none-any.whl (801 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m801.6/801.6 kB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch<4.0,>=1.13.0->lightning)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import word_tokenize
import nltk
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader
import random

# Fetch the NLTK resources used below: the movie_reviews corpus, the tokenizer
# models behind word_tokenize, and the English stop-word list.
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# the tokenizer data from 'punkt_tab' instead of 'punkt'. By default
# nltk.download() reports an unavailable resource and returns False rather
# than raising, so asking for both should be harmless on older releases.
for resource in ('movie_reviews', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Set membership makes the per-token stop-word check in preprocess_text cheap.
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text):
    """Normalize one raw document into a space-separated token string.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are in ``stop_words`` or are not purely alphanumeric are discarded.

    Args:
        text: Raw document text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if token.isalnum() and token not in stop_words:
            kept.append(token)
    return " ".join(kept)

# Load and preprocess the NLTK movie_reviews dataset
def load_and_preprocess_data(seed=42):
    """Load every movie_reviews file, preprocess it, and attach a label.

    Args:
        seed: Seed for the private RNG that shuffles the file order. The
            default makes the shuffle (and therefore any later positional
            train/test split) identical on every run; pass ``None`` for an
            unseeded shuffle.

    Returns:
        A ``(documents, labels)`` pair: a list of preprocessed review strings
        and a NumPy array of labels in the same order (0 for ids whose first
        path component is 'neg', 1 otherwise).
    """
    # sorted() gives a fixed starting order and, importantly, a fresh list:
    # shuffling the object returned by fileids() directly could reorder a list
    # owned by the corpus reader, so a re-run would start from an
    # already-shuffled order.
    movie_list = sorted(movie_reviews.fileids())
    # A dedicated Random instance keeps the shuffle independent of (and from
    # disturbing) the global `random` module state.
    random.Random(seed).shuffle(movie_list)

    documents = []
    labels = []
    for file_id in movie_list:
        documents.append(preprocess_text(movie_reviews.raw(file_id)))
        # Assign labels: 0 for negative and 1 for positive
        labels.append(0 if file_id.split('/')[0] == 'neg' else 1)

    return documents, np.array(labels)

# Load and preprocess data
documents, labels = load_and_preprocess_data()
assert len(documents) == len(labels), "each document needs exactly one label"

# Split the data into training and testing sets (first 80% train, rest test;
# the order comes from the shuffle inside load_and_preprocess_data)
split_ratio = 0.8
split_index = int(len(documents) * split_ratio)
train_texts, test_texts = documents[:split_index], documents[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

# Compute TF-IDF
# The vectorizer is fitted on the training texts only and then applied to the
# test texts, so no test-set statistics leak into the features.
vectorizer = TfidfVectorizer(max_features=20000)  # Adjust max_features to your dataset size
train_tfidf = vectorizer.fit_transform(train_texts)
test_tfidf = vectorizer.transform(test_texts)

# Convert to PyTorch tensors (dense float features, integer class labels)
train_tfidf_tensor = torch.from_numpy(train_tfidf.toarray()).float()
test_tfidf_tensor = torch.from_numpy(test_tfidf.toarray()).float()
train_labels_tensor = torch.from_numpy(train_labels).long()
test_labels_tensor = torch.from_numpy(test_labels).long()

# Create TensorDatasets and DataLoaders
batch_size = 16
train_dataset = TensorDataset(train_tfidf_tensor, train_labels_tensor)
test_dataset = TensorDataset(test_tfidf_tensor, test_labels_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation does not benefit from shuffling; a fixed order keeps test runs
# comparable and avoids Lightning's warning about shuffled test dataloaders.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [20]:
import lightning as L

class LightningSimpleTextClassifier(L.LightningModule):
    """Feed-forward classifier: Linear -> ReLU -> Dropout(0.5) -> Linear.

    Trained with cross-entropy loss and the Adam optimizer.

    Args:
        input_dim: Size of each input feature vector.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output logits.
        learning_rate: Learning rate handed to Adam.
    """

    def __init__(self, input_dim, hidden_size, num_classes, learning_rate=0.01):
        super().__init__()
        # Store the constructor arguments in self.hparams (and therefore in
        # saved checkpoints) so load_from_checkpoint() can rebuild the model
        # without the caller re-supplying them.
        self.save_hyperparameters()

        self.fc1 = nn.Linear(input_dim, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.learning_rate = learning_rate

        # Loss function
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return the raw class logits for a batch of feature vectors."""
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out

    def configure_optimizers(self):
        """Create the Adam optimizer over all parameters."""
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    def training_step(self, batch, batch_idx):
        """Compute, log (as 'train_loss') and return the loss for one batch."""
        features, labels = batch
        outputs = self(features)
        loss = self.criterion(outputs, labels)
        self.log("train_loss", loss)
        return loss

    def test_step(self, batch, batch_idx):
        """Log 'test_loss' and 'test_accuracy' for one batch and return both."""
        features, labels = batch
        outputs = self(features)
        loss = self.criterion(outputs, labels)

        # Calculate accuracy as a tensor: the fraction of rows whose highest
        # logit matches the label. Staying on-device avoids the host
        # synchronisation that .item() would force on every batch.
        _, predicted = torch.max(outputs, 1)
        accuracy = predicted.eq(labels).float().mean()

        # Log test loss and test accuracy
        self.log('test_loss', loss)
        self.log('test_accuracy', accuracy)

        return {'test_loss': loss, 'test_accuracy': accuracy}


# Hyperparameters
# Input width = number of features in the TF-IDF vectors (second dimension
# of the training tensor).
input_dim = train_tfidf_tensor.size(1)
hidden_size, num_classes = 1500, 2

In [21]:
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping

# Seed the Python, NumPy and PyTorch RNGs before the model is built so that
# weight initialisation, dropout and batch shuffling are repeatable.
L.seed_everything(42, workers=True)

# Initialize the Lightning model
model = LightningSimpleTextClassifier(input_dim, hidden_size, num_classes, learning_rate=0.01)

# Instantiate built-in callbacks (optional)
# NOTE(review): both callbacks monitor 'train_loss' because fit() below is
# given only a training loader; a held-out validation metric would be a
# better signal for checkpoint selection and early stopping.
checkpoint_callback = ModelCheckpoint(dirpath='checkpoints/', save_top_k=1, verbose=True, monitor='train_loss', mode='min')
# NOTE(review): with patience=3 and max_epochs=3 early stopping presumably
# never gets a chance to trigger - confirm whether that is intended.
early_stopping_callback = EarlyStopping(monitor='train_loss', patience=3)

# Trainer
trainer = L.Trainer(max_epochs=3, callbacks=[checkpoint_callback, early_stopping_callback])
trainer.fit(model, train_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:653: Checkpoint directory /content/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name      | Type             | Params
-----------------------------------------------
0 | fc1       | Linear           | 30.0 M
1 | relu      | ReLU             | 0     
2 | fc2       | Linear           | 3.0 K 
3 | dropout   | Dropout          | 0     
4 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
30.0 M    Trainable params
0  

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 100: 'train_loss' reached 0.29183 (best 0.29183), saving model to '/content/checkpoints/epoch=0-step=100.ckpt' as top 1
INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 200: 'train_loss' reached 0.00185 (best 0.00185), saving model to '/content/checkpoints/epoch=1-step=200.ckpt' as top 1
INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 300: 'train_loss' reached 0.00147 (best 0.00147), saving model to '/content/checkpoints/epoch=2-step=300-v1.ckpt' as top 1
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [26]:
# Put the module in evaluation mode, then run the test loop on the held-out
# loader. The value returned by trainer.test() is the cell's displayed output.
model.eval()
trainer.test(model=model, dataloaders=test_loader)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.5492565631866455, 'test_accuracy': 0.8499999642372131}]