## 1: Dataset Preparation

In [1]:
import os

# Importing files
if os.path.exists("train.tsv"):
    print("File exists")
else:
    !wget https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/train.tsv

if os.path.exists("test.tsv"):
    print("File exists")
else:
    !wget https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/test.tsv

if os.path.exists("IMDB-Dataset.csv"):
    print("File exists")
else:
    !wget https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv


File exists
File exists
File exists


In [2]:
# Loading the datasets
import pandas as pd
import numpy as np

def extractfiles(file):
    df = pd.read_csv(file, sep='\t', header=None)
    df.columns = ['text', 'label']

    # Extracting text
    df['text'] = df['text'].astype(str).apply(lambda x: x.replace('\t', ''))

    # Extracting labels
    df['label'] = df['label'].astype(str).apply(lambda x: x.replace('\n', ''))
    df['label'] = df['label'].astype(int)

    # Printing counts to ensure no imbalance of classes
    print("Counts of each label:")
    print(df['label'].value_counts())

    df.head(10)

    return df

In [3]:
traindf = extractfiles("train.tsv") 
test = extractfiles("test.tsv")

Counts of each label:
label
1    3610
0    3310
Name: count, dtype: int64
Counts of each label:
label
0    912
1    909
Name: count, dtype: int64


In [4]:
# Split the dataset into train and validation
from sklearn.model_selection import train_test_split

train, val = train_test_split(traindf, test_size=0.2, random_state=11)
print(f"Train shape: {train.shape}, Validation shape: {val.shape}")

train.head()

Train shape: (5536, 2), Validation shape: (1384, 2)


Unnamed: 0,text,label
1934,"for those in search of something different , w...",1
3425,"yes , mibii is rote work and predictable , but...",1
6025,this is an insultingly inept and artificial ex...,0
6478,"plunges you into a reality that is , more ofte...",1
1084,the problem with the mayhem in formula 51 is n...,0


## 2: Model Construction

In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Model architecture
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(10000, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 2)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.dropout(F.relu(self.fc3(x)))
        x = self.dropout(F.relu(self.fc4(x)))
        return x

In [34]:
# Instantiate model
model = NeuralNetwork()

# Print summary
from torchsummary import summary
summary(model, (1, 10000)) # 1 is the batch size and 10000 is the input size

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1               [-1, 1, 512]       5,120,512
           Dropout-2               [-1, 1, 512]               0
            Linear-3               [-1, 1, 256]         131,328
           Dropout-4               [-1, 1, 256]               0
            Linear-5               [-1, 1, 128]          32,896
           Dropout-6               [-1, 1, 128]               0
            Linear-7                [-1, 1, 64]           8,256
           Dropout-8                [-1, 1, 64]               0
Total params: 5,292,992
Trainable params: 5,292,992
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.04
Forward/backward pass size (MB): 0.01
Params size (MB): 20.19
Estimated Total Size (MB): 20.24
----------------------------------------------------------------


In [35]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {total_params}")

Number of parameters: 5293122


In [56]:
train.index[:5]

Index([1934, 3425, 6025, 6478, 1084], dtype='int64')

## 3: Bag-of-words

In [21]:
# Implementing Bag of Words on the text data
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate CountVectorizer
vectorizer = CountVectorizer(max_features=10000)

# Fit and transform the training data
train_features = vectorizer.fit_transform(train['text'])
train_features = train_features.toarray()

# Transform the validation data
val_features = vectorizer.transform(val['text'])
val_features = val_features.toarray()

# Transform the test data
test_features = vectorizer.transform(test['text'])
test_features = test_features.toarray()

# Print the shapes (number of samples, number of features)
print(f"Train features shape: {train_features.shape}")
print(f"Validation features shape: {val_features.shape}")
print(f"Test features shape: {test_features.shape}")

# Print length of vocab
print(f"Length of vocab: {len(vectorizer.vocabulary_)}")

# Print feature names
print("Feature names:")
print(vectorizer.get_feature_names_out()[:5])

Train features shape: (5536, 10000)
Validation features shape: (1384, 10000)
Test features shape: (1821, 10000)
Length of vocab: 10000
Feature names:
['000' '10' '100' '101' '103']


In [22]:
# Convert the features to tensors
train_features = torch.tensor(train_features, dtype=torch.float32)
val_features = torch.tensor(val_features, dtype=torch.float32)
test_features = torch.tensor(test_features, dtype=torch.float32)

# Convert the labels to tensors
train_labels = torch.tensor(train['label'].values, dtype=torch.int64)
val_labels = torch.tensor(val['label'].values, dtype=torch.int64)
test_labels = torch.tensor(test['label'].values, dtype=torch.int64)

# Print the shapes
print(f"Train features shape: {train_features.shape}, Train labels shape: {train_labels.shape}")
print(f"Validation features shape: {val_features.shape}, Validation labels shape: {val_labels.shape}")
print(f"Test features shape: {test_features.shape}, Test labels shape: {test_labels.shape}")

Train features shape: torch.Size([5536, 10000]), Train labels shape: torch.Size([5536])
Validation features shape: torch.Size([1384, 10000]), Validation labels shape: torch.Size([1384])
Test features shape: torch.Size([1821, 10000]), Test labels shape: torch.Size([1821])


## 4: Constructing a function to use LLaMa-3.1 embeddings on the same model

In [91]:
from transformers import AutoTokenizer, AutoModel

# Load LLaMA-3.1 model and tokenizer
class LLaMaEmbedder:
    def __init__(self, model_name="meta-llama/Llama-3.1-8B", device="cuda" if torch.cuda.is_available() else "cpu"):
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(device)
        self.embedding_size = self.model.config.hidden_size
        self.model_loaded = True

    def get_embedding(self, text):
        """Generate sentence embeddings using LLaMA-3.1-8B"""
        tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device)
        with torch.no_grad():
            outputs = self.model(**tokens)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling over token embeddings
        return embeddings.cpu().numpy()

# Initialize LLaMA embedder
llama_embedder = LLaMaEmbedder()


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.1-8B.
401 Client Error. (Request ID: Root=1-67cc6ab7-53b93e6a272c1f1b7591d5e6;1e9131a8-f0e0-40ec-8578-29d617b7e280)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B is restricted. You must have access to it and be authenticated to access it. Please log in.

In [24]:
import os
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

# Make sure you have defined your token (e.g., read from a file)
with open("api.txt", "r") as f:
    token = f.read().strip()

# Load LLaMA-3.1 model and tokenizer
class LLaMaEmbedder:
    def __init__(self, model_name="meta-llama/Llama-3.1-8B", device=None):
        # Determine the appropriate device
        if device is None:
            if torch.cuda.is_available():
                device = "cuda"
            elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
                device = "mps"  # Mac M1/M2 GPU
            else:
                device = "cpu"
        
        print(f"Using device: {device}")
        self.device = device
        
        # Use the correct parameter name for authentication
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
        
        # Use FP16 only if on GPU (CUDA/MPS); use FP32 for CPU
        dtype = torch.float16 if device in ["cuda", "mps"] else torch.float32
        self.model = AutoModel.from_pretrained(model_name, torch_dtype=dtype, use_auth_token=token).to(device)
        
        self.embedding_size = self.model.config.hidden_size
        self.model_loaded = True

    def get_embeddings(self, texts, batch_size=16):
        embeddings_list = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            tokens = self.tokenizer(
                batch, 
                return_tensors="pt", 
                padding=True, 
                truncation=True, 
                max_length=512
            ).to(self.device)
            with torch.no_grad():
                outputs = self.model(**tokens)
            # Mean pooling over token embeddings
            embeddings = outputs.last_hidden_state.mean(dim=1)
            embeddings_list.append(embeddings.cpu().numpy())
        return np.concatenate(embeddings_list, axis=0)

# Initialize LLaMA embedder
llama_embedder = LLaMaEmbedder()


Using device: mps


Downloading shards:  50%|█████     | 2/4 [00:07<00:07,  3.67s/it]


KeyboardInterrupt: 

In [23]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

# Determine the appropriate device
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"  # Mac M1/M2 GPU
else:
    device = "cpu"
print(f"Using device: {device}")

# Specify the model name for bert-base-uncased
model_name = "bert-base-uncased"

class BertEmbedder:
    def __init__(self, model_name=model_name, device=device):
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        # Use FP16 if GPU is available, else default to FP32 for CPU
        dtype = torch.float16 if device == "cuda" else torch.float32
        self.model = AutoModel.from_pretrained(model_name, torch_dtype=dtype).to(device)
        
        self.embedding_size = self.model.config.hidden_size
        self.model_loaded = True

    def get_embeddings(self, texts, batch_size=16):
        embeddings_list = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            tokens = self.tokenizer(
                batch, 
                return_tensors="pt", 
                padding=True, 
                truncation=True, 
                max_length=512
            ).to(self.device)
            with torch.no_grad():
                outputs = self.model(**tokens)
            # Mean pooling over token embeddings to generate a fixed-size sentence embedding
            embeddings = outputs.last_hidden_state.mean(dim=1)
            embeddings_list.append(embeddings.cpu().numpy())
        return np.concatenate(embeddings_list, axis=0)

# Initialize the Bert embedder
embedder = BertEmbedder()

# Example usage: Generate embeddings for sample texts
sample_texts = ["Hello world!", "How are you today?"]
embeddings = embedder.get_embeddings(sample_texts)
print("Embeddings shape:", embeddings.shape)


Using device: mps
Embeddings shape: (2, 768)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import gzip
import torch

# Move model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

# Define the batch size
batch_size = 32

# Verify the shapes of the features and labels
print(f"Train features shape: {train_features.shape}, Train labels shape: {train_labels.shape}")
print(f"Validation features shape: {val_features.shape}, Validation labels shape: {val_labels.shape}")
print(f"Test features shape: {test_features.shape}, Test labels shape: {test_labels.shape}")
# They should be torch.Size([num_samples, num_features]) and torch.Size([num_samples])

# Create Tensor datasets
train_dataset = TensorDataset(train_features, train_labels)
val_dataset = TensorDataset(val_features, val_labels)
test_dataset = TensorDataset(test_features, test_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Define the number of epochs
n_epochs = 10
best_val_loss = float('inf')  # Track the best validation loss

# Training loop
for epoch in range(n_epochs):
    model.train()
    train_loss = 0.0
    
    for inputs, labels in train_loader:
        # Move data to device
        inputs, labels = inputs.to(device).float(), labels.to(device).long()

        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        
        # Compute loss
        loss = criterion(outputs, labels)
        
        # Backward pass & optimization
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()

    train_loss /= len(train_loader)  # Average training loss

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device).float(), labels.to(device).long()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Accuracy calculation
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    val_loss /= len(val_loader)
    val_accuracy = correct / total

    # Save the best model based on validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        #torch.save(model.state_dict(), 'checkpoint.pt')
        checkpoint = {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'epoch': epoch,
            'best_val_loss': best_val_loss
        }
        # Using gzip to compress the checkpoint file
        with gzip.open("checkpoint.pt.gz", "wb") as f:
            torch.save(checkpoint, f)

    # Print epoch results
    print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")


Train features shape: torch.Size([5536, 10000]), Train labels shape: torch.Size([5536])
Validation features shape: torch.Size([1384, 10000]), Validation labels shape: torch.Size([1384])
Test features shape: torch.Size([1821, 10000]), Test labels shape: torch.Size([1821])
Epoch 1/10, Train Loss: 2.3904, Val Loss: 1.3294, Val Acc: 0.5340
Epoch 2/10, Train Loss: 2.0814, Val Loss: 1.1735, Val Acc: 0.7442
Epoch 3/10, Train Loss: 1.7000, Val Loss: 1.0608, Val Acc: 0.7478
Epoch 4/10, Train Loss: 1.4466, Val Loss: 1.0152, Val Acc: 0.7767
Epoch 5/10, Train Loss: 1.3285, Val Loss: 1.2791, Val Acc: 0.7572
Epoch 6/10, Train Loss: 1.2753, Val Loss: 1.4355, Val Acc: 0.7536
Epoch 7/10, Train Loss: 1.3143, Val Loss: 1.6327, Val Acc: 0.7558
Epoch 8/10, Train Loss: 1.2903, Val Loss: 1.7345, Val Acc: 0.7500
Epoch 9/10, Train Loss: 1.2712, Val Loss: 1.7553, Val Acc: 0.7565
Epoch 10/10, Train Loss: 1.2906, Val Loss: 1.8324, Val Acc: 0.7514


In [37]:
# Load the best model
#model.load_state_dict(torch.load('checkpoint.pt'))

# Gzip
with gzip.open("checkpoint.pt.gz", "rb") as f:
    checkpoint = torch.load(f, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

  checkpoint = torch.load(f, map_location=torch.device('cpu'))


In [None]:
# Use the checkpoint from before and train on the IMDB dataset (Dataset 2)
