## 1: Dataset Preparation

In [1]:
import os

# Importing files
if os.path.exists("train.tsv"):
    print("File exists")
else:
    !wget https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/train.tsv

if os.path.exists("test.tsv"):
    print("File exists")
else:
    !wget https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/test.tsv

if os.path.exists("IMDB-Dataset.csv"):
    print("File exists")
else:
    !wget https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv


File exists
File exists
File exists


In [3]:
# Loading the datasets
import pandas as pd
import numpy as np

def extractfiles(file):
    df = pd.read_csv(file, sep='\t', header=None)
    df.columns = ['text', 'label']

    # Extracting text
    df['text'] = df['text'].astype(str).apply(lambda x: x.replace('\t', ''))

    # Extracting labels
    df['label'] = df['label'].astype(str).apply(lambda x: x.replace('\n', ''))
    df['label'] = df['label'].astype(int)

    # Printing counts to ensure no imbalance of classes
    print("Counts of each label:")
    print(df['label'].value_counts())

    df.head(10)

    return df

In [5]:
traindf = extractfiles("train.tsv") 
test = extractfiles("test.tsv")

Counts of each label:
label
1    3610
0    3310
Name: count, dtype: int64
Counts of each label:
label
0    912
1    909
Name: count, dtype: int64


In [10]:
# Split the dataset into train and validation
from sklearn.model_selection import train_test_split

train, val = train_test_split(traindf, test_size=0.2, random_state=11)
print(f"Train shape: {train.shape}, Validation shape: {val.shape}")

train.head()

Train shape: (5536, 2), Validation shape: (1384, 2)


Unnamed: 0,text,label
1934,"for those in search of something different , w...",1
3425,"yes , mibii is rote work and predictable , but...",1
6025,this is an insultingly inept and artificial ex...,0
6478,"plunges you into a reality that is , more ofte...",1
1084,the problem with the mayhem in formula 51 is n...,0


## 2: Model Construction

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Model architecture
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(10000, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 2)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.dropout(F.relu(self.fc3(x)))
        x = self.dropout(F.relu(self.fc4(x)))
        x = self.dropout(F.relu(self.fc5(x)))
        return x

In [None]:
# Instantiate model
model = NeuralNetwork()

# Print summary
from torchsummary import summary
summary(model, (1, 10000)) # 1 is the batch size and 10000 is the input size

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1               [-1, 1, 512]       5,120,512
           Dropout-2               [-1, 1, 512]               0
            Linear-3               [-1, 1, 256]         131,328
           Dropout-4               [-1, 1, 256]               0
            Linear-5               [-1, 1, 128]          32,896
           Dropout-6               [-1, 1, 128]               0
            Linear-7                [-1, 1, 64]           8,256
           Dropout-8                [-1, 1, 64]               0
            Linear-9                 [-1, 1, 2]             130
          Dropout-10                 [-1, 1, 2]               0
Total params: 5,293,122
Trainable params: 5,293,122
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.04
Forward/backward pass size (MB): 0.01
Params size (MB): 20.19
Estima

In [15]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {total_params}")

Number of parameters: 5293122


In [56]:
train.index[:5]

Index([1934, 3425, 6025, 6478, 1084], dtype='int64')

## 3: Bag-of-words

In [None]:
# Implementing Bag of Words on the text data
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate CountVectorizer
vectorizer = CountVectorizer(max_features=10000)

# Fit and transform the training data
train_features = vectorizer.fit_transform(train['text'])
train_features = train_features.toarray()

# Transform the validation data
val_features = vectorizer.transform(val['text'])
val_features = val_features.toarray()

# Transform the test data
test_features = vectorizer.transform(test['text'])
test_features = test_features.toarray()

# Print the shapes (number of samples, number of features)
print(f"Train features shape: {train_features.shape}")
print(f"Validation features shape: {val_features.shape}")
print(f"Test features shape: {test_features.shape}")

# Print length of vocab
print(f"Length of vocab: {len(vectorizer.vocabulary_)}")

# Print feature names
print("Feature names:")
print(vectorizer.get_feature_names_out()[:5])

Train features shape: (5536, 10000)
Validation features shape: (1384, 10000)
Test features shape: (1821, 10000)
Length of vocab: 10000
Feature names:
['000' '10' '100' '101' '103']


In [80]:
# Convert the features to tensors
train_features = torch.tensor(train_features, dtype=torch.float32)
val_features = torch.tensor(val_features, dtype=torch.float32)
test_features = torch.tensor(test_features, dtype=torch.float32)

# Convert the labels to tensors
train_labels = torch.tensor(train['label'].values, dtype=torch.int64)
val_labels = torch.tensor(val['label'].values, dtype=torch.int64)
test_labels = torch.tensor(test['label'].values, dtype=torch.int64)

# Print the shapes
print(f"Train features shape: {train_features.shape}, Train labels shape: {train_labels.shape}")
print(f"Validation features shape: {val_features.shape}, Validation labels shape: {val_labels.shape}")
print(f"Test features shape: {test_features.shape}, Test labels shape: {test_labels.shape}")

Train features shape: torch.Size([5536, 10000]), Train labels shape: torch.Size([5536])
Validation features shape: torch.Size([1384, 10000]), Validation labels shape: torch.Size([1384])
Test features shape: torch.Size([1821, 10000]), Test labels shape: torch.Size([1821])


## 4: Constructing a function to use LLaMa-3.1 embeddings on the same model

In [91]:
from transformers import AutoTokenizer, AutoModel

# Load LLaMA-3.1 model and tokenizer
class LLaMaEmbedder:
    def __init__(self, model_name="meta-llama/Llama-3.1-8B", device="cuda" if torch.cuda.is_available() else "cpu"):
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(device)
        self.embedding_size = self.model.config.hidden_size
        self.model_loaded = True

    def get_embedding(self, text):
        """Generate sentence embeddings using LLaMA-3.1-8B"""
        tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device)
        with torch.no_grad():
            outputs = self.model(**tokens)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling over token embeddings
        return embeddings.cpu().numpy()

# Initialize LLaMA embedder
llama_embedder = LLaMaEmbedder()


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.1-8B.
401 Client Error. (Request ID: Root=1-67cc6ab7-53b93e6a272c1f1b7591d5e6;1e9131a8-f0e0-40ec-8578-29d617b7e280)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B is restricted. You must have access to it and be authenticated to access it. Please log in.