# Submission by

| Name       | Roll No.|
| ---------  |---------|
| Ankan Kar  | MCS202303   |
| Aman       | MCS202305   |
| Utpalraj Kemprai      |  MDS202352   |

## Importing libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


### Downloading the dataset

In [2]:
!kaggle datasets download -d uciml/sms-spam-collection-dataset
!unzip sms-spam-collection-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
License(s): unknown
sms-spam-collection-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  sms-spam-collection-dataset.zip
replace spam.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: spam.csv                


### Preparing the data

In [3]:
# unzip extracts spam.csv into the current working directory, so read it from
# there instead of relying on the Colab-specific /content prefix
data = pd.read_csv('spam.csv', encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Keep only the label and text columns; reassign instead of mutating in place
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
data.columns = ['label', 'sms']
# Binary target: ham -> 0, spam -> 1
data['label'] = data['label'].map({'ham': 0, 'spam': 1})
data.head()

Unnamed: 0,label,sms
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Fraction of missing values in each column
data.isna().mean()

Unnamed: 0,0
label,0.0
sms,0.0


In [6]:
# Number of messages per value of 'label' (0 = ham, 1 = spam)
data.value_counts('label')

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,4825
1,747


In [7]:
def split_message(message):
    """Cut a message into two word-level halves, each terminated by ``<END>``.

    The message is split on whitespace; the first ``len(words) // 2`` words
    form the first half and the remaining words form the second half, so the
    second half gets the extra word when the count is odd.

    Returns:
        A ``(first_half, second_half)`` tuple of strings, each ending in
        `` <END>``. An empty half is rendered as ``' <END>'``.
    """
    tokens = message.split()
    midpoint = len(tokens) // 2
    head, tail = tokens[:midpoint], tokens[midpoint:]
    return f"{' '.join(head)} <END>", f"{' '.join(tail)} <END>"

# Split every SMS and unzip the resulting (first, second) pairs into two columns
data['sms_first_half'], data['sms_second_half'] = zip(
    *(split_message(sms) for sms in data['sms'])
)


In [8]:
data.head()

Unnamed: 0,label,sms,sms_first_half,sms_second_half
0,0,"Go until jurong point, crazy.. Available only ...","Go until jurong point, crazy.. Available only ...",great world la e buffet... Cine there got amor...
1,0,Ok lar... Joking wif u oni...,Ok lar... Joking <END>,wif u oni... <END>
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,May 2005. Text FA to 87121 to receive entry qu...
3,0,U dun say so early hor... U c already then say...,U dun say so early <END>,hor... U c already then say... <END>
4,0,"Nah I don't think he goes to usf, he lives aro...",Nah I don't think he goes <END>,"to usf, he lives around here though <END>"


In [9]:
from collections import defaultdict

# Create vocabulary: every word that has not been seen yet is assigned the
# next free integer id (the current size of the mapping) on first lookup.
vocab = defaultdict(lambda: len(vocab))
vocab['<PAD>']  # bare lookup registers the padding token (index 0)
vocab['<END>']  # bare lookup registers the end token (index 1)

def encode_text(text):
    """Return the list of vocabulary ids for the whitespace-separated words of `text`."""
    return [vocab[word] for word in text.split()]

# Encode first and second halves
data['first_half_encoded'] = data['sms_first_half'].apply(encode_text)
data['second_half_encoded'] = data['sms_second_half'].apply(encode_text)

# Freeze the vocabulary. With default_factory set to None a lookup of an
# unknown word raises KeyError instead of silently minting a new id that
# would fall outside the range covered by vocab_size.
vocab.default_factory = None

# Get the size of the vocabulary
vocab_size = len(vocab)

In [10]:
data.head()

Unnamed: 0,label,sms,sms_first_half,sms_second_half,first_half_encoded,second_half_encoded
0,0,"Go until jurong point, crazy.. Available only ...","Go until jurong point, crazy.. Available only ...",great world la e buffet... Cine there got amor...,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1]","[845, 725, 8481, 999, 8482, 8483, 41, 222, 848..."
1,0,Ok lar... Joking wif u oni...,Ok lar... Joking <END>,wif u oni... <END>,"[12, 13, 14, 1]","[376, 142, 8485, 1]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,May 2005. Text FA to 87121 to receive entry qu...,"[15, 16, 9, 17, 18, 19, 20, 21, 22, 23, 24, 25...","[1087, 8486, 797, 23, 21, 8487, 21, 579, 16, 8..."
3,0,U dun say so early hor... U c already then say...,U dun say so early <END>,hor... U c already then say... <END>,"[28, 29, 30, 31, 32, 1]","[8492, 28, 1821, 380, 177, 8493, 1]"
4,0,"Nah I don't think he goes to usf, he lives aro...",Nah I don't think he goes <END>,"to usf, he lives around here though <END>","[33, 34, 35, 36, 37, 38, 1]","[21, 7500, 37, 8494, 2800, 601, 3411, 1]"


In [11]:
vocab_size

15587

In [12]:
# Distribution of first-half lengths in tokens (the <END> token is counted)
data['first_half_encoded'].apply(len).describe()

Unnamed: 0,first_half_encoded
count,5572.0
mean,8.499282
std,5.676067
min,1.0
25%,4.0
50%,7.0
75%,12.0
max,86.0


In [13]:
# Distribution of second-half lengths in tokens (the <END> token is counted)
data['second_half_encoded'].apply(len).describe()

Unnamed: 0,second_half_encoded
count,5572.0
mean,8.995154
std,5.664383
min,2.0
25%,5.0
50%,7.0
75%,13.0
max,87.0


In [14]:
class SMSSplitDataset(Dataset):
    """Pairs of encoded message halves, each cut or zero-padded to ``max_len``.

    Args:
        first_half: sequence of token-id lists (model inputs).
        second_half: sequence of token-id lists (targets), same length as
            ``first_half``.
        vocab_size: stored on the instance; not used by the dataset itself.
        max_len: fixed number of token ids returned for every half.
    """

    def __init__(self, first_half, second_half, vocab_size, max_len=20):
        self.first_half, self.second_half = first_half, second_half
        self.vocab_size = vocab_size
        self.max_len = max_len

    def __len__(self):
        return len(self.first_half)

    def _fit(self, token_ids):
        """Return a new list of exactly ``max_len`` ids: truncated, then padded with 0."""
        # Slicing copies, so the stored sequence is never modified.
        clipped = token_ids[:self.max_len]
        return clipped + [0] * (self.max_len - len(clipped))

    def __getitem__(self, idx):
        source = self._fit(self.first_half[idx])
        target = self._fit(self.second_half[idx])
        return torch.tensor(source), torch.tensor(target)

# Hold out 20% of the messages for validation (fixed seed for a repeatable split)
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Wrap each partition's encoded halves in a dataset
train_dataset, val_dataset = (
    SMSSplitDataset(
        frame['first_half_encoded'].tolist(),
        frame['second_half_encoded'].tolist(),
        vocab_size,
    )
    for frame in (train_data, val_data)
)


## Defining the Models

In [15]:
class RNNModel(nn.Module):
    """Token embedding -> ``nn.RNN`` -> linear projection applied at every time step.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the recurrent hidden state.
        output_dim: number of scores produced per time step.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: the recurrent layer expects (batch, seq, feature) input
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return per-position scores for a batch of token ids (assumed (batch, seq))."""
        per_step_states, _final_state = self.rnn(self.embedding(x))
        return self.fc(per_step_states)

class LSTMModel(nn.Module):
    """Token embedding -> ``nn.LSTM`` -> linear projection applied at every time step.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of scores produced per time step.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: the recurrent layer expects (batch, seq, feature) input
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return per-position scores for a batch of token ids (assumed (batch, seq))."""
        # The final (hidden, cell) state is not needed; only per-step outputs are projected.
        per_step_states, _final_state = self.lstm(self.embedding(x))
        return self.fc(per_step_states)

In [16]:
def train_model(model, dataloader, loss_fn, optimizer, num_epochs):
    """Train `model` to predict the second half of each message from the first.

    The function is self-contained: the device is taken from the model's own
    parameters and the number of classes from the model output, so it does not
    rely on notebook-level `device` / `vocab_size` variables.

    Args:
        model: module mapping a batch of token ids to scores whose last
            dimension is the number of classes.
        dataloader: iterable yielding `(first_half, second_half)` tensor pairs.
        loss_fn: callable taking `(scores, targets)` with scores flattened to
            2-D and targets flattened to 1-D.
        optimizer: optimizer bound to `model`'s parameters.
        num_epochs: number of passes over `dataloader`.

    Returns:
        List with the mean batch loss of every epoch (NaN for an epoch in
        which the dataloader yielded no batches). The mean is also printed.
    """
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_losses = []
    model.train()
    for epoch in range(num_epochs):
        total_loss, n_batches = 0.0, 0
        for first_half, second_half in dataloader:
            first_half, second_half = first_half.to(run_device), second_half.to(run_device)
            optimizer.zero_grad()
            output = model(first_half)
            # Flatten (batch, seq, classes) -> (batch*seq, classes) for the loss
            loss = loss_fn(output.reshape(-1, output.size(-1)), second_half.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            n_batches += 1
        # Count batches ourselves so an empty loader does not divide by zero
        mean_loss = total_loss / n_batches if n_batches else float('nan')
        print(f"Epoch {epoch+1}, Loss: {mean_loss:.6f}")
        epoch_losses.append(mean_loss)
    return epoch_losses

In [17]:
def evaluate_model(model, dataloader, pad_idx=0):
    """Compute token-level accuracy of `model` over non-padding target positions.

    Positions whose target equals `pad_idx` are excluded from both the
    numerator and the denominator. Counting them would score the model on
    padding, which is inconsistent with a loss built with `ignore_index=0`.

    The device is taken from the model's own parameters rather than from a
    notebook-level `device` variable.

    Args:
        model: module mapping a batch of token ids to (batch, seq, classes) scores.
        dataloader: iterable yielding `(first_half, second_half)` tensor pairs.
        pad_idx: target id treated as padding and skipped (default 0).

    Returns:
        The accuracy as a float in [0, 1]; 0.0 when there is no non-padding
        target at all. The value is also printed.
    """
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for first_half, second_half in dataloader:
            first_half, second_half = first_half.to(run_device), second_half.to(run_device)
            output = model(first_half)
            _, predicted = torch.max(output, dim=2)
            real_tokens = second_half != pad_idx
            correct += ((predicted == second_half) & real_tokens).sum().item()
            total += real_tokens.sum().item()
    accuracy = correct / total if total else 0.0
    print(f"Accuracy: {accuracy:.6f}")
    return accuracy

In [18]:
# Hyperparameters
embedding_dim = 50  # size of each token embedding vector
hidden_dim = 128  # size of the RNN/LSTM hidden state
output_dim = vocab_size  # one output score per vocabulary entry
num_epochs = 100  # passes over the training loader
batch_size = 32  # examples per batch in both loaders
learning_rate = 0.001  # step size given to the Adam optimizers

In [19]:
# Dataloaders
# Training batches are reshuffled every epoch; validation keeps the dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

## Training the models

In [20]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [21]:
# Instantiate models, optimizers, and loss function
# Both models are built with the same sizes and moved to the selected device
rnn_model = RNNModel(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
lstm_model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding token
# One Adam optimizer per model, both with the same learning rate
optimizer_rnn = torch.optim.Adam(rnn_model.parameters(), lr=learning_rate)
optimizer_lstm = torch.optim.Adam(lstm_model.parameters(), lr=learning_rate)

### Number of parameters

In [22]:
def get_n_params(model):
    """Return the total number of scalar elements across all parameters of `model`."""
    return sum(param.nelement() for param in model.parameters())

In [23]:
print("Number of parameters in RNN Model: ",get_n_params(rnn_model))

Number of parameters in RNN Model:  2813113


In [24]:
print("Number of parameters in LSTM Model: ",get_n_params(lstm_model))

Number of parameters in LSTM Model:  2882233


### RNN model

In [25]:
# Train and evaluate RNN model
print("Training RNN Model")
train_model(rnn_model, train_loader, loss_fn, optimizer_rnn, num_epochs)
evaluate_model(rnn_model, val_loader)

Training RNN Model
Epoch 1, Loss: 7.797582
Epoch 2, Loss: 6.876654
Epoch 3, Loss: 6.635037
Epoch 4, Loss: 6.381008
Epoch 5, Loss: 6.080314
Epoch 6, Loss: 5.738302
Epoch 7, Loss: 5.379286
Epoch 8, Loss: 5.031061
Epoch 9, Loss: 4.694624
Epoch 10, Loss: 4.376037
Epoch 11, Loss: 4.078998
Epoch 12, Loss: 3.821360
Epoch 13, Loss: 3.576812
Epoch 14, Loss: 3.363665
Epoch 15, Loss: 3.178461
Epoch 16, Loss: 3.001294
Epoch 17, Loss: 2.853184
Epoch 18, Loss: 2.712728
Epoch 19, Loss: 2.589089
Epoch 20, Loss: 2.470533
Epoch 21, Loss: 2.361346
Epoch 22, Loss: 2.262577
Epoch 23, Loss: 2.168433
Epoch 24, Loss: 2.081670
Epoch 25, Loss: 2.004932
Epoch 26, Loss: 1.925186
Epoch 27, Loss: 1.854577
Epoch 28, Loss: 1.785815
Epoch 29, Loss: 1.725766
Epoch 30, Loss: 1.666075
Epoch 31, Loss: 1.605322
Epoch 32, Loss: 1.559824
Epoch 33, Loss: 1.505289
Epoch 34, Loss: 1.460545
Epoch 35, Loss: 1.417867
Epoch 36, Loss: 1.377883
Epoch 37, Loss: 1.334378
Epoch 38, Loss: 1.297699
Epoch 39, Loss: 1.261156
Epoch 40, Loss:

### LSTM model

In [26]:
# Train and evaluate LSTM model
print("Training LSTM Model")
train_model(lstm_model, train_loader, loss_fn, optimizer_lstm, num_epochs)
evaluate_model(lstm_model, val_loader)

Training LSTM Model
Epoch 1, Loss: 7.966163
Epoch 2, Loss: 6.963295
Epoch 3, Loss: 6.781351
Epoch 4, Loss: 6.637821
Epoch 5, Loss: 6.479371
Epoch 6, Loss: 6.314645
Epoch 7, Loss: 6.116709
Epoch 8, Loss: 5.890348
Epoch 9, Loss: 5.613692
Epoch 10, Loss: 5.311489
Epoch 11, Loss: 5.006666
Epoch 12, Loss: 4.707243
Epoch 13, Loss: 4.427123
Epoch 14, Loss: 4.161506
Epoch 15, Loss: 3.919512
Epoch 16, Loss: 3.703444
Epoch 17, Loss: 3.500510
Epoch 18, Loss: 3.320824
Epoch 19, Loss: 3.153265
Epoch 20, Loss: 3.005683
Epoch 21, Loss: 2.862931
Epoch 22, Loss: 2.735065
Epoch 23, Loss: 2.617637
Epoch 24, Loss: 2.506903
Epoch 25, Loss: 2.408685
Epoch 26, Loss: 2.312530
Epoch 27, Loss: 2.219581
Epoch 28, Loss: 2.142963
Epoch 29, Loss: 2.058718
Epoch 30, Loss: 1.985050
Epoch 31, Loss: 1.913787
Epoch 32, Loss: 1.849306
Epoch 33, Loss: 1.785947
Epoch 34, Loss: 1.720111
Epoch 35, Loss: 1.662363
Epoch 36, Loss: 1.604787
Epoch 37, Loss: 1.551150
Epoch 38, Loss: 1.499459
Epoch 39, Loss: 1.452973
Epoch 40, Loss

## Inspecting the RNN and LSTM predictions given the first half of an SMS

In [27]:
def decode_text(encoded_text, vocabulary=None):
    """Turn a sequence of token ids back into a space-separated string.

    Id 0 (padding) is skipped. The id -> word table is built once per call
    instead of rescanning the whole vocabulary for every token.

    Args:
        encoded_text: iterable of integer token ids.
        vocabulary: word -> id mapping; defaults to the notebook-level `vocab`.

    Returns:
        The decoded words joined by single spaces.

    Raises:
        ValueError: if a non-zero id is not present in the vocabulary.
    """
    if vocabulary is None:
        vocabulary = vocab

    # setdefault keeps the first word seen for an id, matching a first-match scan
    id_to_word = {}
    for word, token_id in vocabulary.items():
        id_to_word.setdefault(token_id, word)

    decoded_words = []
    for token_id in encoded_text:
        if token_id == 0:
            continue
        if token_id not in id_to_word:
            raise ValueError(f"{token_id} is not in vocabulary")
        decoded_words.append(id_to_word[token_id])
    return " ".join(decoded_words)

In [28]:
def predict_and_decode(model, input_tensor, vocabulary=None):
    """Run `model` on a batch and decode the prediction for its first example.

    The highest-scoring id is taken at every position. Id 0 (padding) is
    skipped and decoding stops right after the first ``<END>`` token: later
    positions correspond to padded targets, which a loss with
    ``ignore_index=0`` never trains, so words predicted there are not
    meaningful.

    The device is taken from the model's own parameters rather than from a
    notebook-level `device` variable, and the id -> word table is built once
    per call instead of rescanning the vocabulary for every token.

    Args:
        model: module mapping (batch, seq) token ids to (batch, seq, classes) scores.
        input_tensor: batch of token ids; only the first example is decoded.
        vocabulary: word -> id mapping; defaults to the notebook-level `vocab`.

    Returns:
        The decoded words joined by single spaces.

    Raises:
        ValueError: if a predicted non-zero id is not present in the vocabulary.
    """
    if vocabulary is None:
        vocabulary = vocab

    # setdefault keeps the first word seen for an id, matching a first-match scan
    id_to_word = {}
    for word, token_id in vocabulary.items():
        id_to_word.setdefault(token_id, word)

    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    with torch.no_grad():
        output = model(input_tensor.to(run_device))
        _, predicted = torch.max(output, dim=2)
        predicted_list = predicted[0].cpu().numpy().tolist()  # first example in the batch

    decoded_text = []
    for token_id in predicted_list:
        if token_id == 0:
            continue
        if token_id not in id_to_word:
            raise ValueError(f"{token_id} is not in vocabulary")
        word = id_to_word[token_id]
        decoded_text.append(word)
        if word == "<END>":
            break
    return " ".join(decoded_text)

In [29]:
# Show the predictions of both models for some validation examples
num_examples_to_show = 20
# Never index past the end of the validation set
for i in range(min(num_examples_to_show, len(val_dataset))):
    # val_dataset is the held-out split, not the data the models were trained on
    first_half, _ = val_dataset[i]
    first_half = first_half.unsqueeze(0)  # Add batch dimension
    print(f"\nExample {i + 1}:")

    rnn_decoded = predict_and_decode(rnn_model, first_half)
    lstm_decoded = predict_and_decode(lstm_model, first_half)

    # Decoded from the stored id list, so it is not cut to max_len like the model input
    print(f"Input First Half: {decode_text(val_dataset.first_half[i])}")
    # print(f"Ground Truth Second Half: {decode_text(val_dataset.second_half[i])}")

    print(f"RNN Prediction : {rnn_decoded}")
    print(f"LSTM Prediction: {lstm_decoded}")


Example 1:
Input First Half: Funny fact Nobody teaches volcanoes 2 erupt, tsunamis 2 arise, hurricanes 2 sway <END>
RNN Prediction : know on as you can't are on Stop Ts&Cs stop next then good <END>
LSTM Prediction: to will off which so so U U T&Cs T Txt call call <END>

Example 2:
Input First Half: I sent my scores to sophas and i had to do secondary application for a few schools. I think if you <END>
RNN Prediction : I you touch how in to a from try are in to know ? by a not . and u
LSTM Prediction: I you i So tomo, you reply know call know your crave it. I you i the reach reach i

Example 3:
Input First Half: We know someone who you know that fancies you. <END>
RNN Prediction : to to to stay so or some out to <END>
LSTM Prediction: planned, to dat a to well... who to ok? <END>

Example 4:
Input First Half: Only if you promise your getting out as SOON as you can. And you'll <END>
RNN Prediction : &lt;#&gt; a to tonight? just then must just a you! you its than to ... <END>
LSTM Predic