In [1]:
import pandas as pd

## Loading the Dataset

In [3]:
# Load the raw CSV, keep only the `v1` and `v2` columns, rename them to
# `Label` / `Text`, and encode the string labels as integers (ham -> 0, spam -> 1).
# NOTE(review): the path is absolute and machine-specific — consider a configurable data directory.
df = (
    pd.read_csv(
        "C:\\Users\\jacob\\Documents\\Programming\\Theory\\Python\\AI\\Datasets\\NLP\\sms_spam.csv",
        encoding="latin-1",
    )[["v1", "v2"]]
    .rename(columns={"v1": "Label", "v2": "Text"})
    .assign(Label=lambda frame: frame["Label"].map({"ham": 0, "spam": 1}))
)

df

Unnamed: 0,Label,Text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [4]:
# Class balance check: number of rows per encoded label (0 = ham, 1 = spam).
df.value_counts("Label")

Label
0    4825
1     747
Name: count, dtype: int64

## Converting the DataFrame to a PyTorch-Style Dataset

In [5]:
class Dataset:
    """In-memory dataset of ``(text, label)`` pairs built from a DataFrame.

    Parameters
    ----------
    dataframe
        Frame with a ``"Text"`` and a ``"Label"`` column. Rows are read in
        positional order; the frame's index labels are ignored.
    classes
        Sequence of class names; the position of a name is its class index.
    """

    def __init__(self, dataframe, classes):
        super().__init__()

        # Pair the two columns by position. Looking rows up with
        # `dataframe["Text"][i]` is label-based and therefore breaks (KeyError
        # or wrong row) on any frame whose index is not the default 0..n-1,
        # e.g. after filtering or shuffling.
        texts = dataframe["Text"].to_numpy()
        labels = dataframe["Label"].to_numpy()

        # List of (text, label) tuples, one per row of the frame.
        self.samples = list(zip(texts, labels))
        self.classes = classes
        # Lookup tables between class names and their integer indices.
        self.class_to_idx = {c: i for i, c in enumerate(self.classes)}
        self.idx_to_classes = {i: c for i, c in enumerate(self.classes)}

    def __getitem__(self, index):
        """Return one ``(text, label)`` tuple, or a plain list of them for a slice."""
        return self.samples[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

In [6]:
# Wrap the labelled frame; the order of the class list matches the integer
# encoding applied to the `Label` column (0 = ham, 1 = spam).
ds = Dataset(df, ["ham", "spam"])

# Sanity check: total number of samples and the first (text, label) pair.
print(len(ds))
ds[0]

5572


('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 0)

## Splitting the Dataset into Training and Testing Sets

In [7]:
# Fraction of the samples kept for training; the remainder is held out for testing.
TRAIN_PROP = 0.95

train_size = int(len(ds) * TRAIN_PROP)

# Slicing a `Dataset` returns plain lists of (text, label) tuples. The split is
# purely positional (leading rows train, trailing rows test); nothing is shuffled here.
train_ds = ds[:train_size]
test_ds = ds[train_size:]

# Every sample must land in exactly one of the two splits.
assert len(train_ds) + len(test_ds) == len(ds)

print(len(train_ds), len(test_ds))

5293 279


## Splitting the Training Dataset into Batches

In [8]:
from math import ceil
from random import randint


class Loader:
    """Iterate over a dataset of ``(x, y)`` pairs in mini-batches.

    Each step yields a tuple ``(xs, ys)`` of two equally long lists. Every
    batch holds ``batch_size`` samples except possibly the last one.

    Parameters
    ----------
    ds
        Iterable of ``(x, y)`` pairs.
    batch_size
        Number of samples per batch; must be positive.
    shuffle
        If false, batches follow the dataset order. If true, each batch is a
        contiguous slice taken at a random position from the samples not yet
        served during the current pass; whatever remains at the end forms the
        last (smaller) batch.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """

    def __init__(self, ds, batch_size, shuffle):
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size!r}")

        self.batch_size = batch_size
        self.shuffle = shuffle

        self._dsx = [s[0] for s in ds]  # x-values (inputs) of the dataset
        self._dsy = [s[1] for s in ds]  # y-values (targets) of the dataset

    def __iter__(self):
        if self.shuffle:
            # Work on per-iteration copies. Keeping the shrinking pool on the
            # instance meant that a pass abandoned half-way (e.g. `break`), or
            # two passes running at once, left it partially consumed and made
            # the next pass fail in `randint` with an empty range.
            pool_x = self._dsx.copy()
            pool_y = self._dsy.copy()

            # Iterating over the number of full batches the dataset is going to be split into
            for _ in range(len(pool_x) // self.batch_size):
                # Random index of the first sample of this batch
                start = randint(0, len(pool_x) - self.batch_size)
                stop = start + self.batch_size

                yield (pool_x[start:stop], pool_y[start:stop])

                # Removing the already yielded batch from the pool
                del pool_x[start:stop]
                del pool_y[start:stop]

            # The last batch holds the leftover samples (fewer than `batch_size`)
            if pool_x:
                yield (pool_x, pool_y)

        else:
            for start in range(0, len(self._dsx), self.batch_size):
                stop = start + self.batch_size
                yield (self._dsx[start:stop], self._dsy[start:stop])

    def __len__(self):
        """Return the number of batches produced by one full pass."""
        return ceil(len(self._dsx) / self.batch_size)

## Creating the Cross Validation Function

In [9]:
def cross_validation(ds, valid_prop, batch_size):
    """Split ``ds`` into a random validation window and a training remainder.

    A contiguous window of ``int(len(ds) * valid_prop)`` samples is cut out at
    a random position and used for validation; the samples before and after
    the window are concatenated and used for training.

    Parameters
    ----------
    ds
        Samples to split. Must support ``len()`` and slicing, and its slices
        must support ``+`` (e.g. a list of ``(x, y)`` pairs).
    valid_prop
        Proportion of ``ds`` placed in the validation window, within ``[0, 1]``.
    batch_size
        Batch size passed to both loaders.

    Returns
    -------
    tuple
        ``(valid_loader, train_loader)`` — the validation loader comes first.
        The validation loader is not shuffled; the training loader is.

    Raises
    ------
    ValueError
        If ``valid_prop`` is outside ``[0, 1]``.
    """
    # A negative proportion would otherwise yield a silently wrong split.
    if not 0 <= valid_prop <= 1:
        raise ValueError(f"valid_prop must be within [0, 1], got {valid_prop!r}")

    valid_size = int(len(ds) * valid_prop)

    # Random start of the validation window, chosen so the window fits in `ds`.
    start = randint(0, len(ds) - valid_size)
    stop = start + valid_size

    valid_samples = ds[start:stop]
    train_samples = ds[:start] + ds[stop:]

    return (
        Loader(valid_samples, batch_size=batch_size, shuffle=False),
        Loader(train_samples, batch_size=batch_size, shuffle=True),
    )

## Demonstrating Cross Validation

In [10]:
EPOCH = 5          # number of passes, each with a freshly drawn validation window
VALID_PROP = 0.2   # share of the training samples used for validation
BATCH_SIZE = 32    # samples per batch for both loaders

for epoch in range(EPOCH):
    ### Getting DataLoaders for Training and Evaluation
    # Draw the split from `train_ds` only: passing the full `ds` would let the
    # held-out `test_ds` samples leak into the training/validation loaders.
    valid_dl, train_dl = cross_validation(ds=train_ds, valid_prop=VALID_PROP, batch_size=BATCH_SIZE)

    ### Training and Evaluating the Model
    # No model is trained in this demonstration, so stop after the first split.
    break