In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/email-spam-classification-dataset/combined_data.csv


In [2]:
dataset = pd.read_csv("/kaggle/input/email-spam-classification-dataset/combined_data.csv")

In [3]:
total_text = set()
for index, row in dataset.iterrows():
    total_text.update(row[1].split())
stoi = {char:i for i, char in enumerate(sorted(total_text))}

In [4]:
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
!unzip wikitext-103-raw-v1.zip

--2023-11-14 17:57:00--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.218.128, 52.216.177.189, 52.217.207.40, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.218.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191984949 (183M) [application/zip]
Saving to: ‘wikitext-103-raw-v1.zip’


2023-11-14 17:57:04 (46.3 MB/s) - ‘wikitext-103-raw-v1.zip’ saved [191984949/191984949]

Archive:  wikitext-103-raw-v1.zip
   creating: wikitext-103-raw/
  inflating: wikitext-103-raw/wiki.test.raw  
  inflating: wikitext-103-raw/wiki.valid.raw  
  inflating: wikitext-103-raw/wiki.train.raw  


In [5]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
from tokenizers.trainers import BpeTrainer
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

from tokenizers.pre_tokenizers import Whitespace
tokenizer.pre_tokenizer = Whitespace()

files = [f"/kaggle/working/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
tokenizer.train(files, trainer)






In [6]:
for index, row in dataset.iterrows():
    print(tokenizer.encode(row[1]).ids)
    print(len(tokenizer.encode(row[1]).ids))
    break

[19969, 22090, 15383, 6199, 6876, 7336, 6943, 71, 9412, 5022, 5099, 6508, 5726, 8567, 10058, 7743, 5042, 17392, 5355, 5152, 19731, 77, 6861, 5034, 5149, 5438, 5037, 5561, 5136, 5280, 10437, 5137, 8372, 19005, 5812, 5111, 5120, 7384, 5278, 8740]
40


In [7]:
tokenizer.get_vocab_size()

30000

In [8]:
vocab_size = tokenizer.get_vocab_size()

In [9]:
block_size = 50

In [10]:
tokenizer.enable_truncation(block_size)

In [11]:
tokenizer.enable_padding(direction='left', length=block_size)

In [12]:
for index, row in dataset.iterrows():
    print(tokenizer.encode(row[1]).ids)
    print(len(tokenizer.encode(row[1]).ids))
    break

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19969, 22090, 15383, 6199, 6876, 7336, 6943, 71, 9412, 5022, 5099, 6508, 5726, 8567, 10058, 7743, 5042, 17392, 5355, 5152, 19731, 77, 6861, 5034, 5149, 5438, 5037, 5561, 5136, 5280, 10437, 5137, 8372, 19005, 5812, 5111, 5120, 7384, 5278, 8740]
50


In [13]:
import torch

In [14]:
def build_dataset():
    xs, ys = [], []
    for index, row in dataset.iterrows():
        xs.append(tokenizer.encode(row[1]).ids)
        ys.append(row[0])
    xs = torch.tensor(xs)
    ys = torch.tensor(ys)
    return xs, ys
xs, ys = build_dataset()

In [15]:
xs[0]

tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
        19969, 22090, 15383,  6199,  6876,  7336,  6943,    71,  9412,  5022,
         5099,  6508,  5726,  8567, 10058,  7743,  5042, 17392,  5355,  5152,
        19731,    77,  6861,  5034,  5149,  5438,  5037,  5561,  5136,  5280,
        10437,  5137,  8372, 19005,  5812,  5111,  5120,  7384,  5278,  8740])

In [16]:
ys[0] # i think 1=scam, 0=no scam

tensor(1)

In [17]:
emb_dim = 10

In [18]:
# initialize parameters
C = torch.randn(vocab_size, emb_dim)
w1 = torch.randn(block_size * emb_dim, 100)
b1 = torch.randn(100)
w2 = torch.randn(100, 2)
b2 = torch.randn(2)

params = [C, w1, b1, w2, b2]
for p in params:
    p.requires_grad = True
    

In [19]:
num_examples = len(xs)

In [20]:
# training params
num_epochs = 100000
lr = 0.001
batch_size = 16

In [21]:
import torch.nn.functional as F

In [22]:
for epoch in range(num_epochs):
    selection = torch.randint(0, num_examples-1, (batch_size, ))
    ins = xs[selection]
    outs = ys[selection]
    emb = C[ins].view(-1, emb_dim * block_size)
    l1 = ((emb @ w1) + b1).tanh()
    logits = ((l1 @ w2) + b2).sigmoid()
    for p in params:
        p.grad = None
        
    loss = F.cross_entropy(logits, outs)
    if epoch % 1000 == 0:
        print("Epoch:", epoch, ", loss:", loss.item())
    loss.backward()
    
    for p in params:
        p.data -= p.grad * lr
    

Epoch: 0 , loss: 0.7608835697174072
Epoch: 1000 , loss: 0.7754023671150208
Epoch: 2000 , loss: 0.7598123550415039
Epoch: 3000 , loss: 0.7555802464485168
Epoch: 4000 , loss: 0.7374330759048462
Epoch: 5000 , loss: 0.7787920236587524
Epoch: 6000 , loss: 0.7095420360565186
Epoch: 7000 , loss: 0.7818339467048645
Epoch: 8000 , loss: 0.765285074710846
Epoch: 9000 , loss: 0.8838711380958557
Epoch: 10000 , loss: 0.7236379981040955
Epoch: 11000 , loss: 0.680838406085968
Epoch: 12000 , loss: 0.6897302865982056
Epoch: 13000 , loss: 0.8212460875511169
Epoch: 14000 , loss: 0.7416567206382751
Epoch: 15000 , loss: 0.8155355453491211
Epoch: 16000 , loss: 0.7229146957397461
Epoch: 17000 , loss: 0.7862445116043091
Epoch: 18000 , loss: 0.6995608806610107
Epoch: 19000 , loss: 0.7637975215911865
Epoch: 20000 , loss: 0.6000310182571411
Epoch: 21000 , loss: 0.7045799493789673
Epoch: 22000 , loss: 0.7777068018913269
Epoch: 23000 , loss: 0.7292614579200745
Epoch: 24000 , loss: 0.7775517702102661
Epoch: 25000 , 

In [23]:
with torch.no_grad():
    selection = torch.randint(0, num_examples-1, (1,))
    ins = xs[selection]
    outs = ys[selection]
    emb = C[ins].view(-1, emb_dim * block_size)
    l1 = ((emb @ w1) + b1).tanh()
    logits = ((l1 @ w2) + b2).sigmoid()
    print(tokenizer.decode(ins[0].numpy()))
    print("prediction:", torch.argmax(logits).item())
    print("truth:", outs[0].item())

author pm ich aud date fri jun escap en umber escap en umber escap en umber escap en umber escap en umber new revision escap en umber modified trunk doc s p d ds p d des cap en umber exceptions pod log doc s the phrase ' may not
prediction: 0
truth: 0


In [24]:
def prompt(email):
    with torch.no_grad():
        ins = torch.tensor(tokenizer.encode(email).ids)
        emb = C[ins].view(-1, emb_dim * block_size)
        l1 = ((emb @ w1) + b1).tanh()
        logits = ((l1 @ w2) + b2).sigmoid()
        return torch.argmax(logits)

In [25]:
prompt("the world has gone wireless and mobile air waves mb w c is in the right place in the right time with a red hot product we are looking forward financial results to be announced by the company any moment with all the new contracts they have acquired we are")

tensor(0)