# Natural Language Processing Assignment: Spam Filter
## Import necessary libs and datasets

In [1]:
import numpy as np
import pandas as pd
import urllib.request

urllib.request.urlretrieve("https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv", filename="spam.csv")
data = pd.read_csv('spam.csv', encoding='latin1')

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [2]:
del data['Unnamed: 2']
del data['Unnamed: 3']
del data['Unnamed: 4']

data['v1'] = data['v1'].replace(['ham','spam'],[0,1])   #Ham 이 0, spam 이 1
data['text'] = data['v2']
data['isSpam'] = data['v1']

del data['v1'], data['v2']

print(f'Data Shape: {data.shape}')
# imbalanced data
print(data['isSpam'].value_counts())
data.head()

Data Shape: (5572, 2)
0    4825
1     747
Name: isSpam, dtype: int64


Unnamed: 0,text,isSpam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


## Preprocessing

In [4]:
import re

def preprocess(string: str, *args, **kwargs) -> str:
    from nltk.stem.porter import PorterStemmer #어간 추출
    from nltk.corpus import stopwords

    string = data.text
    string = string.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress') #이메일 한 번에 처리
    string = string.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress') #웹주소 한 번에 처리
    string = string.str.replace(r'£|\$', 'moneysymb') #이런건 왜 나올까
    string = string.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr') #휴대폰 번호 한 번에 처리
    string = string.str.replace('[^a-zA-Z]', ' ') #영어 말고다 버려
    string= string.str.lower() #소문자
    
    ##불용어 제거
    stop_words = set(stopwords.words('english'))
    string = string.apply(lambda x: ' '.join(word for word 
                                             in x.split() if word not in stop_words))
    ###어간만!              
    ps = PorterStemmer()
    final_processed = string.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))
    
    print(final_processed)
    return final_processed

In [5]:
final_processed = preprocess(data.text)
final_processed[0] #check

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri wkli comp win fa cup final tkt st m...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    nd time tri contact u u moneysymb pound prize ...
5568                                b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: text, Length: 5572, dtype: object


'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

## Tokenizing

In [6]:
def tokenize(string: str, *args, **kwargs) -> list:
    from nltk.tokenize import word_tokenize
    document = [] 
    word_docx = []
    for mails in final_processed:
        words = word_tokenize(mails)
        word_docx.append(words)
        for i in words:
            document.append(i)  
            
    return document, word_docx

In [7]:
document, word_docx = tokenize(final_processed)
#tokenize(final_processed)[:10]
len(word_docx)

5572

<br>

Ex) 
```python
tokenize('hello world!',  *args, **kwargs) = ['hello', 'world']
```

## Build Vocabulary


In [8]:
def build_vocab(n, *args, **kwargs):
    from nltk import FreqDist
    #이중 리스트 벗기지 않아도 되는 document 미리 만들어둠
    vocab = FreqDist(document)
    vocab_size = n-1
    vocab =  vocab.most_common(vocab_size)

    word_to_index = {word[0] : index+2 for index, word in enumerate(vocab)}
    word_to_index['unk_idx'] = 1
    word_to_index['padding_idx'] = 0
    #print(word_to_index)
    
    return vocab, word_to_index

In [9]:
vocab, word_to_index = build_vocab(2000)

### toTensor


In [12]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms


def toTensor(max_len, *args, **kwargs) -> torch.LongTensor:
    encoded =[]
    for s in word_docx: # 문장에 대하여 반복
        temp = []
        for w in s: # 단어에 대하여 반복
            try: # 단어 집합에 존재하는 단어인 경우
                temp.append(word_to_index[w]) # temp에 해당 단어의 인덱스 추가
            except KeyError: # 단어 집합에 존재하지 않는 단어인 경우
                temp.append(word_to_index['unk_idx']) # temp에 unk_idx의 인덱스 추가
        
        encoded.append(temp) # encoded에 정수 인코딩 한 리스트 추가
    
    final = encoded
    from torch.autograd import Variable
    seq_lengths = torch.LongTensor(list(map(len, final))) 

    seq_tensor = Variable(torch.zeros((len(final), seq_lengths.max()))).long()
    for idx, (seq, seqlen) in enumerate(zip(final, seq_lengths)):
        seq_tensor[idx, :seqlen] = torch.LongTensor(seq)

    print(seq_lengths.max()) 
    print(seq_tensor[0])  #확인
    print(seq_lengths[0]) #확인
            
    #tensor = torch.zeros(len(tokens), max_len).long()
       

    return encoded, seq_tensor, seq_lengths

In [13]:
encoded, seq_tensor, seq_lengths = toTensor(77)

tensor(77)
tensor([   4,    1,  298,  603,  516, 1070,   28,   70,  250,  818,   86,    1,
        1071,   18,    1,   76,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0])
tensor(16)


#### 경험삼아.. 만들어본 dataset, dataloader

In [14]:
import torch.utils.data.sampler as splr
from torch.utils.data import Dataset, TensorDataset
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from sklearn.model_selection import train_test_split



class MailDataset(Dataset):
    '''mail dataset'''
    
    def __init__(self, x_tensor, y_tensor, batch_size):
        self.batch_size = batch_size
        self.x = x_tensor
        self.y = y_tensor
        
    
    def __getitem__(self, index):
        return (self.x[index], self.y[index])
    
    def __len__(self):
        return len(self.x)
    
x_tensor = seq_tensor
y_tensor = torch.tensor(data['isSpam'])    

X_train_, X_test_, y_train_, y_test_ = train_test_split(x_tensor, y_tensor, random_state=0,
                                                   stratify=y, test_size=0.1)


dataset = MailDataset(x_tensor, y_tensor, 80)
train_dataset = MailDataset(X_train,y_train, 80)
test_dataset = MailDataset(X_test,y_test, 80)

print(dataset[0]) #확인차
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)



(tensor([   4,    1,  298,  603,  516, 1070,   28,   70,  250,  818,   86,    1,
        1071,   18,    1,   76,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0]), tensor(0))


## train, test split 
#### 사실상 이것이 쓰임

In [15]:

def train_test_split(data, vocab_size, max_len=30):
    from sklearn.model_selection import train_test_split


    final_processed = preprocess(data.text)
    document, word_docx = tokenize(final_processed)
    vocab, word_to_index = build_vocab(vocab_size)
    encoded, seq_tensor, seq_lengths = toTensor(max_len)

    
    X, y =  seq_tensor, data['isSpam']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y, test_size=0.1)
    
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    
    return X_train, X_test, y_train, y_test



In [16]:
X_train, X_test,y_train,y_test=train_test_split(data, len(word_to_index)+1, 30)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri wkli comp win fa cup final tkt st m...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    nd time tri contact u u moneysymb pound prize ...
5568                                b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: text, Length: 5572, dtype: object
tensor(77)
tensor([   4,    1,  298,  603,  516, 1070,   28,   70,  250,  818,   86,    1,
        1071,   18,    1,   76,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    

### KERAS

<br>

***simple RNN***

In [18]:
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
from tensorflow.keras.models import Sequential
import random

###40개의 example 준비하기
idx = random.sample(range(len(y_test)), 40)

X_sample = torch.index_select(X_test, 0, torch.tensor(idx))
y_sample = y_test.reset_index(drop=True)[idx]


#simple RNN model
vocab_size = len(word_to_index)+1
model = Sequential()
model.add(Embedding(vocab_size, 32)) # 임베딩 벡터의 차원은 32
model.add(SimpleRNN(32)) # RNN 셀의 hidden_size는 32
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train.numpy(), (y_train).to_numpy(), epochs=20, batch_size=64, 
                    validation_data=(X_sample.numpy(), (y_sample).to_numpy()))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [20]:
print("\n 테스트 정확도: %.4f" % (model.evaluate(X_test.numpy(), y_test.to_numpy())[1]))


 테스트 정확도: 0.8978


<br>

***bi-LSTM***

In [21]:
#잠을 자기 위해 pytorch를 던져보자 Keras..나와라

from tensorflow.keras.layers import SimpleRNN, Embedding, Dense, LSTM,Bidirectional,Dropout
from tensorflow.keras.models import Sequential

vocab_size = len(word_to_index)+1
model = Sequential()
model.add(Embedding(5000, 32)) # 임베딩 벡터의 차원은 32
model.add(Bidirectional(LSTM(64)))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train.numpy(), (y_train).to_numpy(), epochs=30, batch_size=512,
                    validation_data=(X_sample.numpy(), (y_sample).to_numpy()))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [22]:
print("\n 테스트 정확도: %.4f" % (model.evaluate(X_test.numpy(), (y_test).to_numpy())[1]))


 테스트 정확도: 0.9857
