In [1]:
import numpy as np
import os 
import re
import pickle
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [2]:
def process_content(content,output):
    content = content.lower()    
    content = re.sub('<[^>]*>','',content)
    content = content.replace('&nbsp','')
    content = re.sub('(http|https)://[^\s]*','httpaddr',content)
    content = re.sub('[0-9]+','number',content)
    content = re.sub('[$]+','dollar',content)
    content = re.sub('[^\s]+@[^\s]+','emailaddr',content)
    
    words = word_tokenize(content)
    ps = PorterStemmer()
    all_words = []
    for word in words:
        all_words.append(ps.stem(word))
    
    content = " ".join(all_words)
    with open(output, 'w+') as output_file:
        output_file.write(content)


In [3]:
def preprocess_email(files_dir,mail_type,split_rate):
    f_count = len(os.listdir(spam_dir))
    train_count = int(f_count * split_rate)
    files = [os.path.join(files_dir,f) for f in os.listdir(files_dir)]
    content = []
    for file in files[:train_count]:
        with open(file, errors='ignore') as f:
            for line in f:
                if line in ['\n','\r\n']:
                    break
                
            content = f.read()
            content = content.lower()
            output = 'preprocessed-files/train-mails/' + mail_type + file.split('/')[-1]                
            content = process_content(content,output)
     
    for file in files[train_count:]:
        with open(file, errors='ignore') as f:
            for line in f:
                if line in ['\n','\r\n']:
                    break
                
            content = f.read()
            output = 'preprocessed-files/test-mails/' + mail_type + file.split('/')[-1]
            content = content.lower()
            content = process_content(content,output)
                
        

In [4]:
def extract_features(mail_dir,vocab):
    files = [os.path.join(mail_dir,f) for f in os.listdir(mail_dir)]
    features = np.zeros((len(files),len(vocab)))
    y = []
    fileID = 0
    count = 0
    for f in files:
        with open(f,'r') as fi:
            text = fi.read()
            words = text.split()
            for word in words:
                for wordID,d in enumerate(vocab):
                    if word == d:
                        features[fileID,wordID] = words.count(word)
                        break      
            fi.close()
            fileID = fileID + 1
      
        if f.split("/")[-1].startswith('spam'):
            y.append(1)
        else:
            y.append(0)
                       
    return features,y

In [5]:
from collections import Counter
def make_vocabulary():
    files = []
    mail_dir = 'preprocessed-files/train-mails/'
    for f in os.listdir(mail_dir):
        if f.startswith('spam'):
            files.append(os.path.join(mail_dir,f))
    words = []
    for file in files:
        with open(file, errors='ignore') as f:
            content = f.read()
            words += content.split()
            
    vocab = Counter(words).most_common(5000)
    return vocab

In [6]:
import nltk
def clean_vocabulary(vocabulary):
    dictionary = dict(vocabulary)
    stop_words = set(stopwords.words('english'))
    
    for key in list(dictionary.keys()):
        if len(key)==1:
            del dictionary[key]
        elif not key.isalpha():
            del dictionary[key]
        elif key in stop_words:
            del dictionary[key]
            
    return dictionary

#### Preprocessing mails

In [7]:
spam_dir = 'dataset/spam_2'
ham_dir = 'dataset/easy_ham_2'

train_directory = 'preprocessed-files/train-mails'
if not os.path.exists(train_directory):
    os.makedirs(train_directory)

test_directory = 'preprocessed-files/test-mails'
if not os.path.exists(test_directory):
    os.makedirs(test_directory)

In [8]:
preprocess_email(spam_dir,'spam.',0.8)
preprocess_email(ham_dir,'ham.',0.8)

#### Vocabulary for spam words

In [9]:
spam_vocabulary = make_vocabulary()
print(spam_vocabulary)



#### Cleaning the vocabulary

In [10]:
spam_vocabulary = clean_vocabulary(spam_vocabulary)
print(spam_vocabulary)

{'number': 11042, 'thi': 3864, 'httpaddr': 2066, 'email': 1913, 'free': 1609, 'click': 1389, 'dollarnumb': 1300, 'get': 1243, 'list': 1240, 'busi': 1180, 'receiv': 1154, 'pleas': 1106, 'mail': 1068, 'order': 1028, 'remov': 1022, 'address': 971, 'one': 910, 'money': 906, 'use': 905, 'inform': 888, 'emailaddr': 886, 'anumb': 871, 'onli': 807, 'site': 800, 'make': 800, 'time': 775, 'send': 772, 'us': 772, 'program': 728, 'peopl': 716, 'offer': 705, 'ani': 698, 'new': 697, 'name': 683, 'product': 664, 'work': 663, 'want': 627, 'compani': 623, 'servic': 617, 'wa': 616, 'fnumber': 598, 'internet': 593, 'ha': 584, 'day': 579, 'market': 578, 'call': 575, 'need': 572, 'report': 570, 'grant': 553, 'home': 551, 'credit': 542, 'messag': 539, 'like': 527, 'web': 524, 'year': 521, 'includ': 513, 'quot': 502, 'state': 498, 'may': 486, 'rate': 476, 'help': 464, 'would': 463, 'price': 449, 'month': 448, 'also': 432, 'million': 427, 'form': 423, 'enumb': 422, 'ad': 416, 'dollar': 415, 'phone': 413, 'gov

#### Extracting features

In [11]:
X_train, y_train = extract_features(train_directory,spam_vocabulary)

In [12]:
X_test , y_test = extract_features(test_directory,spam_vocabulary)

#### Fit the model and predict

In [13]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
model = SVC(C = 1, gamma = 0.01, kernel = 'linear')
model.fit(X_train,y_train)
res = model.predict(X_test)
print(confusion_matrix(y_test,res))
print(model.score(X_test,y_test))

[[274   9]
 [  5 275]]
0.9751332149200711
