In [2]:
% matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

import nltk
from nltk.corpus import stopwords
import re #regular expressions
from bs4 import BeautifulSoup
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer




In [4]:
# Load the dataset and split it into features and targets

# Read the training and test CSV files
dataset = pd.read_csv('spam_train.csv')
testset = pd.read_csv('spam_test.csv')

# Features come from the `text` column, targets from the `type` column
X_train, y_train = dataset['text'].values, dataset['type'].values
X_test, y_test = testset['text'].values, testset['type'].values


In [6]:
# Text preprocessing

# Stop-word sets and stemmers keyed by language. Both depend only on the
# language, so they are created once per language instead of once per document.
_STOP_WORDS_BY_LANGUAGE = {}
_STEMMERS_BY_LANGUAGE = {}


def text_preprocessing(text, language, minWordSize):
    """Clean one document and return it as a space-separated string of stems.

    Steps, in order:
      1. strip HTML markup with BeautifulSoup's ``html.parser``;
      2. replace every character that is not an ASCII letter or an apostrophe
         with a space;
      3. lower-case the text;
      4. drop the NLTK stop words of ``language``;
      5. stem the remaining words with the Snowball stemmer for ``language``;
      6. drop stems shorter than ``minWordSize`` characters.

    Parameters
    ----------
    text : object
        The raw document; it is converted with ``str()`` before processing.
    language : str
        Language name passed to ``stopwords.words`` and ``SnowballStemmer``
        (e.g. ``'english'``).
    minWordSize : int
        Minimum length (in characters) a stem must have to be kept.

    Returns
    -------
    str
        A leading space followed by each kept stem and a trailing space,
        e.g. ``' free prize '``; a single space when nothing is kept.
    """
    # remove html
    text_no_html = BeautifulSoup(str(text), "html.parser").get_text()

    # remove non-letters (apostrophes are kept so contractions stay intact)
    text_alpha_chars = re.sub("[^a-zA-Z']", " ", str(text_no_html))

    # convert to lower-case and tokenise on whitespace
    words = text_alpha_chars.lower().split()

    # look up (or create on first use) the stop-word set and stemmer
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    if language not in _STEMMERS_BY_LANGUAGE:
        _STEMMERS_BY_LANGUAGE[language] = SnowballStemmer(language)
    stops = _STOP_WORDS_BY_LANGUAGE[language]
    stemmer = _STEMMERS_BY_LANGUAGE[language]

    # remove stop words, then stem what is left
    stems = [stemmer.stem(w) for w in words if w not in stops]

    # Re-tokenise the stemmed text so that an empty stem is dropped rather
    # than emitted as a token.
    stems = ' '.join(stems).split()

    # remove short words
    kept = [w for w in stems if len(w) >= minWordSize]

    # Leading space, then every kept stem followed by a space.
    return ' ' + ''.join(w + ' ' for w in kept)


In [7]:
# Clean every message of the training and test sets with text_preprocessing
# (the bag-of-words vectors themselves are built in a later step).
language = 'english'
minWordLength = 2

# Build new arrays from the DataFrame columns instead of writing into the
# arrays obtained from `.values`: writing in place would also overwrite the
# raw text held by `dataset` / `testset`, would re-process already cleaned
# text if this cell is run twice, and fails when pandas returns a read-only
# array.
X_train = np.array(
    [text_preprocessing(text, language, minWordLength) for text in dataset.text],
    dtype=object,
)

X_test = np.array(
    [text_preprocessing(text, language, minWordLength) for text in testset.text],
    dtype=object,
)
    


In [8]:
# Make sparse feature vectors

# Bag of words: learn the vocabulary on the training texts and count them in
# one pass, then count the test texts against that same vocabulary.
count_vect = CountVectorizer()
X_train_bag_of_words = count_vect.fit_transform(X_train)
X_test_bag_of_words = count_vect.transform(X_test)

# TF-IDF weighting: the IDF statistics are fitted on the training counts only
# and then applied to both sets. `fit` returns the transformer it was called
# on, so `tfidf_transformer` and `tf_transformer` name the same fitted object.
tfidf_transformer = TfidfTransformer(use_idf=True)
tf_transformer = tfidf_transformer.fit(X_train_bag_of_words)
X_train_tf = tf_transformer.transform(X_train_bag_of_words)
X_test_tf = tf_transformer.transform(X_test_bag_of_words)

In [9]:
# Multinomial naive Bayes on the TF-IDF features

# Train with smoothing parameter alpha=0.1
NBclassifier = MultinomialNB(alpha=0.1)
NBclassifier.fit(X=X_train_tf, y=y_train)

# Predict the test set and report per-class metrics
y_pred = NBclassifier.predict(X=X_test_tf)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Confusion matrix, then overall accuracy as a percentage
cf = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cf)
print(accuracy_score(y_true=y_test, y_pred=y_pred) * 100)


             precision    recall  f1-score   support

        ham       0.99      0.99      0.99       738
       spam       0.95      0.94      0.95       112

avg / total       0.99      0.99      0.99       850

[[733   5]
 [  7 105]]
98.5882352941


In [10]:
# Fit a logistic-regression classifier (C=1) on the TF-IDF training features
lregclassifier = LogisticRegression(C=1)

# Kept as the last bare expression of the cell so the value returned by fit()
# is displayed as the cell output.
lregclassifier.fit(X=X_train_tf, y=y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
# Evaluate the logistic-regression classifier on the test set

# Predict the test set and report per-class metrics
y_pred = lregclassifier.predict(X=X_test_tf)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Confusion matrix, then overall accuracy as a percentage
cf = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cf)
print(accuracy_score(y_true=y_test, y_pred=y_pred) * 100)

             precision    recall  f1-score   support

        ham       0.97      1.00      0.99       738
       spam       0.98      0.82      0.89       112

avg / total       0.97      0.97      0.97       850

[[736   2]
 [ 20  92]]
97.4117647059
