# Spam detection

This project is based on the detection of spam messages in an e-mails dataset

In [14]:
import pandas as pd
import numpy as np
import re

spam_data = pd.read_csv('spam.csv')

spam_data['target'] = np.where(spam_data['target']=='spam',1,0)
#spam_data.head(10)

## Feature engineering

In [15]:
# Adding the length of the text
spam_data['length'] = spam_data["text"].apply(len)
# Adding the number of digits in the text
spam_data['len_digit'] = spam_data["text"].apply(lambda x: len(re.sub('\d','', x)))
# Adding the number of non-words in the text
spam_data['len_non_word'] = spam_data["text"].apply(lambda x: len(re.sub('\w','', x)))

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(spam_data['text'], spam_data['target'], random_state=0)

# Repartition of classes data
pd.value_counts(spam_data['target'])

0    4825
1     747
Name: target, dtype: int64

In [17]:
# Fit and transform the training data X_train by using a Tfid Vectorizer and ignoring terms that have a document 
# frequency strictly lower than 5 and using character n-grams from n=2 to n=5.
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(min_df=5, ngram_range=(2,5), analyzer='char_wb').fit(X_train)
Xtrain_vect = vect.transform(X_train)
X_test_vect = vect.transform(X_test)

In [18]:
# Creation and fit of a naive_bayes model
from sklearn.linear_model import LogisticRegression

clf_lr = LogisticRegression(C=100).fit(Xtrain_vect, y_train)

In [19]:
# Evalutation
from sklearn.metrics import roc_auc_score

y_predicted = clf_lr.predict(X_test_vect)
    
print(roc_auc_score(y_test, y_predicted))

0.9720812182741116


In [20]:
# Saving model
import pickle

with open('./spam_detection.pkl','wb') as model:
    pickle.dump(clf_lr, model)