In [1]:
import numpy as np
import pandas as pd


In [3]:
 #Read the Data

data = pd.read_csv('SPAM text message 20170820 - Data.csv')


In [4]:
# Text Preprocessing

import re # regex library
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text) # Effectively removes HTML markup tags
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text


In [5]:
# Train, Test Split

from sklearn.model_selection import train_test_split
X = data['Message']
y = data['Category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [6]:
# Training a Neural Network Pipeline

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, 
                        max_features=700, preprocessor=preprocessor, 
                        ngram_range=(1,1))
neural_net_pipeline = Pipeline([('vectorizer', tfidf), 
                                ('nn', MLPClassifier(hidden_layer_sizes=(700, 700)))])

neural_net_pipeline.fit(X_train, y_train)


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(lowercase=False, max_features=700,
                                 preprocessor=<function preprocessor at 0x7f77781859d0>)),
                ('nn', MLPClassifier(hidden_layer_sizes=(700, 700)))])

In [7]:
# Testing the Pipeline

y_pred = neural_net_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy: {} %'.format(100 * accuracy_score(y_test, y_pred)))

# Saving the Pipeline

from joblib import dump
dump(neural_net_pipeline, 'spam_classifier.joblib')

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1448
        spam       0.96      0.92      0.94       224

    accuracy                           0.99      1672
   macro avg       0.98      0.96      0.97      1672
weighted avg       0.98      0.99      0.98      1672

Accuracy: 98.50478468899522 %


['spam_classifier.joblib']